From 500404ebcbd074ca11aa0c3fd9a268aa4054fd8b Mon Sep 17 00:00:00 2001 From: Peter Ujfalusi Date: Tue, 3 Nov 2015 12:28:10 +0200 Subject: dmaengine: of_dma: Correct return code for of_dma_request_slave_channel in case !CONFIG_OF of_dma_request_slave_channel should return either a pointer to a valid dma_chan or an ERR_PTR() error code; NULL is not expected to be returned. Signed-off-by: Peter Ujfalusi Acked-by: Arnd Bergmann Signed-off-by: Vinod Koul --- include/linux/of_dma.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'include/linux') diff --git a/include/linux/of_dma.h b/include/linux/of_dma.h index 36112cdd665a..b90d8ec57c1f 100644 --- a/include/linux/of_dma.h +++ b/include/linux/of_dma.h @@ -80,7 +80,7 @@ static inline int of_dma_router_register(struct device_node *np, static inline struct dma_chan *of_dma_request_slave_channel(struct device_node *np, const char *name) { - return NULL; + return ERR_PTR(-ENODEV); } static inline struct dma_chan *of_dma_simple_xlate(struct of_phandle_args *dma_spec, -- cgit v1.2.3-71-gd317 From aedf17f4515b12ba1cd73298e66baa69cf93010e Mon Sep 17 00:00:00 2001 From: Matias Bjørling Date: Mon, 16 Nov 2015 15:34:36 +0100 Subject: lightnvm: change max_phys_sect to uint MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit The max_phys_sect variable is defined as a char. We do a boundary check to allow at most 256 physical page descriptors per command. As we are not indexing from zero, this expression is always false. Bump max_phys_sect to an unsigned int to support the range check. Signed-off-by: Matias Bjørling Reported-by: Geert Uytterhoeven Signed-off-by: Jens Axboe --- include/linux/lightnvm.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'include/linux') diff --git a/include/linux/lightnvm.h b/include/linux/lightnvm.h index 69c9057e1ab8..32b5369e814e 100644 --- a/include/linux/lightnvm.h +++ b/include/linux/lightnvm.h @@ -220,7 +220,7 @@ struct nvm_dev_ops { nvm_dev_dma_alloc_fn *dev_dma_alloc; nvm_dev_dma_free_fn *dev_dma_free; - uint8_t max_phys_sect; + unsigned int max_phys_sect; }; struct nvm_lun { -- cgit v1.2.3-71-gd317 From 11450469830f2481a9e7cb181609288d40f41323 Mon Sep 17 00:00:00 2001 From: Matias Bjørling Date: Mon, 16 Nov 2015 15:34:37 +0100 Subject: lightnvm: update bad block table format MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit The specification was changed to reflect a multi-value bad block table. Instead of a bit-based bad block table, the bad block table now allows eight bad block categories. Currently four are defined: * Factory bad blocks * Grown bad blocks * Device-side reserved blocks * Host-side reserved blocks The factory and grown bad blocks are the regular bad blocks. The reserved blocks are for either internal or external use. In particular, the device-side reserved blocks allow the host to bootstrap from a limited number of flash blocks, reducing the number of flash blocks to scan upon superblock initialization. Support for both get bad block table and set bad block table is added. 
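To make the multi-value layout concrete, here is a minimal userspace sketch of consuming such a table. The category numbering below is an assumption for illustration only; the gennvm code in this patch relies solely on 0 marking a usable block.

#include <stdio.h>

/* Assumed category numbering -- eight slots exist, four are defined. */
enum bb_type {
	BB_GOOD      = 0, /* usable block (the only value gennvm relies on) */
	BB_FACTORY   = 1, /* factory bad block (assumed value) */
	BB_GROWN     = 2, /* grown bad block (assumed value) */
	BB_DEV_RESV  = 3, /* device-side reserved block (assumed value) */
	BB_HOST_RESV = 4, /* host-side reserved block (assumed value) */
};

/* Count blocks the host must skip, mirroring the blks[i] == 0 test
 * in gennvm_block_bb(). */
static int count_unusable(const unsigned char *blks, int nr_blocks)
{
	int i, unusable = 0;

	for (i = 0; i < nr_blocks; i++)
		if (blks[i] != BB_GOOD)
			unusable++;
	return unusable;
}

int main(void)
{
	unsigned char tbl[8] = { BB_GOOD, BB_FACTORY, BB_GOOD, BB_GROWN,
				 BB_DEV_RESV, BB_GOOD, BB_HOST_RESV, BB_GOOD };

	printf("unusable blocks: %d\n", count_unusable(tbl, 8));
	return 0;
}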
Signed-off-by: Matias Bjørling Signed-off-by: Jens Axboe --- drivers/lightnvm/gennvm.c | 32 ++++++++---- drivers/lightnvm/gennvm.h | 2 + drivers/nvme/host/lightnvm.c | 113 ++++++++++++++++++++++++++++++++++--------- include/linux/lightnvm.h | 6 +-- 4 files changed, 117 insertions(+), 36 deletions(-) (limited to 'include/linux') diff --git a/drivers/lightnvm/gennvm.c b/drivers/lightnvm/gennvm.c index ae1fb2bdc5f4..8cfc0114ff13 100644 --- a/drivers/lightnvm/gennvm.c +++ b/drivers/lightnvm/gennvm.c @@ -64,19 +64,22 @@ static int gennvm_luns_init(struct nvm_dev *dev, struct gen_nvm *gn) return 0; } -static int gennvm_block_bb(u32 lun_id, void *bb_bitmap, unsigned int nr_blocks, +static int gennvm_block_bb(struct ppa_addr ppa, int nr_blocks, u8 *blks, void *private) { struct gen_nvm *gn = private; - struct gen_lun *lun = &gn->luns[lun_id]; + struct nvm_dev *dev = gn->dev; + struct gen_lun *lun; struct nvm_block *blk; int i; - if (unlikely(bitmap_empty(bb_bitmap, nr_blocks))) - return 0; + ppa = addr_to_generic_mode(gn->dev, ppa); + lun = &gn->luns[(dev->nr_luns * ppa.g.ch) + ppa.g.lun]; + + for (i = 0; i < nr_blocks; i++) { + if (blks[i] == 0) + continue; - i = -1; - while ((i = find_next_bit(bb_bitmap, nr_blocks, i + 1)) < nr_blocks) { blk = &lun->vlun.blocks[i]; if (!blk) { pr_err("gennvm: BB data is out of bounds.\n"); @@ -171,8 +174,16 @@ static int gennvm_blocks_init(struct nvm_dev *dev, struct gen_nvm *gn) } if (dev->ops->get_bb_tbl) { - ret = dev->ops->get_bb_tbl(dev->q, lun->vlun.id, - dev->blks_per_lun, gennvm_block_bb, gn); + struct ppa_addr ppa; + + ppa.ppa = 0; + ppa.g.ch = lun->vlun.chnl_id; + ppa.g.lun = lun->vlun.id; + ppa = generic_to_addr_mode(dev, ppa); + + ret = dev->ops->get_bb_tbl(dev->q, ppa, + dev->blks_per_lun, + gennvm_block_bb, gn); if (ret) pr_err("gennvm: could not read BB table\n"); } @@ -199,6 +210,7 @@ static int gennvm_register(struct nvm_dev *dev) if (!gn) return -ENOMEM; + gn->dev = dev; gn->nr_luns = dev->nr_luns; dev->mp = gn; @@ -354,10 +366,10 @@ static void gennvm_mark_blk_bad(struct nvm_dev *dev, struct nvm_rq *rqd) { int i; - if (!dev->ops->set_bb) + if (!dev->ops->set_bb_tbl) return; - if (dev->ops->set_bb(dev->q, rqd, 1)) + if (dev->ops->set_bb_tbl(dev->q, rqd, 1)) return; gennvm_addr_to_generic_mode(dev, rqd); diff --git a/drivers/lightnvm/gennvm.h b/drivers/lightnvm/gennvm.h index d23bd3501ddc..9c24b5b32dac 100644 --- a/drivers/lightnvm/gennvm.h +++ b/drivers/lightnvm/gennvm.h @@ -35,6 +35,8 @@ struct gen_lun { }; struct gen_nvm { + struct nvm_dev *dev; + int nr_luns; struct gen_lun *luns; }; diff --git a/drivers/nvme/host/lightnvm.c b/drivers/nvme/host/lightnvm.c index e0b7b95813bc..2c3546516300 100644 --- a/drivers/nvme/host/lightnvm.c +++ b/drivers/nvme/host/lightnvm.c @@ -93,7 +93,7 @@ struct nvme_nvm_l2ptbl { __le16 cdw14[6]; }; -struct nvme_nvm_bbtbl { +struct nvme_nvm_getbbtbl { __u8 opcode; __u8 flags; __u16 command_id; @@ -101,10 +101,23 @@ struct nvme_nvm_bbtbl { __u64 rsvd[2]; __le64 prp1; __le64 prp2; - __le32 prp1_len; - __le32 prp2_len; - __le32 lbb; - __u32 rsvd11[3]; + __le64 spba; + __u32 rsvd4[4]; +}; + +struct nvme_nvm_setbbtbl { + __u8 opcode; + __u8 flags; + __u16 command_id; + __le32 nsid; + __le64 rsvd[2]; + __le64 prp1; + __le64 prp2; + __le64 spba; + __le16 nlb; + __u8 value; + __u8 rsvd3; + __u32 rsvd4[3]; }; struct nvme_nvm_erase_blk { @@ -129,8 +142,8 @@ struct nvme_nvm_command { struct nvme_nvm_hb_rw hb_rw; struct nvme_nvm_ph_rw ph_rw; struct nvme_nvm_l2ptbl l2p; - struct nvme_nvm_bbtbl get_bb; - struct nvme_nvm_bbtbl 
set_bb; + struct nvme_nvm_getbbtbl get_bb; + struct nvme_nvm_setbbtbl set_bb; struct nvme_nvm_erase_blk erase; }; }; @@ -187,6 +200,20 @@ struct nvme_nvm_id { struct nvme_nvm_id_group groups[4]; } __packed; +struct nvme_nvm_bb_tbl { + __u8 tblid[4]; + __le16 verid; + __le16 revid; + __le32 rvsd1; + __le32 tblks; + __le32 tfact; + __le32 tgrown; + __le32 tdresv; + __le32 thresv; + __le32 rsvd2[8]; + __u8 blk[0]; +}; + /* * Check we didn't inadvertently grow the command struct */ @@ -195,12 +222,14 @@ static inline void _nvme_nvm_check_size(void) BUILD_BUG_ON(sizeof(struct nvme_nvm_identity) != 64); BUILD_BUG_ON(sizeof(struct nvme_nvm_hb_rw) != 64); BUILD_BUG_ON(sizeof(struct nvme_nvm_ph_rw) != 64); - BUILD_BUG_ON(sizeof(struct nvme_nvm_bbtbl) != 64); + BUILD_BUG_ON(sizeof(struct nvme_nvm_getbbtbl) != 64); + BUILD_BUG_ON(sizeof(struct nvme_nvm_setbbtbl) != 64); BUILD_BUG_ON(sizeof(struct nvme_nvm_l2ptbl) != 64); BUILD_BUG_ON(sizeof(struct nvme_nvm_erase_blk) != 64); BUILD_BUG_ON(sizeof(struct nvme_nvm_id_group) != 960); BUILD_BUG_ON(sizeof(struct nvme_nvm_addr_format) != 128); BUILD_BUG_ON(sizeof(struct nvme_nvm_id) != 4096); + BUILD_BUG_ON(sizeof(struct nvme_nvm_bb_tbl) != 512); } static int init_grps(struct nvm_id *nvm_id, struct nvme_nvm_id *nvme_nvm_id) @@ -322,43 +351,80 @@ out: return ret; } -static int nvme_nvm_get_bb_tbl(struct request_queue *q, int lunid, - unsigned int nr_blocks, - nvm_bb_update_fn *update_bbtbl, void *priv) +static int nvme_nvm_get_bb_tbl(struct request_queue *q, struct ppa_addr ppa, + int nr_blocks, nvm_bb_update_fn *update_bbtbl, + void *priv) { struct nvme_ns *ns = q->queuedata; struct nvme_dev *dev = ns->dev; struct nvme_nvm_command c = {}; - void *bb_bitmap; - u16 bb_bitmap_size; + struct nvme_nvm_bb_tbl *bb_tbl; + int tblsz = sizeof(struct nvme_nvm_bb_tbl) + nr_blocks; int ret = 0; c.get_bb.opcode = nvme_nvm_admin_get_bb_tbl; c.get_bb.nsid = cpu_to_le32(ns->ns_id); - c.get_bb.lbb = cpu_to_le32(lunid); - bb_bitmap_size = ((nr_blocks >> 15) + 1) * PAGE_SIZE; - bb_bitmap = kmalloc(bb_bitmap_size, GFP_KERNEL); - if (!bb_bitmap) - return -ENOMEM; + c.get_bb.spba = cpu_to_le64(ppa.ppa); - bitmap_zero(bb_bitmap, nr_blocks); + bb_tbl = kzalloc(tblsz, GFP_KERNEL); + if (!bb_tbl) + return -ENOMEM; - ret = nvme_submit_sync_cmd(q, (struct nvme_command *)&c, bb_bitmap, - bb_bitmap_size); + ret = nvme_submit_sync_cmd(q, (struct nvme_command *)&c, bb_tbl, tblsz); if (ret) { dev_err(dev->dev, "get bad block table failed (%d)\n", ret); ret = -EIO; goto out; } - ret = update_bbtbl(lunid, bb_bitmap, nr_blocks, priv); + if (bb_tbl->tblid[0] != 'B' || bb_tbl->tblid[1] != 'B' || + bb_tbl->tblid[2] != 'L' || bb_tbl->tblid[3] != 'T') { + dev_err(dev->dev, "bbt format mismatch\n"); + ret = -EINVAL; + goto out; + } + + if (le16_to_cpu(bb_tbl->verid) != 1) { + ret = -EINVAL; + dev_err(dev->dev, "bbt version not supported\n"); + goto out; + } + + if (le32_to_cpu(bb_tbl->tblks) != nr_blocks) { + ret = -EINVAL; + dev_err(dev->dev, "bbt unsuspected blocks returned (%u!=%u)", + le32_to_cpu(bb_tbl->tblks), nr_blocks); + goto out; + } + + ret = update_bbtbl(ppa, nr_blocks, bb_tbl->blk, priv); if (ret) { ret = -EINTR; goto out; } out: - kfree(bb_bitmap); + kfree(bb_tbl); + return ret; +} + +static int nvme_nvm_set_bb_tbl(struct request_queue *q, struct nvm_rq *rqd, + int type) +{ + struct nvme_ns *ns = q->queuedata; + struct nvme_dev *dev = ns->dev; + struct nvme_nvm_command c = {}; + int ret = 0; + + c.set_bb.opcode = nvme_nvm_admin_set_bb_tbl; + c.set_bb.nsid = cpu_to_le32(ns->ns_id); + 
c.set_bb.spba = cpu_to_le64(rqd->ppa_addr.ppa); + c.set_bb.nlb = cpu_to_le16(rqd->nr_pages - 1); + c.set_bb.value = type; + + ret = nvme_submit_sync_cmd(q, (struct nvme_command *)&c, NULL, 0); + if (ret) + dev_err(dev->dev, "set bad block table failed (%d)\n", ret); return ret; } @@ -474,6 +540,7 @@ static struct nvm_dev_ops nvme_nvm_dev_ops = { .get_l2p_tbl = nvme_nvm_get_l2p_tbl, .get_bb_tbl = nvme_nvm_get_bb_tbl, + .set_bb_tbl = nvme_nvm_set_bb_tbl, .submit_io = nvme_nvm_submit_io, .erase_block = nvme_nvm_erase_block, diff --git a/include/linux/lightnvm.h b/include/linux/lightnvm.h index 32b5369e814e..9b3dc1bc9296 100644 --- a/include/linux/lightnvm.h +++ b/include/linux/lightnvm.h @@ -191,11 +191,11 @@ static inline void *nvm_rq_to_pdu(struct nvm_rq *rqdata) struct nvm_block; typedef int (nvm_l2p_update_fn)(u64, u32, __le64 *, void *); -typedef int (nvm_bb_update_fn)(u32, void *, unsigned int, void *); +typedef int (nvm_bb_update_fn)(struct ppa_addr, int, u8 *, void *); typedef int (nvm_id_fn)(struct request_queue *, struct nvm_id *); typedef int (nvm_get_l2p_tbl_fn)(struct request_queue *, u64, u32, nvm_l2p_update_fn *, void *); -typedef int (nvm_op_bb_tbl_fn)(struct request_queue *, int, unsigned int, +typedef int (nvm_op_bb_tbl_fn)(struct request_queue *, struct ppa_addr, int, nvm_bb_update_fn *, void *); typedef int (nvm_op_set_bb_fn)(struct request_queue *, struct nvm_rq *, int); typedef int (nvm_submit_io_fn)(struct request_queue *, struct nvm_rq *); @@ -210,7 +210,7 @@ struct nvm_dev_ops { nvm_id_fn *identity; nvm_get_l2p_tbl_fn *get_l2p_tbl; nvm_op_bb_tbl_fn *get_bb_tbl; - nvm_op_set_bb_fn *set_bb; + nvm_op_set_bb_fn *set_bb_tbl; nvm_submit_io_fn *submit_io; nvm_erase_blk_fn *erase_block; -- cgit v1.2.3-71-gd317 From 12be5edf68e785dd5dc8665db5a88152b49c1fe8 Mon Sep 17 00:00:00 2001 From: Matias Bjørling Date: Mon, 16 Nov 2015 15:34:39 +0100 Subject: lightnvm: expose mccap in identify command MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit The mccap field is required for I/O command option support. It defines the following flash access modes: * SLC mode * Erase/Program Suspension * Scramble On/Off * Encryption It is slotted in between mpos and cpar, changing the offset for cpar as well. 
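Since mccap is a plain bit field, decoding it is a matter of testing flags. A hedged userspace sketch follows; the bit positions are assumptions made for the example, not taken from this patch:

#include <stdio.h>

/* Assumed bit assignments for the four modes named above. */
#define MCCAP_SLC      (1u << 0) /* SLC mode */
#define MCCAP_SUSPEND  (1u << 1) /* erase/program suspension */
#define MCCAP_SCRAMBLE (1u << 2) /* scramble on/off */
#define MCCAP_ENCRYPT  (1u << 3) /* encryption */

static void print_mccap(unsigned int mccap)
{
	printf("SLC mode: %s\n", mccap & MCCAP_SLC ? "yes" : "no");
	printf("suspend:  %s\n", mccap & MCCAP_SUSPEND ? "yes" : "no");
	printf("scramble: %s\n", mccap & MCCAP_SCRAMBLE ? "yes" : "no");
	printf("encrypt:  %s\n", mccap & MCCAP_ENCRYPT ? "yes" : "no");
}

int main(void)
{
	print_mccap(MCCAP_SLC | MCCAP_SCRAMBLE);
	return 0;
}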
Signed-off-by: Matias Bjørling Signed-off-by: Jens Axboe --- drivers/nvme/host/lightnvm.c | 4 +++- include/linux/lightnvm.h | 1 + 2 files changed, 4 insertions(+), 1 deletion(-) (limited to 'include/linux') diff --git a/drivers/nvme/host/lightnvm.c b/drivers/nvme/host/lightnvm.c index 60687ed68b5d..52b311cf694c 100644 --- a/drivers/nvme/host/lightnvm.c +++ b/drivers/nvme/host/lightnvm.c @@ -169,8 +169,9 @@ struct nvme_nvm_id_group { __le32 tbet; __le32 tbem; __le32 mpos; + __le32 mccap; __le16 cpar; - __u8 reserved[910]; + __u8 reserved[906]; } __packed; struct nvme_nvm_addr_format { @@ -265,6 +266,7 @@ static int init_grps(struct nvm_id *nvm_id, struct nvme_nvm_id *nvme_nvm_id) dst->tbet = le32_to_cpu(src->tbet); dst->tbem = le32_to_cpu(src->tbem); dst->mpos = le32_to_cpu(src->mpos); + dst->mccap = le32_to_cpu(src->mccap); dst->cpar = le16_to_cpu(src->cpar); } diff --git a/include/linux/lightnvm.h b/include/linux/lightnvm.h index 9b3dc1bc9296..2572856e2a89 100644 --- a/include/linux/lightnvm.h +++ b/include/linux/lightnvm.h @@ -74,6 +74,7 @@ struct nvm_id_group { u32 tbet; u32 tbem; u32 mpos; + u32 mccap; u16 cpar; u8 res[913]; } __packed; -- cgit v1.2.3-71-gd317 From 73387e7bed260c89628fc6a4e3632b45be9776b0 Mon Sep 17 00:00:00 2001 From: Matias Bjørling Date: Mon, 16 Nov 2015 15:34:40 +0100 Subject: lightnvm: remove unused attrs in nvm_id structs MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit The nvm_id, nvm_id_group and nvm_addr_format data structures contain reserved attributes. They are unused by media managers and targets. Remove them. Signed-off-by: Matias Bjørling Signed-off-by: Jens Axboe --- include/linux/lightnvm.h | 7 +------ 1 file changed, 1 insertion(+), 6 deletions(-) (limited to 'include/linux') diff --git a/include/linux/lightnvm.h b/include/linux/lightnvm.h index 2572856e2a89..e6ef8aaf533f 100644 --- a/include/linux/lightnvm.h +++ b/include/linux/lightnvm.h @@ -58,7 +58,6 @@ enum { struct nvm_id_group { u8 mtype; u8 fmtype; - u16 res16; u8 num_ch; u8 num_lun; u8 num_pln; @@ -76,8 +75,7 @@ struct nvm_id_group { u32 mpos; u32 mccap; u16 cpar; - u8 res[913]; -} __packed; +}; struct nvm_addr_format { u8 ch_offset; @@ -92,19 +90,16 @@ struct nvm_addr_format { u8 pg_len; u8 sect_offset; u8 sect_len; - u8 res[4]; }; struct nvm_id { u8 ver_id; u8 vmnt; u8 cgrps; - u8 res[5]; u32 cap; u32 dom; struct nvm_addr_format ppaf; u8 ppat; - u8 resv[224]; struct nvm_id_group groups[4]; } __packed; -- cgit v1.2.3-71-gd317 From 7386af270c72be65c7cb2ba4ad0d4e70dc373106 Mon Sep 17 00:00:00 2001 From: Matias Bjørling Date: Mon, 16 Nov 2015 15:34:44 +0100 Subject: lightnvm: remove linear and device addr modes MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit The linear and device specific address modes can be replaced with a simple offset and bit length conversion that is generic across all devices. This both simplifies the specification and removes the special case for qemu nvme, that previously relied on the linear address mapping. 
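The replacement conversion is plain shift-and-mask arithmetic driven by per-field offset/length pairs. A self-contained userspace sketch of the same idea, with an invented geometry (a real device reports its offsets and lengths in its identify data):

#include <stdio.h>
#include <stdint.h>

/* Invented field layout; a real device reports its own. */
struct ppa_fmt {
	int sec_off, sec_len;
	int pg_off, pg_len;
	int blk_off, blk_len;
	int lun_off, lun_len;
	int ch_off, ch_len;
};

/* Pack generic components into a device address (inputs assumed in range). */
static uint64_t pack(const struct ppa_fmt *f, uint64_t ch, uint64_t lun,
		     uint64_t blk, uint64_t pg, uint64_t sec)
{
	return (ch << f->ch_off) | (lun << f->lun_off) | (blk << f->blk_off) |
	       (pg << f->pg_off) | (sec << f->sec_off);
}

/* Extract one field: shift down, then mask to the field's bit length. */
static uint64_t unpack(uint64_t ppa, int off, int len)
{
	return (ppa >> off) & ((1ULL << len) - 1);
}

int main(void)
{
	struct ppa_fmt f = {
		.sec_off = 0,  .sec_len = 2,
		.pg_off  = 2,  .pg_len  = 8,
		.blk_off = 10, .blk_len = 10,
		.lun_off = 20, .lun_len = 4,
		.ch_off  = 24, .ch_len  = 4,
	};
	uint64_t ppa = pack(&f, 1, 2, 3, 4, 1);

	printf("blk=%llu pg=%llu\n",
	       (unsigned long long)unpack(ppa, f.blk_off, f.blk_len),
	       (unsigned long long)unpack(ppa, f.pg_off, f.pg_len));
	return 0;
}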
Signed-off-by: Matias Bjørling Signed-off-by: Jens Axboe --- drivers/lightnvm/core.c | 3 +- drivers/lightnvm/gennvm.c | 12 ++-- drivers/lightnvm/rrpc.c | 32 ++++++++- drivers/nvme/host/lightnvm.c | 3 +- include/linux/lightnvm.h | 154 ++++++++++--------------------------------- 5 files changed, 73 insertions(+), 131 deletions(-) (limited to 'include/linux') diff --git a/drivers/lightnvm/core.c b/drivers/lightnvm/core.c index 899f6b9a9f68..790b1d7a8d43 100644 --- a/drivers/lightnvm/core.c +++ b/drivers/lightnvm/core.c @@ -174,8 +174,7 @@ static int nvm_core_init(struct nvm_dev *dev) dev->sec_size = grp->csecs; dev->oob_size = grp->sos; dev->sec_per_pg = grp->fpg_sz / grp->csecs; - dev->addr_mode = id->ppat; - dev->addr_format = id->ppaf; + memcpy(&dev->ppaf, &id->ppaf, sizeof(struct nvm_addr_format)); dev->plane_mode = NVM_PLANE_SINGLE; dev->max_rq_size = dev->ops->max_phys_sect * dev->sec_size; diff --git a/drivers/lightnvm/gennvm.c b/drivers/lightnvm/gennvm.c index 8cfc0114ff13..c0d0eb2357a8 100644 --- a/drivers/lightnvm/gennvm.c +++ b/drivers/lightnvm/gennvm.c @@ -73,7 +73,7 @@ static int gennvm_block_bb(struct ppa_addr ppa, int nr_blocks, u8 *blks, struct nvm_block *blk; int i; - ppa = addr_to_generic_mode(gn->dev, ppa); + ppa = dev_to_generic_addr(gn->dev, ppa); lun = &gn->luns[(dev->nr_luns * ppa.g.ch) + ppa.g.lun]; for (i = 0; i < nr_blocks; i++) { @@ -179,7 +179,7 @@ static int gennvm_blocks_init(struct nvm_dev *dev, struct gen_nvm *gn) ppa.ppa = 0; ppa.g.ch = lun->vlun.chnl_id; ppa.g.lun = lun->vlun.id; - ppa = generic_to_addr_mode(dev, ppa); + ppa = generic_to_dev_addr(dev, ppa); ret = dev->ops->get_bb_tbl(dev->q, ppa, dev->blks_per_lun, @@ -304,10 +304,10 @@ static void gennvm_addr_to_generic_mode(struct nvm_dev *dev, struct nvm_rq *rqd) if (rqd->nr_pages > 1) { for (i = 0; i < rqd->nr_pages; i++) - rqd->ppa_list[i] = addr_to_generic_mode(dev, + rqd->ppa_list[i] = dev_to_generic_addr(dev, rqd->ppa_list[i]); } else { - rqd->ppa_addr = addr_to_generic_mode(dev, rqd->ppa_addr); + rqd->ppa_addr = dev_to_generic_addr(dev, rqd->ppa_addr); } } @@ -317,10 +317,10 @@ static void gennvm_generic_to_addr_mode(struct nvm_dev *dev, struct nvm_rq *rqd) if (rqd->nr_pages > 1) { for (i = 0; i < rqd->nr_pages; i++) - rqd->ppa_list[i] = generic_to_addr_mode(dev, + rqd->ppa_list[i] = generic_to_dev_addr(dev, rqd->ppa_list[i]); } else { - rqd->ppa_addr = generic_to_addr_mode(dev, rqd->ppa_addr); + rqd->ppa_addr = generic_to_dev_addr(dev, rqd->ppa_addr); } } diff --git a/drivers/lightnvm/rrpc.c b/drivers/lightnvm/rrpc.c index 7ba64c87ba1c..75e59c3a3f96 100644 --- a/drivers/lightnvm/rrpc.c +++ b/drivers/lightnvm/rrpc.c @@ -123,12 +123,42 @@ static u64 block_to_addr(struct rrpc *rrpc, struct rrpc_block *rblk) return blk->id * rrpc->dev->pgs_per_blk; } +static struct ppa_addr linear_to_generic_addr(struct nvm_dev *dev, + struct ppa_addr r) +{ + struct ppa_addr l; + int secs, pgs, blks, luns; + sector_t ppa = r.ppa; + + l.ppa = 0; + + div_u64_rem(ppa, dev->sec_per_pg, &secs); + l.g.sec = secs; + + sector_div(ppa, dev->sec_per_pg); + div_u64_rem(ppa, dev->sec_per_blk, &pgs); + l.g.pg = pgs; + + sector_div(ppa, dev->pgs_per_blk); + div_u64_rem(ppa, dev->blks_per_lun, &blks); + l.g.blk = blks; + + sector_div(ppa, dev->blks_per_lun); + div_u64_rem(ppa, dev->luns_per_chnl, &luns); + l.g.lun = luns; + + sector_div(ppa, dev->luns_per_chnl); + l.g.ch = ppa; + + return l; +} + static struct ppa_addr rrpc_ppa_to_gaddr(struct nvm_dev *dev, u64 addr) { struct ppa_addr paddr; paddr.ppa = addr; - return 
__linear_to_generic_addr(dev, paddr); + return linear_to_generic_addr(dev, paddr); } /* requires lun->lock taken */ diff --git a/drivers/nvme/host/lightnvm.c b/drivers/nvme/host/lightnvm.c index 52b311cf694c..9069be811f82 100644 --- a/drivers/nvme/host/lightnvm.c +++ b/drivers/nvme/host/lightnvm.c @@ -198,8 +198,7 @@ struct nvme_nvm_id { __le32 cap; __le32 dom; struct nvme_nvm_addr_format ppaf; - __u8 ppat; - __u8 resv[223]; + __u8 resv[224]; struct nvme_nvm_id_group groups[4]; } __packed; diff --git a/include/linux/lightnvm.h b/include/linux/lightnvm.h index e6ef8aaf533f..cbe288acb1de 100644 --- a/include/linux/lightnvm.h +++ b/include/linux/lightnvm.h @@ -99,7 +99,6 @@ struct nvm_id { u32 cap; u32 dom; struct nvm_addr_format ppaf; - u8 ppat; struct nvm_id_group groups[4]; } __packed; @@ -119,39 +118,28 @@ struct nvm_tgt_instance { #define NVM_VERSION_MINOR 0 #define NVM_VERSION_PATCH 0 -#define NVM_SEC_BITS (8) -#define NVM_PL_BITS (6) -#define NVM_PG_BITS (16) #define NVM_BLK_BITS (16) -#define NVM_LUN_BITS (10) +#define NVM_PG_BITS (16) +#define NVM_SEC_BITS (8) +#define NVM_PL_BITS (8) +#define NVM_LUN_BITS (8) #define NVM_CH_BITS (8) struct ppa_addr { + /* Generic structure for all addresses */ union { - /* Channel-based PPA format in nand 4x2x2x2x8x10 */ - struct { - u64 ch : 4; - u64 sec : 2; /* 4 sectors per page */ - u64 pl : 2; /* 4 planes per LUN */ - u64 lun : 2; /* 4 LUNs per channel */ - u64 pg : 8; /* 256 pages per block */ - u64 blk : 10;/* 1024 blocks per plane */ - u64 resved : 36; - } chnl; - - /* Generic structure for all addresses */ struct { + u64 blk : NVM_BLK_BITS; + u64 pg : NVM_PG_BITS; u64 sec : NVM_SEC_BITS; u64 pl : NVM_PL_BITS; - u64 pg : NVM_PG_BITS; - u64 blk : NVM_BLK_BITS; u64 lun : NVM_LUN_BITS; u64 ch : NVM_CH_BITS; } g; u64 ppa; }; -} __packed; +}; struct nvm_rq { struct nvm_tgt_instance *ins; @@ -259,8 +247,7 @@ struct nvm_dev { int blks_per_lun; int sec_size; int oob_size; - int addr_mode; - struct nvm_addr_format addr_format; + struct nvm_addr_format ppaf; /* Calculated/Cached values. These do not reflect the actual usable * blocks at run-time. 
@@ -286,118 +273,45 @@ struct nvm_dev { char name[DISK_NAME_LEN]; }; -/* fallback conversion */ -static struct ppa_addr __generic_to_linear_addr(struct nvm_dev *dev, - struct ppa_addr r) -{ - struct ppa_addr l; - - l.ppa = r.g.sec + - r.g.pg * dev->sec_per_pg + - r.g.blk * (dev->pgs_per_blk * - dev->sec_per_pg) + - r.g.lun * (dev->blks_per_lun * - dev->pgs_per_blk * - dev->sec_per_pg) + - r.g.ch * (dev->blks_per_lun * - dev->pgs_per_blk * - dev->luns_per_chnl * - dev->sec_per_pg); - - return l; -} - -/* fallback conversion */ -static struct ppa_addr __linear_to_generic_addr(struct nvm_dev *dev, - struct ppa_addr r) -{ - struct ppa_addr l; - int secs, pgs, blks, luns; - sector_t ppa = r.ppa; - - l.ppa = 0; - - div_u64_rem(ppa, dev->sec_per_pg, &secs); - l.g.sec = secs; - - sector_div(ppa, dev->sec_per_pg); - div_u64_rem(ppa, dev->sec_per_blk, &pgs); - l.g.pg = pgs; - - sector_div(ppa, dev->pgs_per_blk); - div_u64_rem(ppa, dev->blks_per_lun, &blks); - l.g.blk = blks; - - sector_div(ppa, dev->blks_per_lun); - div_u64_rem(ppa, dev->luns_per_chnl, &luns); - l.g.lun = luns; - - sector_div(ppa, dev->luns_per_chnl); - l.g.ch = ppa; - - return l; -} - -static struct ppa_addr __generic_to_chnl_addr(struct ppa_addr r) +static inline struct ppa_addr generic_to_dev_addr(struct nvm_dev *dev, + struct ppa_addr r) { struct ppa_addr l; - l.ppa = 0; - - l.chnl.sec = r.g.sec; - l.chnl.pl = r.g.pl; - l.chnl.pg = r.g.pg; - l.chnl.blk = r.g.blk; - l.chnl.lun = r.g.lun; - l.chnl.ch = r.g.ch; + l.ppa = ((u64)r.g.blk) << dev->ppaf.blk_offset; + l.ppa |= ((u64)r.g.pg) << dev->ppaf.pg_offset; + l.ppa |= ((u64)r.g.sec) << dev->ppaf.sect_offset; + l.ppa |= ((u64)r.g.pl) << dev->ppaf.pln_offset; + l.ppa |= ((u64)r.g.lun) << dev->ppaf.lun_offset; + l.ppa |= ((u64)r.g.ch) << dev->ppaf.ch_offset; return l; } -static struct ppa_addr __chnl_to_generic_addr(struct ppa_addr r) +static inline struct ppa_addr dev_to_generic_addr(struct nvm_dev *dev, + struct ppa_addr r) { struct ppa_addr l; - l.ppa = 0; - - l.g.sec = r.chnl.sec; - l.g.pl = r.chnl.pl; - l.g.pg = r.chnl.pg; - l.g.blk = r.chnl.blk; - l.g.lun = r.chnl.lun; - l.g.ch = r.chnl.ch; + /* + * (r.ppa << X offset) & X len bitmask. X eq. blk, pg, etc. 
+ */ + l.g.blk = (r.ppa >> dev->ppaf.blk_offset) & + (((1 << dev->ppaf.blk_len) - 1)); + l.g.pg |= (r.ppa >> dev->ppaf.pg_offset) & + (((1 << dev->ppaf.pg_len) - 1)); + l.g.sec |= (r.ppa >> dev->ppaf.sect_offset) & + (((1 << dev->ppaf.sect_len) - 1)); + l.g.pl |= (r.ppa >> dev->ppaf.pln_offset) & + (((1 << dev->ppaf.pln_len) - 1)); + l.g.lun |= (r.ppa >> dev->ppaf.lun_offset) & + (((1 << dev->ppaf.lun_len) - 1)); + l.g.ch |= (r.ppa >> dev->ppaf.ch_offset) & + (((1 << dev->ppaf.ch_len) - 1)); return l; } -static inline struct ppa_addr addr_to_generic_mode(struct nvm_dev *dev, - struct ppa_addr gppa) -{ - switch (dev->addr_mode) { - case NVM_ADDRMODE_LINEAR: - return __linear_to_generic_addr(dev, gppa); - case NVM_ADDRMODE_CHANNEL: - return __chnl_to_generic_addr(gppa); - default: - BUG(); - } - return gppa; -} - -static inline struct ppa_addr generic_to_addr_mode(struct nvm_dev *dev, - struct ppa_addr gppa) -{ - switch (dev->addr_mode) { - case NVM_ADDRMODE_LINEAR: - return __generic_to_linear_addr(dev, gppa); - case NVM_ADDRMODE_CHANNEL: - return __generic_to_chnl_addr(gppa); - default: - BUG(); - } - return gppa; -} - static inline int ppa_empty(struct ppa_addr ppa_addr) { return (ppa_addr.ppa == ADDR_EMPTY); -- cgit v1.2.3-71-gd317 From 451c2b5caf37b526ae34a1081b71115e1de2d063 Mon Sep 17 00:00:00 2001 From: Aya Mahfouz Date: Wed, 18 Nov 2015 08:36:44 +0200 Subject: net: dns_resolver: convert time_t to time64_t Changes the definition of the pointer _expiry from time_t to time64_t. This is to handle the Y2038 problem where time_t will overflow in the year 2038. The change is safe because the kernel subsystems that call dns_query pass NULL. Signed-off-by: Arnd Bergmann Signed-off-by: Aya Mahfouz Signed-off-by: David S. Miller --- include/linux/dns_resolver.h | 2 +- net/dns_resolver/dns_query.c | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) (limited to 'include/linux') diff --git a/include/linux/dns_resolver.h b/include/linux/dns_resolver.h index cc92268af89a..6ac3cad9aef1 100644 --- a/include/linux/dns_resolver.h +++ b/include/linux/dns_resolver.h @@ -27,7 +27,7 @@ #ifdef __KERNEL__ extern int dns_query(const char *type, const char *name, size_t namelen, - const char *options, char **_result, time_t *_expiry); + const char *options, char **_result, time64_t *_expiry); #endif /* KERNEL */ diff --git a/net/dns_resolver/dns_query.c b/net/dns_resolver/dns_query.c index 4677b6fa6dda..ecc28cff08ab 100644 --- a/net/dns_resolver/dns_query.c +++ b/net/dns_resolver/dns_query.c @@ -67,7 +67,7 @@ * Returns the size of the result on success, -ve error code otherwise. */ int dns_query(const char *type, const char *name, size_t namelen, - const char *options, char **_result, time_t *_expiry) + const char *options, char **_result, time64_t *_expiry) { struct key *rkey; const struct user_key_payload *upayload; -- cgit v1.2.3-71-gd317 From db27a7a37aa0b1f8b373f8b0fb72a2ccaafb85b7 Mon Sep 17 00:00:00 2001 From: David Hildenbrand Date: Thu, 5 Nov 2015 09:03:50 +0100 Subject: KVM: Provide function for VCPU lookup by id Let's provide a function to lookup a VCPU by id. 
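The lookup has to be a scan because a VCPU's user-assigned id is not guaranteed to match its index in the vcpus array. A userspace model of the same search, with types simplified for illustration:

#include <stdio.h>
#include <stddef.h>

struct vcpu {
	int vcpu_id;
};

struct vm {
	struct vcpu vcpus[4];
	int nr_vcpus;
};

/* Linear scan over the VCPUs, as kvm_for_each_vcpu() would do. */
static struct vcpu *get_vcpu_by_id(struct vm *vm, int id)
{
	int i;

	for (i = 0; i < vm->nr_vcpus; i++)
		if (vm->vcpus[i].vcpu_id == id)
			return &vm->vcpus[i];
	return NULL; /* callers must handle the not-found case */
}

int main(void)
{
	/* ids intentionally differ from array indexes */
	struct vm vm = { .vcpus = { { 7 }, { 11 }, { 23 } }, .nr_vcpus = 3 };

	printf("id 11 found: %s\n", get_vcpu_by_id(&vm, 11) ? "yes" : "no");
	printf("id 2 found:  %s\n", get_vcpu_by_id(&vm, 2) ? "yes" : "no");
	return 0;
}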
Reviewed-by: Christian Borntraeger Reviewed-by: Dominik Dingel Signed-off-by: David Hildenbrand Signed-off-by: Christian Borntraeger [split patch from refactoring patch] --- include/linux/kvm_host.h | 11 +++++++++++ 1 file changed, 11 insertions(+) (limited to 'include/linux') diff --git a/include/linux/kvm_host.h b/include/linux/kvm_host.h index 5706a2108f0a..c923350ca20a 100644 --- a/include/linux/kvm_host.h +++ b/include/linux/kvm_host.h @@ -460,6 +460,17 @@ static inline struct kvm_vcpu *kvm_get_vcpu(struct kvm *kvm, int i) (vcpup = kvm_get_vcpu(kvm, idx)) != NULL; \ idx++) +static inline struct kvm_vcpu *kvm_get_vcpu_by_id(struct kvm *kvm, int id) +{ + struct kvm_vcpu *vcpu; + int i; + + kvm_for_each_vcpu(i, vcpu, kvm) + if (vcpu->vcpu_id == id) + return vcpu; + return NULL; +} + #define kvm_for_each_memslot(memslot, slots) \ for (memslot = &slots->memslots[0]; \ memslot < slots->memslots + KVM_MEM_SLOTS_NUM && memslot->npages;\ -- cgit v1.2.3-71-gd317 From 851df3dc11136fde86ebd78ee7527cb43c7cd349 Mon Sep 17 00:00:00 2001 From: Arnd Bergmann Date: Mon, 16 Nov 2015 22:34:58 +0100 Subject: scpi: hide get_scpi_ops in module from built-in code The scpi_clock driver can be built-in when CONFIG_COMPILE_TEST is set even when ARM_SCPI_PROTOCOL is a loadable module, and that results in a link error: drivers/built-in.o: In function `scpi_clocks_probe': (.text+0x14453c): undefined reference to `get_scpi_ops' Using #if IS_REACHABLE() around the get_scpi_ops() declaration makes it build successfully in this case for compile-testing, but the effect is the same as when ARM_SCPI_PROTOCOL is disabled, as the code will not be used. Signed-off-by: Arnd Bergmann Acked-by: Punit Agrawal --- include/linux/scpi_protocol.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'include/linux') diff --git a/include/linux/scpi_protocol.h b/include/linux/scpi_protocol.h index 80af3cd35ae4..72ce932c69b2 100644 --- a/include/linux/scpi_protocol.h +++ b/include/linux/scpi_protocol.h @@ -71,7 +71,7 @@ struct scpi_ops { int (*sensor_get_value)(u16, u32 *); }; -#if IS_ENABLED(CONFIG_ARM_SCPI_PROTOCOL) +#if IS_REACHABLE(CONFIG_ARM_SCPI_PROTOCOL) struct scpi_ops *get_scpi_ops(void); #else static inline struct scpi_ops *get_scpi_ops(void) { return NULL; } -- cgit v1.2.3-71-gd317 From 2e6edc95382cc36423aff18a237173ad62d5ab52 Mon Sep 17 00:00:00 2001 From: Dan Williams Date: Thu, 19 Nov 2015 13:29:28 -0800 Subject: block: protect rw_page against device teardown Fix use after free crashes like the following: general protection fault: 0000 [#1] SMP Call Trace: [] ? pmem_do_bvec.isra.12+0xa6/0xf0 [nd_pmem] [] pmem_rw_page+0x42/0x80 [nd_pmem] [] bdev_read_page+0x50/0x60 [] do_mpage_readpage+0x510/0x770 [] ? I_BDEV+0x20/0x20 [] ? lru_cache_add+0x1c/0x50 [] mpage_readpages+0x107/0x170 [] ? I_BDEV+0x20/0x20 [] ? I_BDEV+0x20/0x20 [] blkdev_readpages+0x1d/0x20 [] __do_page_cache_readahead+0x28f/0x310 [] ? __do_page_cache_readahead+0x169/0x310 [] ? 
pagecache_get_page+0x2d/0x1d0 [] filemap_fault+0x396/0x530 [] __do_fault+0x4e/0xf0 [] handle_mm_fault+0x11bd/0x1b50 Cc: Cc: Jens Axboe Cc: Alexander Viro Reported-by: kbuild test robot Acked-by: Matthew Wilcox [willy: symmetry fixups] Signed-off-by: Dan Williams --- block/blk.h | 2 -- fs/block_dev.c | 18 ++++++++++++++++-- include/linux/blkdev.h | 2 ++ 3 files changed, 18 insertions(+), 4 deletions(-) (limited to 'include/linux') diff --git a/block/blk.h b/block/blk.h index da722eb786df..c43926d3d74d 100644 --- a/block/blk.h +++ b/block/blk.h @@ -72,8 +72,6 @@ void blk_dequeue_request(struct request *rq); void __blk_queue_free_tags(struct request_queue *q); bool __blk_end_bidi_request(struct request *rq, int error, unsigned int nr_bytes, unsigned int bidi_bytes); -int blk_queue_enter(struct request_queue *q, gfp_t gfp); -void blk_queue_exit(struct request_queue *q); void blk_freeze_queue(struct request_queue *q); static inline void blk_queue_enter_live(struct request_queue *q) { diff --git a/fs/block_dev.c b/fs/block_dev.c index bb0dfb1c7af1..c25639e907bd 100644 --- a/fs/block_dev.c +++ b/fs/block_dev.c @@ -390,9 +390,17 @@ int bdev_read_page(struct block_device *bdev, sector_t sector, struct page *page) { const struct block_device_operations *ops = bdev->bd_disk->fops; + int result = -EOPNOTSUPP; + if (!ops->rw_page || bdev_get_integrity(bdev)) - return -EOPNOTSUPP; - return ops->rw_page(bdev, sector + get_start_sect(bdev), page, READ); + return result; + + result = blk_queue_enter(bdev->bd_queue, GFP_KERNEL); + if (result) + return result; + result = ops->rw_page(bdev, sector + get_start_sect(bdev), page, READ); + blk_queue_exit(bdev->bd_queue); + return result; } EXPORT_SYMBOL_GPL(bdev_read_page); @@ -421,14 +429,20 @@ int bdev_write_page(struct block_device *bdev, sector_t sector, int result; int rw = (wbc->sync_mode == WB_SYNC_ALL) ? WRITE_SYNC : WRITE; const struct block_device_operations *ops = bdev->bd_disk->fops; + if (!ops->rw_page || bdev_get_integrity(bdev)) return -EOPNOTSUPP; + result = blk_queue_enter(bdev->bd_queue, GFP_KERNEL); + if (result) + return result; + set_page_writeback(page); result = ops->rw_page(bdev, sector + get_start_sect(bdev), page, rw); if (result) end_page_writeback(page); else unlock_page(page); + blk_queue_exit(bdev->bd_queue); return result; } EXPORT_SYMBOL_GPL(bdev_write_page); diff --git a/include/linux/blkdev.h b/include/linux/blkdev.h index 3fe27f8d91f0..c0d2b7927c1f 100644 --- a/include/linux/blkdev.h +++ b/include/linux/blkdev.h @@ -794,6 +794,8 @@ extern int scsi_cmd_ioctl(struct request_queue *, struct gendisk *, fmode_t, extern int sg_scsi_ioctl(struct request_queue *, struct gendisk *, fmode_t, struct scsi_ioctl_command __user *); +extern int blk_queue_enter(struct request_queue *q, gfp_t gfp); +extern void blk_queue_exit(struct request_queue *q); extern void blk_start_queue(struct request_queue *q); extern void blk_stop_queue(struct request_queue *q); extern void blk_sync_queue(struct request_queue *q); -- cgit v1.2.3-71-gd317 From 0b59733b95f9d7af6bee6e6a4d0d444eb694c514 Mon Sep 17 00:00:00 2001 From: Javier Gonzalez Date: Fri, 20 Nov 2015 13:47:56 +0100 Subject: lightnvm: keep track of block counts MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Maintain the number of in-use blocks, free blocks, and bad blocks on a per-lun basis. This allows the upper layers to get information about the state of each lun. Also, account for blocks reserved to the device in the free block count. 
nr_free_blocks now matches the actual number of blocks on the free list when the device is booted. Signed-off-by: Javier Gonzalez Signed-off-by: Matias Bjørling Signed-off-by: Jens Axboe --- drivers/lightnvm/gennvm.c | 14 +++++++++++++- include/linux/lightnvm.h | 2 ++ 2 files changed, 15 insertions(+), 1 deletion(-) (limited to 'include/linux') diff --git a/drivers/lightnvm/gennvm.c b/drivers/lightnvm/gennvm.c index c0d0eb2357a8..43c01e0af887 100644 --- a/drivers/lightnvm/gennvm.c +++ b/drivers/lightnvm/gennvm.c @@ -60,6 +60,8 @@ static int gennvm_luns_init(struct nvm_dev *dev, struct gen_nvm *gn) lun->vlun.lun_id = i % dev->luns_per_chnl; lun->vlun.chnl_id = i / dev->luns_per_chnl; lun->vlun.nr_free_blocks = dev->blks_per_lun; + lun->vlun.nr_inuse_blocks = 0; + lun->vlun.nr_bad_blocks = 0; } return 0; } @@ -87,6 +89,7 @@ static int gennvm_block_bb(struct ppa_addr ppa, int nr_blocks, u8 *blks, } list_move_tail(&blk->list, &lun->bb_list); + lun->vlun.nr_bad_blocks++; } return 0; @@ -139,6 +142,7 @@ static int gennvm_block_map(u64 slba, u32 nlb, __le64 *entries, void *private) list_move_tail(&blk->list, &lun->used_list); blk->type = 1; lun->vlun.nr_free_blocks--; + lun->vlun.nr_inuse_blocks++; } } @@ -167,8 +171,10 @@ static int gennvm_blocks_init(struct nvm_dev *dev, struct gen_nvm *gn) block->id = cur_block_id++; /* First block is reserved for device */ - if (unlikely(lun_iter == 0 && blk_iter == 0)) + if (unlikely(lun_iter == 0 && blk_iter == 0)) { + lun->vlun.nr_free_blocks--; continue; + } list_add_tail(&block->list, &lun->free_list); } @@ -266,6 +272,7 @@ static struct nvm_block *gennvm_get_blk(struct nvm_dev *dev, blk->type = 1; lun->vlun.nr_free_blocks--; + lun->vlun.nr_inuse_blocks++; spin_unlock(&vlun->lock); out: @@ -283,16 +290,21 @@ static void gennvm_put_blk(struct nvm_dev *dev, struct nvm_block *blk) case 1: list_move_tail(&blk->list, &lun->free_list); lun->vlun.nr_free_blocks++; + lun->vlun.nr_inuse_blocks--; blk->type = 0; break; case 2: list_move_tail(&blk->list, &lun->bb_list); + lun->vlun.nr_bad_blocks++; + lun->vlun.nr_inuse_blocks--; break; default: WARN_ON_ONCE(1); pr_err("gennvm: erroneous block type (%lu -> %u)\n", blk->id, blk->type); list_move_tail(&blk->list, &lun->bb_list); + lun->vlun.nr_bad_blocks++; + lun->vlun.nr_inuse_blocks--; } spin_unlock(&vlun->lock); diff --git a/include/linux/lightnvm.h b/include/linux/lightnvm.h index cbe288acb1de..831a20cf070c 100644 --- a/include/linux/lightnvm.h +++ b/include/linux/lightnvm.h @@ -213,7 +213,9 @@ struct nvm_lun { int lun_id; int chnl_id; + unsigned int nr_inuse_blocks; /* Number of used blocks */ unsigned int nr_free_blocks; /* Number of unused blocks */ + unsigned int nr_bad_blocks; /* Number of bad blocks */ struct nvm_block *blocks; spinlock_t lock; -- cgit v1.2.3-71-gd317 From 2fde0e482db2b43bb4ed0e9aebfbe78ebcbbf5a6 Mon Sep 17 00:00:00 2001 From: Javier Gonzalez Date: Fri, 20 Nov 2015 13:47:57 +0100 Subject: lightnvm: add free and bad lun info to show luns MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Add free block, used block, and bad block information to the show debug interface. This information is used to debug how targets track blocks. Also, change the debug function name to make it more generic. 
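With the counters in place, the show output carries one line per lun with free, in-use, and bad counts. Assuming a device named nvme0n1, the output would look roughly like the following (values invented for illustration; the format follows the pr_info() in the diff below):

nvme0n1: lun       0	1019	4	1
nvme0n1: lun       1	1020	3	1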
Signed-off-by: Javier Gonzalez Signed-off-by: Matias Bjørling Signed-off-by: Jens Axboe --- drivers/lightnvm/core.c | 2 +- drivers/lightnvm/gennvm.c | 19 ++++++++++++++----- include/linux/lightnvm.h | 4 ++-- 3 files changed, 17 insertions(+), 8 deletions(-) (limited to 'include/linux') diff --git a/drivers/lightnvm/core.c b/drivers/lightnvm/core.c index f61d325fd978..5178645ac42b 100644 --- a/drivers/lightnvm/core.c +++ b/drivers/lightnvm/core.c @@ -544,7 +544,7 @@ static int nvm_configure_show(const char *val) if (!dev->mt) return 0; - dev->mt->free_blocks_print(dev); + dev->mt->lun_info_print(dev); return 0; } diff --git a/drivers/lightnvm/gennvm.c b/drivers/lightnvm/gennvm.c index 43c01e0af887..e20e74ec6b91 100644 --- a/drivers/lightnvm/gennvm.c +++ b/drivers/lightnvm/gennvm.c @@ -464,15 +464,24 @@ static struct nvm_lun *gennvm_get_lun(struct nvm_dev *dev, int lunid) return &gn->luns[lunid].vlun; } -static void gennvm_free_blocks_print(struct nvm_dev *dev) +static void gennvm_lun_info_print(struct nvm_dev *dev) { struct gen_nvm *gn = dev->mp; struct gen_lun *lun; unsigned int i; - gennvm_for_each_lun(gn, lun, i) - pr_info("%s: lun%8u\t%u\n", - dev->name, i, lun->vlun.nr_free_blocks); + + gennvm_for_each_lun(gn, lun, i) { + spin_lock(&lun->vlun.lock); + + pr_info("%s: lun%8u\t%u\t%u\t%u\n", + dev->name, i, + lun->vlun.nr_free_blocks, + lun->vlun.nr_inuse_blocks, + lun->vlun.nr_bad_blocks); + + spin_unlock(&lun->vlun.lock); + } } static struct nvmm_type gennvm = { @@ -490,7 +499,7 @@ .erase_blk = gennvm_erase_blk, .get_lun = gennvm_get_lun, - .free_blocks_print = gennvm_free_blocks_print, + .lun_info_print = gennvm_lun_info_print, }; static int __init gennvm_module_init(void) diff --git a/include/linux/lightnvm.h b/include/linux/lightnvm.h index 831a20cf070c..3db5552b17d5 100644 --- a/include/linux/lightnvm.h +++ b/include/linux/lightnvm.h @@ -380,7 +380,7 @@ typedef int (nvmm_end_io_fn)(struct nvm_rq *, int); typedef int (nvmm_erase_blk_fn)(struct nvm_dev *, struct nvm_block *, unsigned long); typedef struct nvm_lun *(nvmm_get_lun_fn)(struct nvm_dev *, int); -typedef void (nvmm_free_blocks_print_fn)(struct nvm_dev *); +typedef void (nvmm_lun_info_print_fn)(struct nvm_dev *); struct nvmm_type { const char *name; @@ -404,7 +404,7 @@ struct nvmm_type { nvmm_get_lun_fn *get_lun; /* Statistics */ - nvmm_free_blocks_print_fn *free_blocks_print; + nvmm_lun_info_print_fn *lun_info_print; struct list_head list; }; -- cgit v1.2.3-71-gd317 From b11cfb5807e30333b36c02701382b820b7dcf0d5 Mon Sep 17 00:00:00 2001 From: Tejun Heo Date: Fri, 20 Nov 2015 15:55:52 -0500 Subject: cgroup: record ancestor IDs and reimplement cgroup_is_descendant() using it cgroup_is_descendant() currently walks up the hierarchy and compares each ancestor to the cgroup in question. While enough for cgroup core usages, this can't be used in hot paths to test cgroup membership. This patch adds cgroup->ancestor_ids[], which records the IDs of all ancestors including self, and cgroup->level for the nesting level. This allows testing whether a given cgroup is a descendant of another in three finite steps: testing whether the two belong to the same hierarchy, testing whether the descendant candidate is at the same or a deeper level than the ancestor, and comparing the recorded ancestor_id at the matching level. cgroup_is_descendant() is accordingly reimplemented and made inline. 
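A self-contained userspace model of the three-step test makes the constant-time property easy to see; the struct layout and fixed array size here are illustrative, not the kernel's:

#include <stdio.h>
#include <stdbool.h>

struct cg {
	int root;            /* hierarchy identifier */
	int id;
	int level;           /* root sits at level 0 */
	int ancestor_ids[8]; /* ids at levels 0..level, including self */
};

/* Same hierarchy, not above the ancestor's level, matching recorded id. */
static bool is_descendant(const struct cg *cgrp, const struct cg *ancestor)
{
	if (cgrp->root != ancestor->root || cgrp->level < ancestor->level)
		return false;
	return cgrp->ancestor_ids[ancestor->level] == ancestor->id;
}

int main(void)
{
	struct cg root   = { .root = 1, .id = 1, .level = 0,
			     .ancestor_ids = { 1 } };
	struct cg child  = { .root = 1, .id = 5, .level = 1,
			     .ancestor_ids = { 1, 5 } };
	struct cg gchild = { .root = 1, .id = 9, .level = 2,
			     .ancestor_ids = { 1, 5, 9 } };

	printf("gchild under root:  %d\n", is_descendant(&gchild, &root));
	printf("child under gchild: %d\n", is_descendant(&child, &gchild));
	return 0;
}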
Signed-off-by: Tejun Heo --- include/linux/cgroup-defs.h | 14 ++++++++++++++ include/linux/cgroup.h | 18 +++++++++++++++++- kernel/cgroup.c | 32 ++++++++++---------------------- 3 files changed, 41 insertions(+), 23 deletions(-) (limited to 'include/linux') diff --git a/include/linux/cgroup-defs.h b/include/linux/cgroup-defs.h index 60d44b26276d..504d8591b6d3 100644 --- a/include/linux/cgroup-defs.h +++ b/include/linux/cgroup-defs.h @@ -234,6 +234,14 @@ struct cgroup { */ int id; + /* + * The depth this cgroup is at. The root is at depth zero and each + * step down the hierarchy increments the level. This along with + * ancestor_ids[] can determine whether a given cgroup is a + * descendant of another without traversing the hierarchy. + */ + int level; + /* * Each non-empty css_set associated with this cgroup contributes * one to populated_cnt. All children with non-zero popuplated_cnt @@ -289,6 +297,9 @@ struct cgroup { /* used to schedule release agent */ struct work_struct release_agent_work; + + /* ids of the ancestors at each level including self */ + int ancestor_ids[]; }; /* @@ -308,6 +319,9 @@ struct cgroup_root { /* The root cgroup. Root is destroyed on its release. */ struct cgroup cgrp; + /* for cgrp->ancestor_ids[0] */ + int cgrp_ancestor_id_storage; + /* Number of cgroups in the hierarchy, used only for /proc/cgroups */ atomic_t nr_cgrps; diff --git a/include/linux/cgroup.h b/include/linux/cgroup.h index 22e3754f89c5..b5ee2c4210f9 100644 --- a/include/linux/cgroup.h +++ b/include/linux/cgroup.h @@ -81,7 +81,6 @@ struct cgroup_subsys_state *cgroup_get_e_css(struct cgroup *cgroup, struct cgroup_subsys_state *css_tryget_online_from_dir(struct dentry *dentry, struct cgroup_subsys *ss); -bool cgroup_is_descendant(struct cgroup *cgrp, struct cgroup *ancestor); int cgroup_attach_task_all(struct task_struct *from, struct task_struct *); int cgroup_transfer_tasks(struct cgroup *to, struct cgroup *from); @@ -459,6 +458,23 @@ static inline struct cgroup *task_cgroup(struct task_struct *task, return task_css(task, subsys_id)->cgroup; } +/** + * cgroup_is_descendant - test ancestry + * @cgrp: the cgroup to be tested + * @ancestor: possible ancestor of @cgrp + * + * Test whether @cgrp is a descendant of @ancestor. It also returns %true + * if @cgrp == @ancestor. This function is safe to call as long as @cgrp + * and @ancestor are accessible. + */ +static inline bool cgroup_is_descendant(struct cgroup *cgrp, + struct cgroup *ancestor) +{ + if (cgrp->root != ancestor->root || cgrp->level < ancestor->level) + return false; + return cgrp->ancestor_ids[ancestor->level] == ancestor->id; +} + /* no synchronization, the result can only be used as a hint */ static inline bool cgroup_is_populated(struct cgroup *cgrp) { diff --git a/kernel/cgroup.c b/kernel/cgroup.c index f1603c153890..3190040792c8 100644 --- a/kernel/cgroup.c +++ b/kernel/cgroup.c @@ -459,25 +459,6 @@ struct cgroup_subsys_state *of_css(struct kernfs_open_file *of) } EXPORT_SYMBOL_GPL(of_css); -/** - * cgroup_is_descendant - test ancestry - * @cgrp: the cgroup to be tested - * @ancestor: possible ancestor of @cgrp - * - * Test whether @cgrp is a descendant of @ancestor. It also returns %true - * if @cgrp == @ancestor. This function is safe to call as long as @cgrp - * and @ancestor are accessible. 
- */ -bool cgroup_is_descendant(struct cgroup *cgrp, struct cgroup *ancestor) -{ - while (cgrp) { - if (cgrp == ancestor) - return true; - cgrp = cgroup_parent(cgrp); - } - return false; -} - static int notify_on_release(const struct cgroup *cgrp) { return test_bit(CGRP_NOTIFY_ON_RELEASE, &cgrp->flags); @@ -1903,6 +1884,7 @@ static int cgroup_setup_root(struct cgroup_root *root, unsigned long ss_mask) if (ret < 0) goto out; root_cgrp->id = ret; + root_cgrp->ancestor_ids[0] = ret; ret = percpu_ref_init(&root_cgrp->self.refcnt, css_release, 0, GFP_KERNEL); @@ -4846,11 +4828,11 @@ err_free_css: static int cgroup_mkdir(struct kernfs_node *parent_kn, const char *name, umode_t mode) { - struct cgroup *parent, *cgrp; + struct cgroup *parent, *cgrp, *tcgrp; struct cgroup_root *root; struct cgroup_subsys *ss; struct kernfs_node *kn; - int ssid, ret; + int level, ssid, ret; /* Do not accept '\n' to prevent making /proc//cgroup unparsable. */ @@ -4861,9 +4843,11 @@ static int cgroup_mkdir(struct kernfs_node *parent_kn, const char *name, if (!parent) return -ENODEV; root = parent->root; + level = parent->level + 1; /* allocate the cgroup and its ID, 0 is reserved for the root */ - cgrp = kzalloc(sizeof(*cgrp), GFP_KERNEL); + cgrp = kzalloc(sizeof(*cgrp) + + sizeof(cgrp->ancestor_ids[0]) * (level + 1), GFP_KERNEL); if (!cgrp) { ret = -ENOMEM; goto out_unlock; @@ -4887,6 +4871,10 @@ static int cgroup_mkdir(struct kernfs_node *parent_kn, const char *name, cgrp->self.parent = &parent->self; cgrp->root = root; + cgrp->level = level; + + for (tcgrp = cgrp; tcgrp; tcgrp = cgroup_parent(tcgrp)) + cgrp->ancestor_ids[tcgrp->level] = tcgrp->id; if (notify_on_release(parent)) set_bit(CGRP_NOTIFY_ON_RELEASE, &cgrp->flags); -- cgit v1.2.3-71-gd317 From bd96f76a2454c6b97d70945902e30b4c31510678 Mon Sep 17 00:00:00 2001 From: Tejun Heo Date: Fri, 20 Nov 2015 15:55:52 -0500 Subject: kernfs: implement kernfs_walk_and_get() Implement kernfs_walk_and_get() which is similar to kernfs_find_and_get() but can walk a path instead of just a name. v2: Use strlcpy() instead of strlen() + memcpy() as suggested by David. 
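The walk itself is the classic strsep()-on-a-copy pattern: split the path on '/', skip empty components, and descend one name at a time. A userspace model over a toy tree follows (the node structure is invented for the example):

#define _DEFAULT_SOURCE /* for strsep() on glibc */
#include <stdio.h>
#include <string.h>

struct node {
	const char *name;
	struct node *child; /* first child */
	struct node *next;  /* next sibling */
};

static struct node *find_child(struct node *parent, const char *name)
{
	struct node *n;

	for (n = parent ? parent->child : NULL; n; n = n->next)
		if (!strcmp(n->name, name))
			return n;
	return NULL;
}

/* Mirrors kernfs_walk_ns(): bounded copy, then component-wise descent. */
static struct node *walk(struct node *parent, const char *path)
{
	char buf[256];
	char *p = buf, *name;

	if (strlen(path) >= sizeof(buf))
		return NULL;
	strcpy(buf, path);

	while ((name = strsep(&p, "/")) && parent) {
		if (*name == '\0')
			continue; /* tolerate leading and doubled slashes */
		parent = find_child(parent, name);
	}
	return parent;
}

int main(void)
{
	struct node c = { "c", NULL, NULL };
	struct node b = { "b", &c, NULL };
	struct node a = { "a", &b, NULL };
	struct node root = { "", &a, NULL };

	printf("a/b//c found: %s\n", walk(&root, "a/b//c") ? "yes" : "no");
	return 0;
}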
Signed-off-by: Tejun Heo Acked-by: Greg Kroah-Hartman Cc: David Miller --- fs/kernfs/dir.c | 46 ++++++++++++++++++++++++++++++++++++++++++++++ include/linux/kernfs.h | 12 ++++++++++++ 2 files changed, 58 insertions(+) (limited to 'include/linux') diff --git a/fs/kernfs/dir.c b/fs/kernfs/dir.c index 91e004518237..742bf4a230e8 100644 --- a/fs/kernfs/dir.c +++ b/fs/kernfs/dir.c @@ -694,6 +694,29 @@ static struct kernfs_node *kernfs_find_ns(struct kernfs_node *parent, return NULL; } +static struct kernfs_node *kernfs_walk_ns(struct kernfs_node *parent, + const unsigned char *path, + const void *ns) +{ + static char path_buf[PATH_MAX]; /* protected by kernfs_mutex */ + size_t len = strlcpy(path_buf, path, PATH_MAX); + char *p = path_buf; + char *name; + + lockdep_assert_held(&kernfs_mutex); + + if (len >= PATH_MAX) + return NULL; + + while ((name = strsep(&p, "/")) && parent) { + if (*name == '\0') + continue; + parent = kernfs_find_ns(parent, name, ns); + } + + return parent; +} + /** * kernfs_find_and_get_ns - find and get kernfs_node with the given name * @parent: kernfs_node to search under @@ -718,6 +741,29 @@ struct kernfs_node *kernfs_find_and_get_ns(struct kernfs_node *parent, } EXPORT_SYMBOL_GPL(kernfs_find_and_get_ns); +/** + * kernfs_walk_and_get_ns - find and get kernfs_node with the given path + * @parent: kernfs_node to search under + * @path: path to look for + * @ns: the namespace tag to use + * + * Look for kernfs_node with path @path under @parent and get a reference + * if found. This function may sleep and returns pointer to the found + * kernfs_node on success, %NULL on failure. + */ +struct kernfs_node *kernfs_walk_and_get_ns(struct kernfs_node *parent, + const char *path, const void *ns) +{ + struct kernfs_node *kn; + + mutex_lock(&kernfs_mutex); + kn = kernfs_walk_ns(parent, path, ns); + kernfs_get(kn); + mutex_unlock(&kernfs_mutex); + + return kn; +} + /** * kernfs_create_root - create a new kernfs hierarchy * @scops: optional syscall operations for the hierarchy diff --git a/include/linux/kernfs.h b/include/linux/kernfs.h index 5d4e9c4b821d..af51df35d749 100644 --- a/include/linux/kernfs.h +++ b/include/linux/kernfs.h @@ -274,6 +274,8 @@ void pr_cont_kernfs_path(struct kernfs_node *kn); struct kernfs_node *kernfs_get_parent(struct kernfs_node *kn); struct kernfs_node *kernfs_find_and_get_ns(struct kernfs_node *parent, const char *name, const void *ns); +struct kernfs_node *kernfs_walk_and_get_ns(struct kernfs_node *parent, + const char *path, const void *ns); void kernfs_get(struct kernfs_node *kn); void kernfs_put(struct kernfs_node *kn); @@ -350,6 +352,10 @@ static inline struct kernfs_node * kernfs_find_and_get_ns(struct kernfs_node *parent, const char *name, const void *ns) { return NULL; } +static inline struct kernfs_node * +kernfs_walk_and_get_ns(struct kernfs_node *parent, const char *path, + const void *ns) +{ return NULL; } static inline void kernfs_get(struct kernfs_node *kn) { } static inline void kernfs_put(struct kernfs_node *kn) { } @@ -430,6 +436,12 @@ kernfs_find_and_get(struct kernfs_node *kn, const char *name) return kernfs_find_and_get_ns(kn, name, NULL); } +static inline struct kernfs_node * +kernfs_walk_and_get(struct kernfs_node *kn, const char *path) +{ + return kernfs_walk_and_get_ns(kn, path, NULL); +} + static inline struct kernfs_node * kernfs_create_dir(struct kernfs_node *parent, const char *name, umode_t mode, void *priv) -- cgit v1.2.3-71-gd317 From 16af439645455fbf36984ca5e72f31073ee19ab7 Mon Sep 17 00:00:00 2001 From: Tejun Heo Date: 
Fri, 20 Nov 2015 15:55:52 -0500 Subject: cgroup: implement cgroup_get_from_path() and expose cgroup_put() Implement cgroup_get_from_path() using kernfs_walk_and_get() which obtains a default hierarchy cgroup from its path. This will be used to allow cgroup path based matching from outside cgroup proper - e.g. networking and perf. v2: Add EXPORT_SYMBOL_GPL(cgroup_get_from_path). Signed-off-by: Tejun Heo --- include/linux/cgroup.h | 7 +++++++ kernel/cgroup.c | 39 ++++++++++++++++++++++++++++++++++----- 2 files changed, 41 insertions(+), 5 deletions(-) (limited to 'include/linux') diff --git a/include/linux/cgroup.h b/include/linux/cgroup.h index b5ee2c4210f9..4c3ffab81ba7 100644 --- a/include/linux/cgroup.h +++ b/include/linux/cgroup.h @@ -81,6 +81,8 @@ struct cgroup_subsys_state *cgroup_get_e_css(struct cgroup *cgroup, struct cgroup_subsys_state *css_tryget_online_from_dir(struct dentry *dentry, struct cgroup_subsys *ss); +struct cgroup *cgroup_get_from_path(const char *path); + int cgroup_attach_task_all(struct task_struct *from, struct task_struct *); int cgroup_transfer_tasks(struct cgroup *to, struct cgroup *from); @@ -351,6 +353,11 @@ static inline void css_put_many(struct cgroup_subsys_state *css, unsigned int n) percpu_ref_put_many(&css->refcnt, n); } +static inline void cgroup_put(struct cgroup *cgrp) +{ + css_put(&cgrp->self); +} + /** * task_css_set_check - obtain a task's css_set with extra access conditions * @task: the task to obtain css_set for diff --git a/kernel/cgroup.c b/kernel/cgroup.c index 3190040792c8..3db5e8f5b702 100644 --- a/kernel/cgroup.c +++ b/kernel/cgroup.c @@ -434,11 +434,6 @@ static bool cgroup_tryget(struct cgroup *cgrp) return css_tryget(&cgrp->self); } -static void cgroup_put(struct cgroup *cgrp) -{ - css_put(&cgrp->self); -} - struct cgroup_subsys_state *of_css(struct kernfs_open_file *of) { struct cgroup *cgrp = of->kn->parent->priv; @@ -5753,6 +5748,40 @@ struct cgroup_subsys_state *css_from_id(int id, struct cgroup_subsys *ss) return id > 0 ? idr_find(&ss->css_idr, id) : NULL; } +/** + * cgroup_get_from_path - lookup and get a cgroup from its default hierarchy path + * @path: path on the default hierarchy + * + * Find the cgroup at @path on the default hierarchy, increment its + * reference count and return it. Returns pointer to the found cgroup on + * success, ERR_PTR(-ENOENT) if @path doens't exist and ERR_PTR(-ENOTDIR) + * if @path points to a non-directory. + */ +struct cgroup *cgroup_get_from_path(const char *path) +{ + struct kernfs_node *kn; + struct cgroup *cgrp; + + mutex_lock(&cgroup_mutex); + + kn = kernfs_walk_and_get(cgrp_dfl_root.cgrp.kn, path); + if (kn) { + if (kernfs_type(kn) == KERNFS_DIR) { + cgrp = kn->priv; + cgroup_get(cgrp); + } else { + cgrp = ERR_PTR(-ENOTDIR); + } + kernfs_put(kn); + } else { + cgrp = ERR_PTR(-ENOENT); + } + + mutex_unlock(&cgroup_mutex); + return cgrp; +} +EXPORT_SYMBOL_GPL(cgroup_get_from_path); + #ifdef CONFIG_CGROUP_DEBUG static struct cgroup_subsys_state * debug_css_alloc(struct cgroup_subsys_state *parent_css) -- cgit v1.2.3-71-gd317 From 94a58c360a45c066ab5472cfd2bf2a4ba63aa532 Mon Sep 17 00:00:00 2001 From: Rasmus Villemoes Date: Fri, 20 Nov 2015 15:56:48 -0800 Subject: slab.h: sprinkle __assume_aligned attributes The various allocators return aligned memory. Telling the compiler that allows it to generate better code in many cases, for example when the return value is immediately passed to memset(). 
Some code does become larger, but at least we win twice as much as we lose: $ scripts/bloat-o-meter /tmp/vmlinux vmlinux add/remove: 0/0 grow/shrink: 13/52 up/down: 995/-2140 (-1145) An example of the different (and smaller) code can be seen in mm_alloc(). Before: : 48 8d 78 08 lea 0x8(%rax),%rdi : 48 89 c1 mov %rax,%rcx : 48 89 c2 mov %rax,%rdx : 48 c7 00 00 00 00 00 movq $0x0,(%rax) : 48 c7 80 48 03 00 00 movq $0x0,0x348(%rax) : 00 00 00 00 : 31 c0 xor %eax,%eax : 48 83 e7 f8 and $0xfffffffffffffff8,%rdi : 48 29 f9 sub %rdi,%rcx : 81 c1 50 03 00 00 add $0x350,%ecx : c1 e9 03 shr $0x3,%ecx : f3 48 ab rep stos %rax,%es:(%rdi) After: : 48 89 c2 mov %rax,%rdx : b9 6a 00 00 00 mov $0x6a,%ecx : 31 c0 xor %eax,%eax : 48 89 d7 mov %rdx,%rdi : f3 48 ab rep stos %rax,%es:(%rdi) So gcc's strategy is to do two possibly (but not really, of course) unaligned stores to the first and last word, then do an aligned rep stos covering the middle part with a little overlap. Maybe arches which do not allow unaligned stores gain even more. I don't know if gcc can actually make use of alignments greater than 8 for anything, so one could probably drop the __assume_xyz_alignment macros and just use __assume_aligned(8). The increases in code size are mostly caused by gcc deciding to opencode strlen() using the check-four-bytes-at-a-time trick when it knows the buffer is sufficiently aligned (one function grew by 200 bytes). Now it turns out that many of these strlen() calls showing up were in fact redundant, and they're gone from -next. Applying the two patches to next-20151001 bloat-o-meter instead says add/remove: 0/0 grow/shrink: 6/52 up/down: 244/-2140 (-1896) Signed-off-by: Rasmus Villemoes Acked-by: Christoph Lameter Cc: David Rientjes Cc: Pekka Enberg Cc: Joonsoo Kim Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/linux/slab.h | 43 ++++++++++++++++++++++++++----------------- 1 file changed, 26 insertions(+), 17 deletions(-) (limited to 'include/linux') diff --git a/include/linux/slab.h b/include/linux/slab.h index 7c82e3b307a3..96940772bb92 100644 --- a/include/linux/slab.h +++ b/include/linux/slab.h @@ -157,6 +157,24 @@ size_t ksize(const void *); #define ARCH_KMALLOC_MINALIGN __alignof__(unsigned long long) #endif +/* + * Setting ARCH_SLAB_MINALIGN in arch headers allows a different alignment. + * Intended for arches that get misalignment faults even for 64 bit integer + * aligned buffers. + */ +#ifndef ARCH_SLAB_MINALIGN +#define ARCH_SLAB_MINALIGN __alignof__(unsigned long long) +#endif + +/* + * kmalloc and friends return ARCH_KMALLOC_MINALIGN aligned + * pointers. kmem_cache_alloc and friends return ARCH_SLAB_MINALIGN + * aligned pointers. 
+ */ +#define __assume_kmalloc_alignment __assume_aligned(ARCH_KMALLOC_MINALIGN) +#define __assume_slab_alignment __assume_aligned(ARCH_SLAB_MINALIGN) +#define __assume_page_alignment __assume_aligned(PAGE_SIZE) + /* * Kmalloc array related definitions */ @@ -286,8 +304,8 @@ static __always_inline int kmalloc_index(size_t size) } #endif /* !CONFIG_SLOB */ -void *__kmalloc(size_t size, gfp_t flags); -void *kmem_cache_alloc(struct kmem_cache *, gfp_t flags); +void *__kmalloc(size_t size, gfp_t flags) __assume_kmalloc_alignment; +void *kmem_cache_alloc(struct kmem_cache *, gfp_t flags) __assume_slab_alignment; void kmem_cache_free(struct kmem_cache *, void *); /* @@ -301,8 +319,8 @@ void kmem_cache_free_bulk(struct kmem_cache *, size_t, void **); bool kmem_cache_alloc_bulk(struct kmem_cache *, gfp_t, size_t, void **); #ifdef CONFIG_NUMA -void *__kmalloc_node(size_t size, gfp_t flags, int node); -void *kmem_cache_alloc_node(struct kmem_cache *, gfp_t flags, int node); +void *__kmalloc_node(size_t size, gfp_t flags, int node) __assume_kmalloc_alignment; +void *kmem_cache_alloc_node(struct kmem_cache *, gfp_t flags, int node) __assume_slab_alignment; #else static __always_inline void *__kmalloc_node(size_t size, gfp_t flags, int node) { @@ -316,12 +334,12 @@ static __always_inline void *kmem_cache_alloc_node(struct kmem_cache *s, gfp_t f #endif #ifdef CONFIG_TRACING -extern void *kmem_cache_alloc_trace(struct kmem_cache *, gfp_t, size_t); +extern void *kmem_cache_alloc_trace(struct kmem_cache *, gfp_t, size_t) __assume_slab_alignment; #ifdef CONFIG_NUMA extern void *kmem_cache_alloc_node_trace(struct kmem_cache *s, gfp_t gfpflags, - int node, size_t size); + int node, size_t size) __assume_slab_alignment; #else static __always_inline void * kmem_cache_alloc_node_trace(struct kmem_cache *s, @@ -354,10 +372,10 @@ kmem_cache_alloc_node_trace(struct kmem_cache *s, } #endif /* CONFIG_TRACING */ -extern void *kmalloc_order(size_t size, gfp_t flags, unsigned int order); +extern void *kmalloc_order(size_t size, gfp_t flags, unsigned int order) __assume_page_alignment; #ifdef CONFIG_TRACING -extern void *kmalloc_order_trace(size_t size, gfp_t flags, unsigned int order); +extern void *kmalloc_order_trace(size_t size, gfp_t flags, unsigned int order) __assume_page_alignment; #else static __always_inline void * kmalloc_order_trace(size_t size, gfp_t flags, unsigned int order) @@ -482,15 +500,6 @@ static __always_inline void *kmalloc_node(size_t size, gfp_t flags, int node) return __kmalloc_node(size, flags, node); } -/* - * Setting ARCH_SLAB_MINALIGN in arch headers allows a different alignment. - * Intended for arches that get misalignment faults even for 64 bit integer - * aligned buffers. - */ -#ifndef ARCH_SLAB_MINALIGN -#define ARCH_SLAB_MINALIGN __alignof__(unsigned long long) -#endif - struct memcg_cache_array { struct rcu_head rcu; struct kmem_cache *entries[0]; -- cgit v1.2.3-71-gd317 From 5cf6a51e6062afe7cc507f32f1e5f7e6497ae844 Mon Sep 17 00:00:00 2001 From: Daniel Baluta Date: Fri, 20 Nov 2015 15:56:53 -0800 Subject: configfs: allow dynamic group creation This patchset introduces IIO software triggers, offers a way of configuring them via configfs and adds the IIO hrtimer based interrupt source to be used with software triggers. The architecture is now split in 3 parts, to remove all IIO trigger specific parts from IIO configfs core: (1) IIO configfs - creates the root of the IIO configfs subsys. 
(2) IIO software triggers - software trigger implementation, dynamically creating /config/iio/triggers group. (3) IIO hrtimer trigger - the first interrupt source for software triggers (with sysfs to follow). Each trigger type can implement its own set of attributes. Lockdep seems to be happy with the locking in the configfs patch. This patch (of 5): We don't want to hardcode default groups at subsystem creation time. We export: * configfs_register_group * configfs_unregister_group to allow drivers to programmatically create/destroy groups later, after module init time. This is needed for IIO configfs support. (akpm: the other 4 patches to be merged via the IIO tree) Signed-off-by: Daniel Baluta Suggested-by: Lars-Peter Clausen Reviewed-by: Christoph Hellwig Acked-by: Joel Becker Cc: Hartmut Knaack Cc: Octavian Purdila Cc: Paul Bolle Cc: Adriana Reus Cc: Cristina Opriceana Cc: Peter Meerwald Cc: Alexander Viro Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- fs/configfs/dir.c | 110 +++++++++++++++++++++++++++++++++++++++++++++++ include/linux/configfs.h | 10 +++++ 2 files changed, 120 insertions(+) (limited to 'include/linux') diff --git a/fs/configfs/dir.c b/fs/configfs/dir.c index c81ce7f200a6..a7a1b218f308 100644 --- a/fs/configfs/dir.c +++ b/fs/configfs/dir.c @@ -1636,6 +1636,116 @@ const struct file_operations configfs_dir_operations = { .iterate = configfs_readdir, }; +/** + * configfs_register_group - creates a parent-child relation between two groups + * @parent_group: parent group + * @group: child group + * + * link groups, creates dentry for the child and attaches it to the + * parent dentry. + * + * Return: 0 on success, negative errno code on error + */ +int configfs_register_group(struct config_group *parent_group, + struct config_group *group) +{ + struct configfs_subsystem *subsys = parent_group->cg_subsys; + struct dentry *parent; + int ret; + + mutex_lock(&subsys->su_mutex); + link_group(parent_group, group); + mutex_unlock(&subsys->su_mutex); + + parent = parent_group->cg_item.ci_dentry; + + mutex_lock_nested(&d_inode(parent)->i_mutex, I_MUTEX_PARENT); + ret = create_default_group(parent_group, group); + if (!ret) { + spin_lock(&configfs_dirent_lock); + configfs_dir_set_ready(group->cg_item.ci_dentry->d_fsdata); + spin_unlock(&configfs_dirent_lock); + } + mutex_unlock(&d_inode(parent)->i_mutex); + return ret; +} +EXPORT_SYMBOL(configfs_register_group); + +/** + * configfs_unregister_group() - unregisters a child group from its parent + * @group: parent group to be unregistered + * + * Undoes configfs_register_group() + */ +void configfs_unregister_group(struct config_group *group) +{ + struct configfs_subsystem *subsys = group->cg_subsys; + struct dentry *dentry = group->cg_item.ci_dentry; + struct dentry *parent = group->cg_item.ci_parent->ci_dentry; + + mutex_lock_nested(&d_inode(parent)->i_mutex, I_MUTEX_PARENT); + spin_lock(&configfs_dirent_lock); + configfs_detach_prep(dentry, NULL); + spin_unlock(&configfs_dirent_lock); + + configfs_detach_group(&group->cg_item); + d_inode(dentry)->i_flags |= S_DEAD; + dont_mount(dentry); + d_delete(dentry); + mutex_unlock(&d_inode(parent)->i_mutex); + + dput(dentry); + + mutex_lock(&subsys->su_mutex); + unlink_group(group); + mutex_unlock(&subsys->su_mutex); +} +EXPORT_SYMBOL(configfs_unregister_group); + +/** + * configfs_register_default_group() - allocates and registers a child group + * @parent_group: parent group + * @name: child group name + * @item_type: child item type description + * + * boilerplate to allocate
and register a child group with its parent. We need + * kzalloc'ed memory because child's default_group is initially empty. + * + * Return: allocated config group or ERR_PTR() on error + */ +struct config_group * +configfs_register_default_group(struct config_group *parent_group, + const char *name, + struct config_item_type *item_type) +{ + int ret; + struct config_group *group; + + group = kzalloc(sizeof(*group), GFP_KERNEL); + if (!group) + return ERR_PTR(-ENOMEM); + config_group_init_type_name(group, name, item_type); + + ret = configfs_register_group(parent_group, group); + if (ret) { + kfree(group); + return ERR_PTR(ret); + } + return group; +} +EXPORT_SYMBOL(configfs_register_default_group); + +/** + * configfs_unregister_default_group() - unregisters and frees a child group + * @group: the group to act on + */ +void configfs_unregister_default_group(struct config_group *group) +{ + configfs_unregister_group(group); + kfree(group); +} +EXPORT_SYMBOL(configfs_unregister_default_group); + int configfs_register_subsystem(struct configfs_subsystem *subsys) { int err; diff --git a/include/linux/configfs.h b/include/linux/configfs.h index a8a335b7fce0..758a029011b1 100644 --- a/include/linux/configfs.h +++ b/include/linux/configfs.h @@ -197,6 +197,16 @@ static inline struct configfs_subsystem *to_configfs_subsystem(struct config_gro int configfs_register_subsystem(struct configfs_subsystem *subsys); void configfs_unregister_subsystem(struct configfs_subsystem *subsys); +int configfs_register_group(struct config_group *parent_group, + struct config_group *group); +void configfs_unregister_group(struct config_group *group); + +struct config_group * +configfs_register_default_group(struct config_group *parent_group, + const char *name, + struct config_item_type *item_type); +void configfs_unregister_default_group(struct config_group *group); + /* These functions can sleep and can alloc with GFP_KERNEL */ /* WARNING: These cannot be called underneath configfs callbacks!! */ int configfs_depend_item(struct configfs_subsystem *subsys, struct config_item *target); -- cgit v1.2.3-71-gd317 From 9d8a765211335cfdad464b90fb19f546af5706ae Mon Sep 17 00:00:00 2001 From: Richard Weinberger Date: Fri, 20 Nov 2015 15:57:21 -0800 Subject: kernel/signal.c: unexport sigsuspend() sigsuspend() is used nowhere except in signal.c itself, so we can mark it static and stop polluting the global namespace. But this patch is more than a boring cleanup patch: it fixes a real issue on UserModeLinux. UML has a special console driver to display ttys using xterm, or other terminal emulators, on the host side. Vegard reported that sometimes UML is unable to spawn an xterm and he's facing the following warning: WARNING: CPU: 0 PID: 908 at include/linux/thread_info.h:128 sigsuspend+0xab/0xc0() It turned out that this warning makes absolutely no sense, as the UML xterm code calls sigsuspend() on the host side, or at least tries to. But as the kernel itself offers a sigsuspend() symbol, the linker chose this one instead of the glibc wrapper. Interestingly, this code appeared to work all along, but it always blocked signals on the wrong side. Some recent kernel change made the WARN_ON() trigger and uncovered the bug. It is a wonderful example of how much works by chance on computers.
:-) Fixes: 68f3f16d9ad0f1 ("new helper: sigsuspend()") Signed-off-by: Richard Weinberger Reported-by: Vegard Nossum Tested-by: Vegard Nossum Acked-by: Oleg Nesterov Cc: [3.5+] Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/linux/signal.h | 1 - kernel/signal.c | 2 +- 2 files changed, 1 insertion(+), 2 deletions(-) (limited to 'include/linux') diff --git a/include/linux/signal.h b/include/linux/signal.h index ab1e0392b5ac..92557bbce7e7 100644 --- a/include/linux/signal.h +++ b/include/linux/signal.h @@ -239,7 +239,6 @@ extern int sigprocmask(int, sigset_t *, sigset_t *); extern void set_current_blocked(sigset_t *); extern void __set_current_blocked(const sigset_t *); extern int show_unhandled_signals; -extern int sigsuspend(sigset_t *); struct sigaction { #ifndef __ARCH_HAS_IRIX_SIGACTION diff --git a/kernel/signal.c b/kernel/signal.c index c0b01fe24bbd..f3f1f7a972fd 100644 --- a/kernel/signal.c +++ b/kernel/signal.c @@ -3503,7 +3503,7 @@ SYSCALL_DEFINE0(pause) #endif -int sigsuspend(sigset_t *set) +static int sigsuspend(sigset_t *set) { current->saved_sigmask = current->blocked; set_current_blocked(set); -- cgit v1.2.3-71-gd317 From 21fa8442799945beaca074cb5bcf7cfe24969d59 Mon Sep 17 00:00:00 2001 From: Jeff Layton Date: Fri, 20 Nov 2015 15:57:32 -0800 Subject: mm: fix up sparse warning in gfpflags_allow_blocking sparse says: include/linux/gfp.h:274:26: warning: incorrect type in return expression (different base types) include/linux/gfp.h:274:26: expected bool include/linux/gfp.h:274:26: got restricted gfp_t ...add a forced cast to silence the warning. Signed-off-by: Jeff Layton Cc: Mel Gorman Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/linux/gfp.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'include/linux') diff --git a/include/linux/gfp.h b/include/linux/gfp.h index 6523109e136d..8942af0813e3 100644 --- a/include/linux/gfp.h +++ b/include/linux/gfp.h @@ -271,7 +271,7 @@ static inline int gfpflags_to_migratetype(const gfp_t gfp_flags) static inline bool gfpflags_allow_blocking(const gfp_t gfp_flags) { - return gfp_flags & __GFP_DIRECT_RECLAIM; + return (bool __force)(gfp_flags & __GFP_DIRECT_RECLAIM); } #ifdef CONFIG_HIGHMEM -- cgit v1.2.3-71-gd317 From 6b2a3d628aa752f0ab825fc6d4d07b09e274d1c1 Mon Sep 17 00:00:00 2001 From: Peter Hurley Date: Sun, 8 Nov 2015 08:52:31 -0500 Subject: tty: audit: Fix audit source MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit The data to audit/record is in the 'from' buffer (i.e., the input read buffer).
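For readers skimming the diff below, the shape of the fixed helper is roughly the following (a sketch based on the n_tty.c hunk; the parameter list is reconstructed, since the hunk only shows its first line):

/* Sketch of the fixed helper: the audit hook must see the kernel-side
 * source buffer. 'to' is the userspace destination, not the data
 * actually being delivered.
 */
static inline int tty_copy_to_user(struct tty_struct *tty, void __user *to,
				   const void *from, unsigned long n)
{
	struct n_tty_data *ldata = tty->disc_data;

	tty_audit_add_data(tty, from, n, ldata->icanon);	/* was 'to' */
	return copy_to_user(to, from, n);
}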
Fixes: 72586c6061ab ("n_tty: Fix auditing support for cannonical mode") Cc: stable # 4.1+ Cc: Miloslav Trmač Signed-off-by: Peter Hurley Acked-by: Laura Abbott Signed-off-by: Greg Kroah-Hartman --- drivers/tty/n_tty.c | 2 +- drivers/tty/tty_audit.c | 2 +- include/linux/tty.h | 6 +++--- 3 files changed, 5 insertions(+), 5 deletions(-) (limited to 'include/linux') diff --git a/drivers/tty/n_tty.c b/drivers/tty/n_tty.c index 13844261cd5f..ed776149261e 100644 --- a/drivers/tty/n_tty.c +++ b/drivers/tty/n_tty.c @@ -169,7 +169,7 @@ static inline int tty_copy_to_user(struct tty_struct *tty, { struct n_tty_data *ldata = tty->disc_data; - tty_audit_add_data(tty, to, n, ldata->icanon); + tty_audit_add_data(tty, from, n, ldata->icanon); return copy_to_user(to, from, n); } diff --git a/drivers/tty/tty_audit.c b/drivers/tty/tty_audit.c index 90ca082935f6..3d245cd3d8e6 100644 --- a/drivers/tty/tty_audit.c +++ b/drivers/tty/tty_audit.c @@ -265,7 +265,7 @@ static struct tty_audit_buf *tty_audit_buf_get(struct tty_struct *tty, * * Audit @data of @size from @tty, if necessary. */ -void tty_audit_add_data(struct tty_struct *tty, unsigned char *data, +void tty_audit_add_data(struct tty_struct *tty, const void *data, size_t size, unsigned icanon) { struct tty_audit_buf *buf; diff --git a/include/linux/tty.h b/include/linux/tty.h index 5b04b0a5375b..5e31f1b99037 100644 --- a/include/linux/tty.h +++ b/include/linux/tty.h @@ -607,7 +607,7 @@ extern void n_tty_inherit_ops(struct tty_ldisc_ops *ops); /* tty_audit.c */ #ifdef CONFIG_AUDIT -extern void tty_audit_add_data(struct tty_struct *tty, unsigned char *data, +extern void tty_audit_add_data(struct tty_struct *tty, const void *data, size_t size, unsigned icanon); extern void tty_audit_exit(void); extern void tty_audit_fork(struct signal_struct *sig); @@ -615,8 +615,8 @@ extern void tty_audit_tiocsti(struct tty_struct *tty, char ch); extern void tty_audit_push(struct tty_struct *tty); extern int tty_audit_push_current(void); #else -static inline void tty_audit_add_data(struct tty_struct *tty, - unsigned char *data, size_t size, unsigned icanon) +static inline void tty_audit_add_data(struct tty_struct *tty, const void *data, + size_t size, unsigned icanon) { } static inline void tty_audit_tiocsti(struct tty_struct *tty, char ch) -- cgit v1.2.3-71-gd317 From 865762a8119e74b5f0e236d2d8eaaf8be9292a06 Mon Sep 17 00:00:00 2001 From: Jesper Dangaard Brouer Date: Fri, 20 Nov 2015 15:57:58 -0800 Subject: slab/slub: adjust kmem_cache_alloc_bulk API Adjust the kmem_cache_alloc_bulk API before we have any real users. Adjust the API to return type 'int' instead of the previous type 'bool'. This is done to allow future extension of the bulk alloc API. A future extension could be to allow SLUB to stop at a page boundary, when specified by a flag, and then return the number of objects. The advantage of this approach is that it would make it easier to run bulk alloc without local IRQs disabled, with a cmpxchg "stealing" the entire c->freelist or page->freelist. To avoid overshooting we would stop processing at a slab-page boundary. Otherwise we would always end up returning some objects at the cost of another cmpxchg. To stay compatible with future users of this API linking against an older kernel when using the new flag, we need to return the number of allocated objects with this API change.
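As a usage illustration under the new contract, a hypothetical caller might look like the sketch below ('my_cache' and the batch size are invented names for illustration; with the current implementation the return value is all-or-nothing, either 0 or the full request):

/* Hypothetical batch allocation; not code from this patch. */
static int fill_batch(struct kmem_cache *my_cache, void **objs)
{
	int n = kmem_cache_alloc_bulk(my_cache, GFP_KERNEL, 16, objs);

	if (!n)
		return -ENOMEM;	/* nothing was allocated */
	/* once a partial-alloc flag exists, 0 < n < 16 becomes legal,
	 * so callers should size their loops on n, not on 16 */
	return n;
}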
Signed-off-by: Jesper Dangaard Brouer Cc: Vladimir Davydov Acked-by: Christoph Lameter Cc: Pekka Enberg Cc: David Rientjes Cc: Joonsoo Kim Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/linux/slab.h | 2 +- mm/slab.c | 2 +- mm/slab.h | 2 +- mm/slab_common.c | 6 +++--- mm/slob.c | 2 +- mm/slub.c | 8 ++++---- 6 files changed, 11 insertions(+), 11 deletions(-) (limited to 'include/linux') diff --git a/include/linux/slab.h b/include/linux/slab.h index 96940772bb92..2037a861e367 100644 --- a/include/linux/slab.h +++ b/include/linux/slab.h @@ -316,7 +316,7 @@ void kmem_cache_free(struct kmem_cache *, void *); * Note that interrupts must be enabled when calling these functions. */ void kmem_cache_free_bulk(struct kmem_cache *, size_t, void **); -bool kmem_cache_alloc_bulk(struct kmem_cache *, gfp_t, size_t, void **); +int kmem_cache_alloc_bulk(struct kmem_cache *, gfp_t, size_t, void **); #ifdef CONFIG_NUMA void *__kmalloc_node(size_t size, gfp_t flags, int node) __assume_kmalloc_alignment; diff --git a/mm/slab.c b/mm/slab.c index e0819fa96559..4765c97ce690 100644 --- a/mm/slab.c +++ b/mm/slab.c @@ -3419,7 +3419,7 @@ void kmem_cache_free_bulk(struct kmem_cache *s, size_t size, void **p) } EXPORT_SYMBOL(kmem_cache_free_bulk); -bool kmem_cache_alloc_bulk(struct kmem_cache *s, gfp_t flags, size_t size, +int kmem_cache_alloc_bulk(struct kmem_cache *s, gfp_t flags, size_t size, void **p) { return __kmem_cache_alloc_bulk(s, flags, size, p); diff --git a/mm/slab.h b/mm/slab.h index 27492eb678f7..7b6087197997 100644 --- a/mm/slab.h +++ b/mm/slab.h @@ -170,7 +170,7 @@ ssize_t slabinfo_write(struct file *file, const char __user *buffer, * may be allocated or freed using these operations. */ void __kmem_cache_free_bulk(struct kmem_cache *, size_t, void **); -bool __kmem_cache_alloc_bulk(struct kmem_cache *, gfp_t, size_t, void **); +int __kmem_cache_alloc_bulk(struct kmem_cache *, gfp_t, size_t, void **); #ifdef CONFIG_MEMCG_KMEM /* diff --git a/mm/slab_common.c b/mm/slab_common.c index d88e97c10a2e..3c6a86b4ec25 100644 --- a/mm/slab_common.c +++ b/mm/slab_common.c @@ -112,7 +112,7 @@ void __kmem_cache_free_bulk(struct kmem_cache *s, size_t nr, void **p) kmem_cache_free(s, p[i]); } -bool __kmem_cache_alloc_bulk(struct kmem_cache *s, gfp_t flags, size_t nr, +int __kmem_cache_alloc_bulk(struct kmem_cache *s, gfp_t flags, size_t nr, void **p) { size_t i; @@ -121,10 +121,10 @@ bool __kmem_cache_alloc_bulk(struct kmem_cache *s, gfp_t flags, size_t nr, void *x = p[i] = kmem_cache_alloc(s, flags); if (!x) { __kmem_cache_free_bulk(s, i, p); - return false; + return 0; } } - return true; + return i; } #ifdef CONFIG_MEMCG_KMEM diff --git a/mm/slob.c b/mm/slob.c index 0d7e5df74d1f..17e8f8cc7c53 100644 --- a/mm/slob.c +++ b/mm/slob.c @@ -617,7 +617,7 @@ void kmem_cache_free_bulk(struct kmem_cache *s, size_t size, void **p) } EXPORT_SYMBOL(kmem_cache_free_bulk); -bool kmem_cache_alloc_bulk(struct kmem_cache *s, gfp_t flags, size_t size, +int kmem_cache_alloc_bulk(struct kmem_cache *s, gfp_t flags, size_t size, void **p) { return __kmem_cache_alloc_bulk(s, flags, size, p); diff --git a/mm/slub.c b/mm/slub.c index 34847044dfe5..46997517406e 100644 --- a/mm/slub.c +++ b/mm/slub.c @@ -2909,8 +2909,8 @@ void kmem_cache_free_bulk(struct kmem_cache *orig_s, size_t size, void **p) EXPORT_SYMBOL(kmem_cache_free_bulk); /* Note that interrupts must be enabled when calling this function. 
*/ -bool kmem_cache_alloc_bulk(struct kmem_cache *s, gfp_t flags, size_t size, - void **p) +int kmem_cache_alloc_bulk(struct kmem_cache *s, gfp_t flags, size_t size, + void **p) { struct kmem_cache_cpu *c; int i; @@ -2959,12 +2959,12 @@ bool kmem_cache_alloc_bulk(struct kmem_cache *s, gfp_t flags, size_t size, /* memcg and kmem_cache debug support */ slab_post_alloc_hook(s, flags, size, p); - return true; + return i; error: local_irq_enable(); slab_post_alloc_hook(s, flags, i, p); __kmem_cache_free_bulk(s, i, p); - return false; + return 0; } EXPORT_SYMBOL(kmem_cache_alloc_bulk); -- cgit v1.2.3-71-gd317 From c86b3de8c8b02d7e474fdc002c8df533b844524c Mon Sep 17 00:00:00 2001 From: Arnd Bergmann Date: Tue, 17 Nov 2015 17:48:52 +0100 Subject: thermal: fix thermal_zone_bind_cooling_device prototype When the prototype for thermal_zone_bind_cooling_device changed, the static inline wrapper function was left alone, which in theory can cause build warnings: I have seen this error in the past: drivers/thermal/db8500_thermal.c: In function 'db8500_cdev_bind': drivers/thermal/db8500_thermal.c:78:9: error: too many arguments to function 'thermal_zone_bind_cooling_device' ret = thermal_zone_bind_cooling_device(thermal, i, cdev, while this one no longer shows up, there is no doubt that the prototype is still wrong, so let's just fix it anyway. Signed-off-by: Arnd Bergmann Fixes: 6cd9e9f629f1 ("thermal: of: fix cooling device weights in device tree") Signed-off-by: Eduardo Valentin --- include/linux/thermal.h | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) (limited to 'include/linux') diff --git a/include/linux/thermal.h b/include/linux/thermal.h index 4014a59828fc..613c29bd6baf 100644 --- a/include/linux/thermal.h +++ b/include/linux/thermal.h @@ -438,7 +438,8 @@ static inline void thermal_zone_device_unregister( static inline int thermal_zone_bind_cooling_device( struct thermal_zone_device *tz, int trip, struct thermal_cooling_device *cdev, - unsigned long upper, unsigned long lower) + unsigned long upper, unsigned long lower, + unsigned int weight) { return -ENODEV; } static inline int thermal_zone_unbind_cooling_device( struct thermal_zone_device *tz, int trip, -- cgit v1.2.3-71-gd317 From 91ab4b4d16e6649fbbf65f303c0c4e20ed680bd1 Mon Sep 17 00:00:00 2001 From: Jeff Layton Date: Thu, 19 Nov 2015 14:30:26 -0500 Subject: nfs: use sliding delay when LAYOUTGET gets NFS4ERR_DELAY When LAYOUTGET gets NFS4ERR_DELAY, we currently will wait 15s before retrying the call. That is a _very_ long time, so add a timeout value to struct nfs4_layoutget and pass nfs4_async_handle_error a pointer to it. This allows the RPC engine to use a sliding delay window, instead of a 15s delay. 
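The "sliding delay window" amounts to a capped, growing retry delay seeded from the new per-call field; a rough sketch of the idea follows (illustration only, not the RPC engine's actual code; the constant names mirror the NFS4_POLL_RETRY_MIN/MAX pattern used in fs/nfs/nfs4proc.c):

/* Each NFS4ERR_DELAY grows the delay for the next retry instead of
 * always sleeping the 15s maximum.
 */
static long layoutget_retry_delay(long *timeout)
{
	long delay = *timeout;

	if (delay <= 0)
		delay = NFS4_POLL_RETRY_MIN;	/* start small */
	if (delay > NFS4_POLL_RETRY_MAX)
		delay = NFS4_POLL_RETRY_MAX;	/* cap at 15s */
	*timeout = delay << 1;			/* widen for next time */
	return delay;
}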
Signed-off-by: Jeff Layton Signed-off-by: Trond Myklebust --- fs/nfs/nfs4proc.c | 2 +- include/linux/nfs_xdr.h | 1 + 2 files changed, 2 insertions(+), 1 deletion(-) (limited to 'include/linux') diff --git a/fs/nfs/nfs4proc.c b/fs/nfs/nfs4proc.c index 765a03559363..89818036f035 100644 --- a/fs/nfs/nfs4proc.c +++ b/fs/nfs/nfs4proc.c @@ -7866,7 +7866,7 @@ static void nfs4_layoutget_done(struct rpc_task *task, void *calldata) spin_unlock(&inode->i_lock); goto out_restart; } - if (nfs4_async_handle_error(task, server, state, NULL) == -EAGAIN) + if (nfs4_async_handle_error(task, server, state, &lgp->timeout) == -EAGAIN) goto out_restart; out: dprintk("<-- %s\n", __func__); diff --git a/include/linux/nfs_xdr.h b/include/linux/nfs_xdr.h index 570d630f98ae..11bbae44f4cb 100644 --- a/include/linux/nfs_xdr.h +++ b/include/linux/nfs_xdr.h @@ -251,6 +251,7 @@ struct nfs4_layoutget { struct nfs4_layoutget_res res; struct rpc_cred *cred; gfp_t gfp_flags; + long timeout; }; struct nfs4_getdeviceinfo_args { -- cgit v1.2.3-71-gd317 From fbc416ff86183e2203cdf975e2881d7c164b0271 Mon Sep 17 00:00:00 2001 From: Arnd Bergmann Date: Fri, 20 Nov 2015 12:12:21 +0100 Subject: arm64: fix building without CONFIG_UID16 As reported by Michal Simek, building an ARM64 kernel with CONFIG_UID16 disabled currently fails because the system call table still needs to reference the individual function entry points that are provided by kernel/sys_ni.c in this case, and the declarations are hidden inside of #ifdef CONFIG_UID16: arch/arm64/include/asm/unistd32.h:57:8: error: 'sys_lchown16' undeclared here (not in a function) __SYSCALL(__NR_lchown, sys_lchown16) I believe this problem only exists on ARM64, because older architectures tend to not need declarations when their system call table is built in assembly code, while newer architectures tend to not need UID16 support. ARM64 only uses these system calls for compatibility with 32-bit ARM binaries. This changes the CONFIG_UID16 check into CONFIG_HAVE_UID16, which is set unconditionally on ARM64 with CONFIG_COMPAT, so we see the declarations whenever we need them, but otherwise the behavior is unchanged. 
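For context, the declarations must stay visible even when the functions are compiled out because the compat syscall table names the entry points unconditionally, while kernel/sys_ni.c supplies -ENOSYS stubs; a paraphrased sketch of the two sides (not a new hunk from this patch):

/* arch/arm64/include/asm/unistd32.h: the table always references the
 * symbol, so its declaration must be in scope whenever COMPAT is set.
 */
__SYSCALL(__NR_lchown, sys_lchown16)

/* kernel/sys_ni.c: with CONFIG_UID16=n the definition falls back to a
 * stub that returns -ENOSYS at run time.
 */
cond_syscall(sys_lchown16);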
Fixes: af1839eb4bd4 ("Kconfig: clean up the long arch list for the UID16 config option") Signed-off-by: Arnd Bergmann Acked-by: Will Deacon Cc: stable@vger.kernel.org Signed-off-by: Catalin Marinas --- include/linux/syscalls.h | 2 +- include/linux/types.h | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) (limited to 'include/linux') diff --git a/include/linux/syscalls.h b/include/linux/syscalls.h index a156b82dd14c..c2b66a277e98 100644 --- a/include/linux/syscalls.h +++ b/include/linux/syscalls.h @@ -524,7 +524,7 @@ asmlinkage long sys_chown(const char __user *filename, asmlinkage long sys_lchown(const char __user *filename, uid_t user, gid_t group); asmlinkage long sys_fchown(unsigned int fd, uid_t user, gid_t group); -#ifdef CONFIG_UID16 +#ifdef CONFIG_HAVE_UID16 asmlinkage long sys_chown16(const char __user *filename, old_uid_t user, old_gid_t group); asmlinkage long sys_lchown16(const char __user *filename, diff --git a/include/linux/types.h b/include/linux/types.h index 70d8500bddf1..70dd3dfde631 100644 --- a/include/linux/types.h +++ b/include/linux/types.h @@ -35,7 +35,7 @@ typedef __kernel_gid16_t gid16_t; typedef unsigned long uintptr_t; -#ifdef CONFIG_UID16 +#ifdef CONFIG_HAVE_UID16 /* This is defined by include/asm-{arch}/posix_types.h */ typedef __kernel_old_uid_t old_uid_t; typedef __kernel_old_gid_t old_gid_t; -- cgit v1.2.3-71-gd317 From 9458ceab49179b7fd2d5192fd9dcf316ca195dc0 Mon Sep 17 00:00:00 2001 From: Florian Fainelli Date: Tue, 24 Nov 2015 15:30:21 -0800 Subject: net: phy: bcm7xxx: Add entry for Broadcom BCM7435 Add a PHY entry for the Broadcom BCM7435 chips; this is a 40nm-generation Ethernet PHY which is analogous to its 7425 and 7429 counterparts. Signed-off-by: Florian Fainelli Signed-off-by: David S. Miller --- drivers/net/phy/bcm7xxx.c | 14 ++++++++++++++ include/linux/brcmphy.h | 1 + 2 files changed, 15 insertions(+) (limited to 'include/linux') diff --git a/drivers/net/phy/bcm7xxx.c b/drivers/net/phy/bcm7xxx.c index 03d4809a9126..d4083c381cd1 100644 --- a/drivers/net/phy/bcm7xxx.c +++ b/drivers/net/phy/bcm7xxx.c @@ -360,6 +360,19 @@ static struct phy_driver bcm7xxx_driver[] = { .suspend = bcm7xxx_suspend, .resume = bcm7xxx_config_init, .driver = { .owner = THIS_MODULE }, +}, { + .phy_id = PHY_ID_BCM7435, + .phy_id_mask = 0xfffffff0, + .name = "Broadcom BCM7435", + .features = PHY_GBIT_FEATURES | + SUPPORTED_Pause | SUPPORTED_Asym_Pause, + .flags = PHY_IS_INTERNAL, + .config_init = bcm7xxx_config_init, + .config_aneg = genphy_config_aneg, + .read_status = genphy_read_status, + .suspend = bcm7xxx_suspend, + .resume = bcm7xxx_config_init, + .driver = { .owner = THIS_MODULE }, }, { .phy_id = PHY_BCM_OUI_4, .phy_id_mask = 0xffff0000, @@ -395,6 +408,7 @@ static struct mdio_device_id __maybe_unused bcm7xxx_tbl[] = { { PHY_ID_BCM7425, 0xfffffff0, }, { PHY_ID_BCM7429, 0xfffffff0, }, { PHY_ID_BCM7439, 0xfffffff0, }, + { PHY_ID_BCM7435, 0xfffffff0, }, { PHY_ID_BCM7445, 0xfffffff0, }, { PHY_BCM_OUI_4, 0xffff0000 }, { PHY_BCM_OUI_5, 0xffffff00 }, diff --git a/include/linux/brcmphy.h b/include/linux/brcmphy.h index 59f4a7304419..f0ba9c2ec639 100644 --- a/include/linux/brcmphy.h +++ b/include/linux/brcmphy.h @@ -26,6 +26,7 @@ #define PHY_ID_BCM7366 0x600d8490 #define PHY_ID_BCM7425 0x600d86b0 #define PHY_ID_BCM7429 0x600d8730 +#define PHY_ID_BCM7435 0x600d8750 #define PHY_ID_BCM7439 0x600d8480 #define PHY_ID_BCM7439_2 0xae025080 #define PHY_ID_BCM7445 0x600d8510 -- cgit v1.2.3-71-gd317 From c9da161c6517ba12154059d3b965c2cbaf16f90f Mon Sep 17 00:00:00 2001 From:
Daniel Borkmann Date: Tue, 24 Nov 2015 21:28:15 +0100 Subject: bpf: fix clearing on persistent program array maps Currently, when having map file descriptors pointing to program arrays, there's still the issue that we unconditionally flush program array contents via bpf_fd_array_map_clear() in bpf_map_release(). This happens when such a file descriptor is released and is independent of the map's refcount. Having this flush independent of the refcount is for a reason: there can be arbitrary complex dependency chains among tail calls, also circular ones (direct or indirect, nesting limit determined during runtime), and we need to make sure that the map drops all references to eBPF programs it holds, so that the map's refcount can eventually drop to zero and initiate its freeing. Btw, a walk of the whole dependency graph would not be possible for various reasons, one being complexity and another one inconsistency, i.e. new programs can be added to parts of the graph at any time, so there's no guaranteed consistent state for the time of such a walk. Now, the program array pinning itself works, but the issue is that each derived file descriptor on close would nevertheless call unconditionally into bpf_fd_array_map_clear(). Instead, keep track of users and postpone this flush until the last reference to a user is dropped. As this only concerns a subset of references (f.e. a prog array could hold a program that itself has reference on the prog array holding it, etc), we need to track them separately. Short analysis on the refcounting: on map creation time usercnt will be one, so there's no change in behaviour for bpf_map_release(), if unpinned. If we already fail in map_create(), we are immediately freed, and no file descriptor has been made public yet. In bpf_obj_pin_user(), we need to probe for a possible map in bpf_fd_probe_obj() already with a usercnt reference, so before we drop the reference on the fd with fdput(). Therefore, if actual pinning fails, we need to drop that reference again in bpf_any_put(), otherwise we keep holding it. When last reference drops on the inode, the bpf_any_put() in bpf_evict_inode() will take care of dropping the usercnt again. In the bpf_obj_get_user() case, the bpf_any_get() will grab a reference on the usercnt, still at a time when we have the reference on the path. Should we later on fail to grab a new file descriptor, bpf_any_put() will drop it, otherwise we hold it until bpf_map_release() time. Joint work with Alexei. Fixes: b2197755b263 ("bpf: add support for persistent maps/progs") Signed-off-by: Daniel Borkmann Signed-off-by: Alexei Starovoitov Signed-off-by: David S. 
Miller --- include/linux/bpf.h | 5 ++++- kernel/bpf/inode.c | 6 +++--- kernel/bpf/syscall.c | 36 +++++++++++++++++++++++++----------- kernel/bpf/verifier.c | 3 +-- 4 files changed, 33 insertions(+), 17 deletions(-) (limited to 'include/linux') diff --git a/include/linux/bpf.h b/include/linux/bpf.h index de464e6683b6..83d1926c61e4 100644 --- a/include/linux/bpf.h +++ b/include/linux/bpf.h @@ -40,6 +40,7 @@ struct bpf_map { struct user_struct *user; const struct bpf_map_ops *ops; struct work_struct work; + atomic_t usercnt; }; struct bpf_map_type_list { @@ -167,8 +168,10 @@ struct bpf_prog *bpf_prog_get(u32 ufd); void bpf_prog_put(struct bpf_prog *prog); void bpf_prog_put_rcu(struct bpf_prog *prog); -struct bpf_map *bpf_map_get(u32 ufd); +struct bpf_map *bpf_map_get_with_uref(u32 ufd); struct bpf_map *__bpf_map_get(struct fd f); +void bpf_map_inc(struct bpf_map *map, bool uref); +void bpf_map_put_with_uref(struct bpf_map *map); void bpf_map_put(struct bpf_map *map); extern int sysctl_unprivileged_bpf_disabled; diff --git a/kernel/bpf/inode.c b/kernel/bpf/inode.c index be6d726e31c9..5a8a797d50b7 100644 --- a/kernel/bpf/inode.c +++ b/kernel/bpf/inode.c @@ -34,7 +34,7 @@ static void *bpf_any_get(void *raw, enum bpf_type type) atomic_inc(&((struct bpf_prog *)raw)->aux->refcnt); break; case BPF_TYPE_MAP: - atomic_inc(&((struct bpf_map *)raw)->refcnt); + bpf_map_inc(raw, true); break; default: WARN_ON_ONCE(1); @@ -51,7 +51,7 @@ static void bpf_any_put(void *raw, enum bpf_type type) bpf_prog_put(raw); break; case BPF_TYPE_MAP: - bpf_map_put(raw); + bpf_map_put_with_uref(raw); break; default: WARN_ON_ONCE(1); @@ -64,7 +64,7 @@ static void *bpf_fd_probe_obj(u32 ufd, enum bpf_type *type) void *raw; *type = BPF_TYPE_MAP; - raw = bpf_map_get(ufd); + raw = bpf_map_get_with_uref(ufd); if (IS_ERR(raw)) { *type = BPF_TYPE_PROG; raw = bpf_prog_get(ufd); diff --git a/kernel/bpf/syscall.c b/kernel/bpf/syscall.c index 0d3313d02a7e..4a8f3c1d7da6 100644 --- a/kernel/bpf/syscall.c +++ b/kernel/bpf/syscall.c @@ -82,6 +82,14 @@ static void bpf_map_free_deferred(struct work_struct *work) map->ops->map_free(map); } +static void bpf_map_put_uref(struct bpf_map *map) +{ + if (atomic_dec_and_test(&map->usercnt)) { + if (map->map_type == BPF_MAP_TYPE_PROG_ARRAY) + bpf_fd_array_map_clear(map); + } +} + /* decrement map refcnt and schedule it for freeing via workqueue * (unrelying map implementation ops->map_free() might sleep) */ @@ -93,17 +101,15 @@ void bpf_map_put(struct bpf_map *map) } } -static int bpf_map_release(struct inode *inode, struct file *filp) +void bpf_map_put_with_uref(struct bpf_map *map) { - struct bpf_map *map = filp->private_data; - - if (map->map_type == BPF_MAP_TYPE_PROG_ARRAY) - /* prog_array stores refcnt-ed bpf_prog pointers - * release them all when user space closes prog_array_fd - */ - bpf_fd_array_map_clear(map); - + bpf_map_put_uref(map); bpf_map_put(map); +} + +static int bpf_map_release(struct inode *inode, struct file *filp) +{ + bpf_map_put_with_uref(filp->private_data); return 0; } @@ -142,6 +148,7 @@ static int map_create(union bpf_attr *attr) return PTR_ERR(map); atomic_set(&map->refcnt, 1); + atomic_set(&map->usercnt, 1); err = bpf_map_charge_memlock(map); if (err) @@ -174,7 +181,14 @@ struct bpf_map *__bpf_map_get(struct fd f) return f.file->private_data; } -struct bpf_map *bpf_map_get(u32 ufd) +void bpf_map_inc(struct bpf_map *map, bool uref) +{ + atomic_inc(&map->refcnt); + if (uref) + atomic_inc(&map->usercnt); +} + +struct bpf_map *bpf_map_get_with_uref(u32 ufd) { struct fd f = 
fdget(ufd); struct bpf_map *map; @@ -183,7 +197,7 @@ struct bpf_map *bpf_map_get(u32 ufd) if (IS_ERR(map)) return map; - atomic_inc(&map->refcnt); + bpf_map_inc(map, true); fdput(f); return map; diff --git a/kernel/bpf/verifier.c b/kernel/bpf/verifier.c index c6073056badf..a7945d10b378 100644 --- a/kernel/bpf/verifier.c +++ b/kernel/bpf/verifier.c @@ -2021,8 +2021,7 @@ static int replace_map_fd_with_map_ptr(struct verifier_env *env) * will be used by the valid program until it's unloaded * and all maps are released in free_bpf_prog_info() */ - atomic_inc(&map->refcnt); - + bpf_map_inc(map, false); fdput(f); next_insn: insn++; -- cgit v1.2.3-71-gd317 From 7c7a0e945349a3d0d497d7f32db6ed33d4031110 Mon Sep 17 00:00:00 2001 From: Gabriele Paoloni Date: Wed, 11 Nov 2015 09:12:25 +0800 Subject: ARM/PCI: Move align_resource function pointer to pci_host_bridge structure Commit b3a72384fe29 ("ARM/PCI: Replace pci_sys_data->align_resource with global function pointer") introduced an ARM-specific align_resource() function pointer. This is not portable to other arches and doesn't work for platforms with two different PCIe host bridge controllers. Move the function pointer to the pci_host_bridge structure so each host bridge driver can specify its own align_resource() function. Signed-off-by: Gabriele Paoloni Signed-off-by: Bjorn Helgaas Reviewed-by: Arnd Bergmann --- arch/arm/kernel/bios32.c | 19 +++++++++++-------- drivers/pci/pci.h | 2 -- include/linux/pci.h | 9 +++++++++ 3 files changed, 20 insertions(+), 10 deletions(-) (limited to 'include/linux') diff --git a/arch/arm/kernel/bios32.c b/arch/arm/kernel/bios32.c index 6551d28c27e6..066f7f9ba411 100644 --- a/arch/arm/kernel/bios32.c +++ b/arch/arm/kernel/bios32.c @@ -17,11 +17,6 @@ #include static int debug_pci; -static resource_size_t (*align_resource)(struct pci_dev *dev, - const struct resource *res, - resource_size_t start, - resource_size_t size, - resource_size_t align) = NULL; /* * We can't use pci_get_device() here since we are @@ -461,7 +456,6 @@ static void pcibios_init_hw(struct device *parent, struct hw_pci *hw, sys->busnr = busnr; sys->swizzle = hw->swizzle; sys->map_irq = hw->map_irq; - align_resource = hw->align_resource; INIT_LIST_HEAD(&sys->resources); if (hw->private_data) @@ -470,6 +464,8 @@ static void pcibios_init_hw(struct device *parent, struct hw_pci *hw, ret = hw->setup(nr, sys); if (ret > 0) { + struct pci_host_bridge *host_bridge; + ret = pcibios_init_resources(nr, sys); if (ret) { kfree(sys); @@ -491,6 +487,9 @@ static void pcibios_init_hw(struct device *parent, struct hw_pci *hw, busnr = sys->bus->busn_res.end + 1; list_add(&sys->node, head); + + host_bridge = pci_find_host_bridge(sys->bus); + host_bridge->align_resource = hw->align_resource; } else { kfree(sys); if (ret < 0) @@ -578,14 +577,18 @@ resource_size_t pcibios_align_resource(void *data, const struct resource *res, { struct pci_dev *dev = data; resource_size_t start = res->start; + struct pci_host_bridge *host_bridge; if (res->flags & IORESOURCE_IO && start & 0x300) start = (start + 0x3ff) & ~0x3ff; start = (start + align - 1) & ~(align - 1); - if (align_resource) - return align_resource(dev, res, start, size, align); + host_bridge = pci_find_host_bridge(dev->bus); + + if (host_bridge->align_resource) + return host_bridge->align_resource(dev, res, + start, size, align); return start; } diff --git a/drivers/pci/pci.h b/drivers/pci/pci.h index fd2f03fa53f3..d390fc1475ec 100644 --- a/drivers/pci/pci.h +++ b/drivers/pci/pci.h @@ -337,6 +337,4 @@ static inline int 
pci_dev_specific_reset(struct pci_dev *dev, int probe) } #endif -struct pci_host_bridge *pci_find_host_bridge(struct pci_bus *bus); - #endif /* DRIVERS_PCI_H */ diff --git a/include/linux/pci.h b/include/linux/pci.h index e828e7b4afec..6ae25aae88fd 100644 --- a/include/linux/pci.h +++ b/include/linux/pci.h @@ -412,9 +412,18 @@ struct pci_host_bridge { void (*release_fn)(struct pci_host_bridge *); void *release_data; unsigned int ignore_reset_delay:1; /* for entire hierarchy */ + /* Resource alignment requirements */ + resource_size_t (*align_resource)(struct pci_dev *dev, + const struct resource *res, + resource_size_t start, + resource_size_t size, + resource_size_t align); }; #define to_pci_host_bridge(n) container_of(n, struct pci_host_bridge, dev) + +struct pci_host_bridge *pci_find_host_bridge(struct pci_bus *bus); + void pci_set_host_bridge_release(struct pci_host_bridge *bridge, void (*release_fn)(struct pci_host_bridge *), void *release_data); -- cgit v1.2.3-71-gd317 From 3a66d7dca186ebdef9b0bf55e216778fa598062c Mon Sep 17 00:00:00 2001 From: Bart Van Assche Date: Thu, 22 Oct 2015 16:02:14 -0700 Subject: kref: Remove kref_put_spinlock_irqsave() The last user is gone. Hence remove this function. Signed-off-by: Bart Van Assche Cc: Greg Kroah-Hartman Cc: Christoph Hellwig Cc: Joern Engel Signed-off-by: Nicholas Bellinger --- include/linux/kref.h | 33 --------------------------------- 1 file changed, 33 deletions(-) (limited to 'include/linux') diff --git a/include/linux/kref.h b/include/linux/kref.h index 484604d184be..e15828fd71f1 100644 --- a/include/linux/kref.h +++ b/include/linux/kref.h @@ -19,7 +19,6 @@ #include #include #include -#include struct kref { atomic_t refcount; @@ -99,38 +98,6 @@ static inline int kref_put(struct kref *kref, void (*release)(struct kref *kref) return kref_sub(kref, 1, release); } -/** - * kref_put_spinlock_irqsave - decrement refcount for object. - * @kref: object. - * @release: pointer to the function that will clean up the object when the - * last reference to the object is released. - * This pointer is required, and it is not acceptable to pass kfree - * in as this function. - * @lock: lock to take in release case - * - * Behaves identical to kref_put with one exception. If the reference count - * drops to zero, the lock will be taken atomically wrt dropping the reference - * count. The release function has to call spin_unlock() without _irqrestore. - */ -static inline int kref_put_spinlock_irqsave(struct kref *kref, - void (*release)(struct kref *kref), - spinlock_t *lock) -{ - unsigned long flags; - - WARN_ON(release == NULL); - if (atomic_add_unless(&kref->refcount, -1, 1)) - return 0; - spin_lock_irqsave(lock, flags); - if (atomic_dec_and_test(&kref->refcount)) { - release(kref); - local_irq_restore(flags); - return 1; - } - spin_unlock_irqrestore(lock, flags); - return 0; -} - static inline int kref_put_mutex(struct kref *kref, void (*release)(struct kref *kref), struct mutex *lock) -- cgit v1.2.3-71-gd317 From 08236c6bb2980561fba657c58fdc76f2865f236c Mon Sep 17 00:00:00 2001 From: Matias Bjørling Date: Sat, 28 Nov 2015 16:49:27 +0100 Subject: lightnvm: unconverted ppa returned in get_bb_tbl MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit The get_bb_tbl function takes ppa as a generic address, which is converted to the ppa device address within the device driver. When the update_bbtbl callback is called from get_bb_tbl, the device specific ppa is used, instead of the generic ppa. 
Make sure to pass the generic ppa. Signed-off-by: Matias Bjørling Signed-off-by: Jens Axboe --- drivers/lightnvm/gennvm.c | 3 +-- drivers/nvme/host/lightnvm.c | 4 +++- include/linux/lightnvm.h | 2 +- 3 files changed, 5 insertions(+), 4 deletions(-) (limited to 'include/linux') diff --git a/drivers/lightnvm/gennvm.c b/drivers/lightnvm/gennvm.c index 3969a9875e59..35dde84b71e9 100644 --- a/drivers/lightnvm/gennvm.c +++ b/drivers/lightnvm/gennvm.c @@ -75,7 +75,6 @@ static int gennvm_block_bb(struct ppa_addr ppa, int nr_blocks, u8 *blks, struct nvm_block *blk; int i; - ppa = dev_to_generic_addr(gn->dev, ppa); lun = &gn->luns[(dev->nr_luns * ppa.g.ch) + ppa.g.lun]; for (i = 0; i < nr_blocks; i++) { @@ -187,7 +186,7 @@ static int gennvm_blocks_init(struct nvm_dev *dev, struct gen_nvm *gn) ppa.g.lun = lun->vlun.id; ppa = generic_to_dev_addr(dev, ppa); - ret = dev->ops->get_bb_tbl(dev->q, ppa, + ret = dev->ops->get_bb_tbl(dev, ppa, dev->blks_per_lun, gennvm_block_bb, gn); if (ret) diff --git a/drivers/nvme/host/lightnvm.c b/drivers/nvme/host/lightnvm.c index b9e5cc74053f..06c336410235 100644 --- a/drivers/nvme/host/lightnvm.c +++ b/drivers/nvme/host/lightnvm.c @@ -355,10 +355,11 @@ out: return ret; } -static int nvme_nvm_get_bb_tbl(struct request_queue *q, struct ppa_addr ppa, +static int nvme_nvm_get_bb_tbl(struct nvm_dev *nvmdev, struct ppa_addr ppa, int nr_blocks, nvm_bb_update_fn *update_bbtbl, void *priv) { + struct request_queue *q = nvmdev->q; struct nvme_ns *ns = q->queuedata; struct nvme_dev *dev = ns->dev; struct nvme_nvm_command c = {}; @@ -402,6 +403,7 @@ static int nvme_nvm_get_bb_tbl(struct request_queue *q, struct ppa_addr ppa, goto out; } + ppa = dev_to_generic_addr(nvmdev, ppa); ret = update_bbtbl(ppa, nr_blocks, bb_tbl->blk, priv); if (ret) { ret = -EINTR; diff --git a/include/linux/lightnvm.h b/include/linux/lightnvm.h index 3db5552b17d5..c6916aec43b6 100644 --- a/include/linux/lightnvm.h +++ b/include/linux/lightnvm.h @@ -179,7 +179,7 @@ typedef int (nvm_bb_update_fn)(struct ppa_addr, int, u8 *, void *); typedef int (nvm_id_fn)(struct request_queue *, struct nvm_id *); typedef int (nvm_get_l2p_tbl_fn)(struct request_queue *, u64, u32, nvm_l2p_update_fn *, void *); -typedef int (nvm_op_bb_tbl_fn)(struct request_queue *, struct ppa_addr, int, +typedef int (nvm_op_bb_tbl_fn)(struct nvm_dev *, struct ppa_addr, int, nvm_bb_update_fn *, void *); typedef int (nvm_op_set_bb_fn)(struct request_queue *, struct nvm_rq *, int); typedef int (nvm_submit_io_fn)(struct request_queue *, struct nvm_rq *); -- cgit v1.2.3-71-gd317 From bf4e6b4e757488dee1b6a581f49c7ac34cd217f8 Mon Sep 17 00:00:00 2001 From: Hannes Reinecke Date: Thu, 26 Nov 2015 08:46:57 +0100 Subject: block: Always check queue limits for cloned requests When a cloned request is retried on other queues it always needs to be checked against the queue limits of that queue. Otherwise the calculations for nr_phys_segments might be wrong, leading to a crash in scsi_init_sgtable(). To clarify this the patch renames blk_rq_check_limits() to blk_cloned_rq_check_limits() and removes the symbol export, as the new function should only be used for cloned requests and never exported. 
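From a stacking driver's point of view, the path this protects is the retry dispatch; a simplified sketch is below ('new_bdev' and 'requeue_clone' are illustrative names, not code from this patch):

/* Request-based dm retrying a clone on another underlying queue:
 * blk_insert_cloned_request() now re-checks the clone against that
 * queue's limits before insertion.
 */
ret = blk_insert_cloned_request(bdev_get_queue(new_bdev), clone);
if (ret == -EIO)
	requeue_clone(clone);	/* over the new queue's limits; fail or
				 * requeue instead of crashing later in
				 * scsi_init_sgtable() */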
Cc: Mike Snitzer Cc: Ewan Milne Cc: Jeff Moyer Signed-off-by: Hannes Reinecke Fixes: e2a60da74 ("block: Clean up special command handling logic") Cc: stable@vger.kernel.org # 3.7+ Acked-by: Mike Snitzer Signed-off-by: Jens Axboe --- block/blk-core.c | 21 +++++++-------------- include/linux/blkdev.h | 1 - 2 files changed, 7 insertions(+), 15 deletions(-) (limited to 'include/linux') diff --git a/block/blk-core.c b/block/blk-core.c index 5131993b23a1..a0af4043dda2 100644 --- a/block/blk-core.c +++ b/block/blk-core.c @@ -2114,7 +2114,8 @@ blk_qc_t submit_bio(int rw, struct bio *bio) EXPORT_SYMBOL(submit_bio); /** - * blk_rq_check_limits - Helper function to check a request for the queue limit + * blk_cloned_rq_check_limits - Helper function to check a cloned request + * for new the queue limits * @q: the queue * @rq: the request being checked * @@ -2125,20 +2126,13 @@ EXPORT_SYMBOL(submit_bio); * after it is inserted to @q, it should be checked against @q before * the insertion using this generic function. * - * This function should also be useful for request stacking drivers - * in some cases below, so export this function. * Request stacking drivers like request-based dm may change the queue - * limits while requests are in the queue (e.g. dm's table swapping). - * Such request stacking drivers should check those requests against - * the new queue limits again when they dispatch those requests, - * although such checkings are also done against the old queue limits - * when submitting requests. + * limits when retrying requests on other queues. Those requests need + * to be checked against the new queue limits again during dispatch. */ -int blk_rq_check_limits(struct request_queue *q, struct request *rq) +static int blk_cloned_rq_check_limits(struct request_queue *q, + struct request *rq) { - if (!rq_mergeable(rq)) - return 0; - if (blk_rq_sectors(rq) > blk_queue_get_max_sectors(q, rq->cmd_flags)) { printk(KERN_ERR "%s: over max size limit.\n", __func__); return -EIO; @@ -2158,7 +2152,6 @@ int blk_rq_check_limits(struct request_queue *q, struct request *rq) return 0; } -EXPORT_SYMBOL_GPL(blk_rq_check_limits); /** * blk_insert_cloned_request - Helper for stacking drivers to submit a request @@ -2170,7 +2163,7 @@ int blk_insert_cloned_request(struct request_queue *q, struct request *rq) unsigned long flags; int where = ELEVATOR_INSERT_BACK; - if (blk_rq_check_limits(q, rq)) + if (blk_cloned_rq_check_limits(q, rq)) return -EIO; if (rq->rq_disk && diff --git a/include/linux/blkdev.h b/include/linux/blkdev.h index c0d2b7927c1f..c06f8eaa42ff 100644 --- a/include/linux/blkdev.h +++ b/include/linux/blkdev.h @@ -773,7 +773,6 @@ extern void blk_rq_set_block_pc(struct request *); extern void blk_requeue_request(struct request_queue *, struct request *); extern void blk_add_request_payload(struct request *rq, struct page *page, unsigned int len); -extern int blk_rq_check_limits(struct request_queue *q, struct request *rq); extern int blk_lld_busy(struct request_queue *q); extern int blk_rq_prep_clone(struct request *rq, struct request *rq_src, struct bio_set *bs, gfp_t gfp_mask, -- cgit v1.2.3-71-gd317 From 880621c2605b82eb5af91a2c94223df6f5a3fb64 Mon Sep 17 00:00:00 2001 From: Martin Blumenstingl Date: Sun, 22 Nov 2015 17:46:09 +0100 Subject: packet: Allow packets with only a header (but no payload) Commit 9c7077622dd91 ("packet: make packet_snd fail on len smaller than l2 header") added validation for the packet size in packet_snd. 
This change enforces that every packet needs a header (with at least hard_header_len bytes) plus a payload with at least one byte. Before this change the payload was optional. This fixes PPPoE connections which do not have a "Service" or "Host-Uniq" configured (which is violating the spec, but is still widely used in real-world setups). Those are currently failing with the following message: "pppd: packet size is too short (24 <= 24)" Signed-off-by: Martin Blumenstingl Signed-off-by: David S. Miller --- include/linux/netdevice.h | 3 ++- net/packet/af_packet.c | 4 ++-- 2 files changed, 4 insertions(+), 3 deletions(-) (limited to 'include/linux') diff --git a/include/linux/netdevice.h b/include/linux/netdevice.h index 67bfac1abfc1..3b5d134e945a 100644 --- a/include/linux/netdevice.h +++ b/include/linux/netdevice.h @@ -1398,7 +1398,8 @@ enum netdev_priv_flags { * @dma: DMA channel * @mtu: Interface MTU value * @type: Interface hardware type - * @hard_header_len: Hardware header length + * @hard_header_len: Hardware header length, which means that this is the + * minimum size of a packet. * * @needed_headroom: Extra headroom the hardware may need, but not in all * cases can this be guaranteed diff --git a/net/packet/af_packet.c b/net/packet/af_packet.c index 1cf928fb573e..992396aa635c 100644 --- a/net/packet/af_packet.c +++ b/net/packet/af_packet.c @@ -2329,8 +2329,8 @@ static void tpacket_destruct_skb(struct sk_buff *skb) static bool ll_header_truncated(const struct net_device *dev, int len) { /* net device doesn't like empty head */ - if (unlikely(len <= dev->hard_header_len)) { - net_warn_ratelimited("%s: packet size is too short (%d <= %d)\n", + if (unlikely(len < dev->hard_header_len)) { + net_warn_ratelimited("%s: packet size is too short (%d < %d)\n", current->comm, len, dev->hard_header_len); return true; } -- cgit v1.2.3-71-gd317 From 1ce0bf50ae2233c7115a18c0c623662d177b434c Mon Sep 17 00:00:00 2001 From: Herbert Xu Date: Thu, 26 Nov 2015 13:55:39 +0800 Subject: net: Generalise wq_has_sleeper helper The memory barrier in the helper wq_has_sleeper is needed by just about every user of waitqueue_active. This patch generalises it by making it take a wait_queue_head_t directly. The existing helper is renamed to skwq_has_sleeper. Signed-off-by: Herbert Xu Signed-off-by: David S. 
Miller --- crypto/algif_aead.c | 4 ++-- crypto/algif_skcipher.c | 4 ++-- include/linux/wait.h | 21 +++++++++++++++++++++ include/net/sock.h | 15 +++++---------- net/atm/common.c | 4 ++-- net/core/sock.c | 8 ++++---- net/core/stream.c | 2 +- net/dccp/output.c | 2 +- net/iucv/af_iucv.c | 2 +- net/rxrpc/af_rxrpc.c | 2 +- net/sctp/socket.c | 2 +- net/tipc/socket.c | 4 ++-- net/unix/af_unix.c | 2 +- 13 files changed, 44 insertions(+), 28 deletions(-) (limited to 'include/linux') diff --git a/crypto/algif_aead.c b/crypto/algif_aead.c index 0aa6fdfb448a..fb99f30849d2 100644 --- a/crypto/algif_aead.c +++ b/crypto/algif_aead.c @@ -106,7 +106,7 @@ static void aead_wmem_wakeup(struct sock *sk) rcu_read_lock(); wq = rcu_dereference(sk->sk_wq); - if (wq_has_sleeper(wq)) + if (skwq_has_sleeper(wq)) wake_up_interruptible_sync_poll(&wq->wait, POLLIN | POLLRDNORM | POLLRDBAND); @@ -157,7 +157,7 @@ static void aead_data_wakeup(struct sock *sk) rcu_read_lock(); wq = rcu_dereference(sk->sk_wq); - if (wq_has_sleeper(wq)) + if (skwq_has_sleeper(wq)) wake_up_interruptible_sync_poll(&wq->wait, POLLOUT | POLLRDNORM | POLLRDBAND); diff --git a/crypto/algif_skcipher.c b/crypto/algif_skcipher.c index af31a0ee4057..0e6702e41472 100644 --- a/crypto/algif_skcipher.c +++ b/crypto/algif_skcipher.c @@ -238,7 +238,7 @@ static void skcipher_wmem_wakeup(struct sock *sk) rcu_read_lock(); wq = rcu_dereference(sk->sk_wq); - if (wq_has_sleeper(wq)) + if (skwq_has_sleeper(wq)) wake_up_interruptible_sync_poll(&wq->wait, POLLIN | POLLRDNORM | POLLRDBAND); @@ -288,7 +288,7 @@ static void skcipher_data_wakeup(struct sock *sk) rcu_read_lock(); wq = rcu_dereference(sk->sk_wq); - if (wq_has_sleeper(wq)) + if (skwq_has_sleeper(wq)) wake_up_interruptible_sync_poll(&wq->wait, POLLOUT | POLLRDNORM | POLLRDBAND); diff --git a/include/linux/wait.h b/include/linux/wait.h index 1e1bf9f963a9..6aa09a875fbd 100644 --- a/include/linux/wait.h +++ b/include/linux/wait.h @@ -107,6 +107,27 @@ static inline int waitqueue_active(wait_queue_head_t *q) return !list_empty(&q->task_list); } +/** + * wq_has_sleeper - check if there are any waiting processes + * @wq: wait queue head + * + * Returns true if wq has waiting processes + * + * Please refer to the comment for waitqueue_active. + */ +static inline bool wq_has_sleeper(wait_queue_head_t *wq) +{ + /* + * We need to be sure we are in sync with the + * add_wait_queue modifications to the wait queue. + * + * This memory barrier should be paired with one on the + * waiting side. + */ + smp_mb(); + return waitqueue_active(wq); +} + extern void add_wait_queue(wait_queue_head_t *q, wait_queue_t *wait); extern void add_wait_queue_exclusive(wait_queue_head_t *q, wait_queue_t *wait); extern void remove_wait_queue(wait_queue_head_t *q, wait_queue_t *wait); diff --git a/include/net/sock.h b/include/net/sock.h index 7f89e4ba18d1..62d35afcb3ac 100644 --- a/include/net/sock.h +++ b/include/net/sock.h @@ -58,6 +58,7 @@ #include #include #include +#include #include #include @@ -1879,12 +1880,12 @@ static inline bool sk_has_allocations(const struct sock *sk) } /** - * wq_has_sleeper - check if there are any waiting processes + * skwq_has_sleeper - check if there are any waiting processes * @wq: struct socket_wq * * Returns true if socket_wq has waiting processes * - * The purpose of the wq_has_sleeper and sock_poll_wait is to wrap the memory + * The purpose of the skwq_has_sleeper and sock_poll_wait is to wrap the memory * barrier call. They were added due to the race found within the tcp code. 
* * Consider following tcp code paths: @@ -1910,15 +1911,9 @@ static inline bool sk_has_allocations(const struct sock *sk) * data on the socket. * */ -static inline bool wq_has_sleeper(struct socket_wq *wq) +static inline bool skwq_has_sleeper(struct socket_wq *wq) { - /* We need to be sure we are in sync with the - * add_wait_queue modifications to the wait queue. - * - * This memory barrier is paired in the sock_poll_wait. - */ - smp_mb(); - return wq && waitqueue_active(&wq->wait); + return wq && wq_has_sleeper(&wq->wait); } /** diff --git a/net/atm/common.c b/net/atm/common.c index 49a872db7e42..6dc12305799e 100644 --- a/net/atm/common.c +++ b/net/atm/common.c @@ -96,7 +96,7 @@ static void vcc_def_wakeup(struct sock *sk) rcu_read_lock(); wq = rcu_dereference(sk->sk_wq); - if (wq_has_sleeper(wq)) + if (skwq_has_sleeper(wq)) wake_up(&wq->wait); rcu_read_unlock(); } @@ -117,7 +117,7 @@ static void vcc_write_space(struct sock *sk) if (vcc_writable(sk)) { wq = rcu_dereference(sk->sk_wq); - if (wq_has_sleeper(wq)) + if (skwq_has_sleeper(wq)) wake_up_interruptible(&wq->wait); sk_wake_async(sk, SOCK_WAKE_SPACE, POLL_OUT); diff --git a/net/core/sock.c b/net/core/sock.c index 1e4dd54bfb5a..2769bd3a4d7c 100644 --- a/net/core/sock.c +++ b/net/core/sock.c @@ -2283,7 +2283,7 @@ static void sock_def_wakeup(struct sock *sk) rcu_read_lock(); wq = rcu_dereference(sk->sk_wq); - if (wq_has_sleeper(wq)) + if (skwq_has_sleeper(wq)) wake_up_interruptible_all(&wq->wait); rcu_read_unlock(); } @@ -2294,7 +2294,7 @@ static void sock_def_error_report(struct sock *sk) rcu_read_lock(); wq = rcu_dereference(sk->sk_wq); - if (wq_has_sleeper(wq)) + if (skwq_has_sleeper(wq)) wake_up_interruptible_poll(&wq->wait, POLLERR); sk_wake_async(sk, SOCK_WAKE_IO, POLL_ERR); rcu_read_unlock(); @@ -2306,7 +2306,7 @@ static void sock_def_readable(struct sock *sk) rcu_read_lock(); wq = rcu_dereference(sk->sk_wq); - if (wq_has_sleeper(wq)) + if (skwq_has_sleeper(wq)) wake_up_interruptible_sync_poll(&wq->wait, POLLIN | POLLPRI | POLLRDNORM | POLLRDBAND); sk_wake_async(sk, SOCK_WAKE_WAITD, POLL_IN); @@ -2324,7 +2324,7 @@ static void sock_def_write_space(struct sock *sk) */ if ((atomic_read(&sk->sk_wmem_alloc) << 1) <= sk->sk_sndbuf) { wq = rcu_dereference(sk->sk_wq); - if (wq_has_sleeper(wq)) + if (skwq_has_sleeper(wq)) wake_up_interruptible_sync_poll(&wq->wait, POLLOUT | POLLWRNORM | POLLWRBAND); diff --git a/net/core/stream.c b/net/core/stream.c index d70f77a0c889..8ff9d63b4265 100644 --- a/net/core/stream.c +++ b/net/core/stream.c @@ -35,7 +35,7 @@ void sk_stream_write_space(struct sock *sk) rcu_read_lock(); wq = rcu_dereference(sk->sk_wq); - if (wq_has_sleeper(wq)) + if (skwq_has_sleeper(wq)) wake_up_interruptible_poll(&wq->wait, POLLOUT | POLLWRNORM | POLLWRBAND); if (wq && wq->fasync_list && !(sk->sk_shutdown & SEND_SHUTDOWN)) diff --git a/net/dccp/output.c b/net/dccp/output.c index 4ce912e691d0..b66c84db0766 100644 --- a/net/dccp/output.c +++ b/net/dccp/output.c @@ -201,7 +201,7 @@ void dccp_write_space(struct sock *sk) rcu_read_lock(); wq = rcu_dereference(sk->sk_wq); - if (wq_has_sleeper(wq)) + if (skwq_has_sleeper(wq)) wake_up_interruptible(&wq->wait); /* Should agree with poll, otherwise some programs break */ if (sock_writeable(sk)) diff --git a/net/iucv/af_iucv.c b/net/iucv/af_iucv.c index fcb2752419c6..4f0aa91470c6 100644 --- a/net/iucv/af_iucv.c +++ b/net/iucv/af_iucv.c @@ -303,7 +303,7 @@ static void iucv_sock_wake_msglim(struct sock *sk) rcu_read_lock(); wq = rcu_dereference(sk->sk_wq); - if (wq_has_sleeper(wq)) + if 
(skwq_has_sleeper(wq)) wake_up_interruptible_all(&wq->wait); sk_wake_async(sk, SOCK_WAKE_SPACE, POLL_OUT); rcu_read_unlock(); diff --git a/net/rxrpc/af_rxrpc.c b/net/rxrpc/af_rxrpc.c index 1f8a144a5dc2..7e2d1057d8bc 100644 --- a/net/rxrpc/af_rxrpc.c +++ b/net/rxrpc/af_rxrpc.c @@ -67,7 +67,7 @@ static void rxrpc_write_space(struct sock *sk) if (rxrpc_writable(sk)) { struct socket_wq *wq = rcu_dereference(sk->sk_wq); - if (wq_has_sleeper(wq)) + if (skwq_has_sleeper(wq)) wake_up_interruptible(&wq->wait); sk_wake_async(sk, SOCK_WAKE_SPACE, POLL_OUT); } diff --git a/net/sctp/socket.c b/net/sctp/socket.c index 897c01c029ca..ec10b66354b8 100644 --- a/net/sctp/socket.c +++ b/net/sctp/socket.c @@ -6978,7 +6978,7 @@ void sctp_data_ready(struct sock *sk) rcu_read_lock(); wq = rcu_dereference(sk->sk_wq); - if (wq_has_sleeper(wq)) + if (skwq_has_sleeper(wq)) wake_up_interruptible_sync_poll(&wq->wait, POLLIN | POLLRDNORM | POLLRDBAND); sk_wake_async(sk, SOCK_WAKE_WAITD, POLL_IN); diff --git a/net/tipc/socket.c b/net/tipc/socket.c index 552dbaba9cf3..525acf6dd1c6 100644 --- a/net/tipc/socket.c +++ b/net/tipc/socket.c @@ -1492,7 +1492,7 @@ static void tipc_write_space(struct sock *sk) rcu_read_lock(); wq = rcu_dereference(sk->sk_wq); - if (wq_has_sleeper(wq)) + if (skwq_has_sleeper(wq)) wake_up_interruptible_sync_poll(&wq->wait, POLLOUT | POLLWRNORM | POLLWRBAND); rcu_read_unlock(); @@ -1509,7 +1509,7 @@ static void tipc_data_ready(struct sock *sk) rcu_read_lock(); wq = rcu_dereference(sk->sk_wq); - if (wq_has_sleeper(wq)) + if (skwq_has_sleeper(wq)) wake_up_interruptible_sync_poll(&wq->wait, POLLIN | POLLRDNORM | POLLRDBAND); rcu_read_unlock(); diff --git a/net/unix/af_unix.c b/net/unix/af_unix.c index 955ec152cb71..efb706e1d1c0 100644 --- a/net/unix/af_unix.c +++ b/net/unix/af_unix.c @@ -339,7 +339,7 @@ static void unix_write_space(struct sock *sk) rcu_read_lock(); if (unix_writable(sk)) { wq = rcu_dereference(sk->sk_wq); - if (wq_has_sleeper(wq)) + if (skwq_has_sleeper(wq)) wake_up_interruptible_sync_poll(&wq->wait, POLLOUT | POLLWRNORM | POLLWRBAND); sk_wake_async(sk, SOCK_WAKE_SPACE, POLL_OUT); -- cgit v1.2.3-71-gd317 From 06bd6c0370bb88a2256c6763a32bc4e4ade06521 Mon Sep 17 00:00:00 2001 From: Nikolay Aleksandrov Date: Thu, 26 Nov 2015 15:23:45 +0100 Subject: net: ipmr: remove unused MFC_NOTIFY flag and make the flags enum MFC_NOTIFY was introduced in kernel 2.1.68 but afaik it hasn't been used and I couldn't find any users currently so just remove it. Only MFC_STATIC is left, so move it into an enum, add a description and use BIT(). Signed-off-by: Nikolay Aleksandrov Signed-off-by: David S. 
Miller --- include/linux/mroute.h | 10 +++++++--- net/ipv4/ipmr.c | 2 -- 2 files changed, 7 insertions(+), 5 deletions(-) (limited to 'include/linux') diff --git a/include/linux/mroute.h b/include/linux/mroute.h index 79aaa9fc1a15..fa66ebc1fed6 100644 --- a/include/linux/mroute.h +++ b/include/linux/mroute.h @@ -64,6 +64,13 @@ struct vif_device { #define VIFF_STATIC 0x8000 +/* mfc_flags: + * MFC_STATIC - the entry was added statically (not by a routing daemon) + */ +enum { + MFC_STATIC = BIT(0), +}; + struct mfc_cache { struct list_head list; __be32 mfc_mcastgrp; /* Group the entry belongs to */ @@ -89,9 +96,6 @@ struct mfc_cache { struct rcu_head rcu; }; -#define MFC_STATIC 1 -#define MFC_NOTIFY 2 - #define MFC_LINES 64 #ifdef __BIG_ENDIAN diff --git a/net/ipv4/ipmr.c b/net/ipv4/ipmr.c index a2d248d9c35c..a74e61883b8f 100644 --- a/net/ipv4/ipmr.c +++ b/net/ipv4/ipmr.c @@ -2199,8 +2199,6 @@ int ipmr_get_route(struct net *net, struct sk_buff *skb, } read_lock(&mrt_lock); - if (!nowait && (rtm->rtm_flags & RTM_F_NOTIFY)) - cache->mfc_flags |= MFC_NOTIFY; err = __ipmr_fill_mroute(mrt, skb, cache, rtm); read_unlock(&mrt_lock); rcu_read_unlock(); -- cgit v1.2.3-71-gd317 From 520191bb404c4b7b4cdb70a5480ada974b0c2d60 Mon Sep 17 00:00:00 2001 From: Nikolay Aleksandrov Date: Thu, 26 Nov 2015 15:23:46 +0100 Subject: net: ipmr: adjust mroute.h style and drop extern Remove extra spaces and tabs, adjust function definitions, remove an unnecessary ifdef (already used below, just move code) and drop extern from the functions. Signed-off-by: Nikolay Aleksandrov Signed-off-by: David S. Miller --- include/linux/mroute.h | 47 +++++++++++++++++++++-------------------------- 1 file changed, 21 insertions(+), 26 deletions(-) (limited to 'include/linux') diff --git a/include/linux/mroute.h b/include/linux/mroute.h index fa66ebc1fed6..7c567a2679ce 100644 --- a/include/linux/mroute.h +++ b/include/linux/mroute.h @@ -9,38 +9,28 @@ #ifdef CONFIG_IP_MROUTE static inline int ip_mroute_opt(int opt) { - return (opt >= MRT_BASE) && (opt <= MRT_MAX); + return opt >= MRT_BASE && opt <= MRT_MAX; } -#else -static inline int ip_mroute_opt(int opt) -{ - return 0; -} -#endif -#ifdef CONFIG_IP_MROUTE -extern int ip_mroute_setsockopt(struct sock *, int, char __user *, unsigned int); -extern int ip_mroute_getsockopt(struct sock *, int, char __user *, int __user *); -extern int ipmr_ioctl(struct sock *sk, int cmd, void __user *arg); -extern int ipmr_compat_ioctl(struct sock *sk, unsigned int cmd, void __user *arg); -extern int ip_mr_init(void); +int ip_mroute_setsockopt(struct sock *, int, char __user *, unsigned int); +int ip_mroute_getsockopt(struct sock *, int, char __user *, int __user *); +int ipmr_ioctl(struct sock *sk, int cmd, void __user *arg); +int ipmr_compat_ioctl(struct sock *sk, unsigned int cmd, void __user *arg); +int ip_mr_init(void); #else -static inline -int ip_mroute_setsockopt(struct sock *sock, - int optname, char __user *optval, unsigned int optlen) +static inline int ip_mroute_setsockopt(struct sock *sock, int optname, + char __user *optval, unsigned int optlen) { return -ENOPROTOOPT; } -static inline -int ip_mroute_getsockopt(struct sock *sock, - int optname, char __user *optval, int __user *optlen) +static inline int ip_mroute_getsockopt(struct sock *sock, int optname, + char __user *optval, int __user *optlen) { return -ENOPROTOOPT; } -static inline -int ipmr_ioctl(struct sock *sk, int cmd, void __user *arg) +static inline int ipmr_ioctl(struct sock *sk, int cmd, void __user *arg) { return -ENOIOCTLCMD; } 
@@ -49,6 +39,11 @@ static inline int ip_mr_init(void) { return 0; } + +static inline int ip_mroute_opt(int opt) +{ + return 0; +} #endif struct vif_device { @@ -96,16 +91,16 @@ struct mfc_cache { struct rcu_head rcu; }; -#define MFC_LINES 64 +#define MFC_LINES 64 #ifdef __BIG_ENDIAN #define MFC_HASH(a,b) (((((__force u32)(__be32)a)>>24)^(((__force u32)(__be32)b)>>26))&(MFC_LINES-1)) #else #define MFC_HASH(a,b) ((((__force u32)(__be32)a)^(((__force u32)(__be32)b)>>2))&(MFC_LINES-1)) -#endif +#endif struct rtmsg; -extern int ipmr_get_route(struct net *net, struct sk_buff *skb, - __be32 saddr, __be32 daddr, - struct rtmsg *rtm, int nowait); +int ipmr_get_route(struct net *net, struct sk_buff *skb, + __be32 saddr, __be32 daddr, + struct rtmsg *rtm, int nowait); #endif -- cgit v1.2.3-71-gd317 From 5ea1f13299d8b8edcb2969eda4c81f8e3264b706 Mon Sep 17 00:00:00 2001 From: Nikolay Aleksandrov Date: Thu, 26 Nov 2015 15:23:47 +0100 Subject: net: ipmr: move struct mr_table and VIF_EXISTS to mroute.h Move the definitions of VIF_EXISTS() and struct mr_table to mroute.h Signed-off-by: Nikolay Aleksandrov Signed-off-by: David S. Miller --- include/linux/mroute.h | 21 +++++++++++++++++++-- net/ipv4/ipmr.c | 18 ------------------ 2 files changed, 19 insertions(+), 20 deletions(-) (limited to 'include/linux') diff --git a/include/linux/mroute.h b/include/linux/mroute.h index 7c567a2679ce..bf9b322cb0b0 100644 --- a/include/linux/mroute.h +++ b/include/linux/mroute.h @@ -59,6 +59,25 @@ struct vif_device { #define VIFF_STATIC 0x8000 +#define VIF_EXISTS(_mrt, _idx) ((_mrt)->vif_table[_idx].dev != NULL) +#define MFC_LINES 64 + +struct mr_table { + struct list_head list; + possible_net_t net; + u32 id; + struct sock __rcu *mroute_sk; + struct timer_list ipmr_expire_timer; + struct list_head mfc_unres_queue; + struct list_head mfc_cache_array[MFC_LINES]; + struct vif_device vif_table[MAXVIFS]; + int maxvif; + atomic_t cache_resolve_queue_len; + bool mroute_do_assert; + bool mroute_do_pim; + int mroute_reg_vif_num; +}; + /* mfc_flags: * MFC_STATIC - the entry was added statically (not by a routing daemon) */ @@ -91,8 +110,6 @@ struct mfc_cache { struct rcu_head rcu; }; -#define MFC_LINES 64 - #ifdef __BIG_ENDIAN #define MFC_HASH(a,b) (((((__force u32)(__be32)a)>>24)^(((__force u32)(__be32)b)>>26))&(MFC_LINES-1)) #else diff --git a/net/ipv4/ipmr.c b/net/ipv4/ipmr.c index a74e61883b8f..ff3dbbb9f11c 100644 --- a/net/ipv4/ipmr.c +++ b/net/ipv4/ipmr.c @@ -67,22 +67,6 @@ #include #include -struct mr_table { - struct list_head list; - possible_net_t net; - u32 id; - struct sock __rcu *mroute_sk; - struct timer_list ipmr_expire_timer; - struct list_head mfc_unres_queue; - struct list_head mfc_cache_array[MFC_LINES]; - struct vif_device vif_table[MAXVIFS]; - int maxvif; - atomic_t cache_resolve_queue_len; - bool mroute_do_assert; - bool mroute_do_pim; - int mroute_reg_vif_num; -}; - struct ipmr_rule { struct fib_rule common; }; @@ -104,8 +88,6 @@ static DEFINE_RWLOCK(mrt_lock); /* Multicast router control variables */ -#define VIF_EXISTS(_mrt, _idx) ((_mrt)->vif_table[_idx].dev != NULL) - /* Special spinlock for queue of unresolved entries */ static DEFINE_SPINLOCK(mfc_unres_lock); -- cgit v1.2.3-71-gd317 From 1973a4ea6ceaa47671227c3077f90508ea30897b Mon Sep 17 00:00:00 2001 From: Nikolay Aleksandrov Date: Thu, 26 Nov 2015 15:23:48 +0100 Subject: net: ipmr: move pimsm_enabled to pim.h and rename Move the inline pimsm_enabled() to pim.h and rename it to ipmr_pimsm_enabled to show it's for the ipv4 ipmr code since pim.h is used 
by IPv6 too. Signed-off-by: Nikolay Aleksandrov Signed-off-by: David S. Miller --- include/linux/pim.h | 5 +++++ net/ipv4/ipmr.c | 11 +++-------- 2 files changed, 8 insertions(+), 8 deletions(-) (limited to 'include/linux') diff --git a/include/linux/pim.h b/include/linux/pim.h index 252bf6644c51..e1d756f81348 100644 --- a/include/linux/pim.h +++ b/include/linux/pim.h @@ -13,6 +13,11 @@ #define PIM_NULL_REGISTER cpu_to_be32(0x40000000) +static inline bool ipmr_pimsm_enabled(void) +{ + return IS_BUILTIN(CONFIG_IP_PIMSM_V1) || IS_BUILTIN(CONFIG_IP_PIMSM_V2); +} + /* PIMv2 register message header layout (ietf-draft-idmr-pimvsm-v2-00.ps */ struct pimreghdr { diff --git a/net/ipv4/ipmr.c b/net/ipv4/ipmr.c index ff3dbbb9f11c..322fdc6ac75b 100644 --- a/net/ipv4/ipmr.c +++ b/net/ipv4/ipmr.c @@ -75,11 +75,6 @@ struct ipmr_result { struct mr_table *mrt; }; -static inline bool pimsm_enabled(void) -{ - return IS_BUILTIN(CONFIG_IP_PIMSM_V1) || IS_BUILTIN(CONFIG_IP_PIMSM_V2); -} - /* Big lock, protecting vif table, mrt cache and mroute socket state. * Note that the changes are semaphored via rtnl_lock. */ @@ -751,7 +746,7 @@ static int vif_add(struct net *net, struct mr_table *mrt, switch (vifc->vifc_flags) { case VIFF_REGISTER: - if (!pimsm_enabled()) + if (!ipmr_pimsm_enabled()) return -EINVAL; /* Special Purpose VIF in PIM * All the packets will be sent to the daemon @@ -1377,7 +1372,7 @@ int ip_mroute_setsockopt(struct sock *sk, int optname, char __user *optval, mrt->mroute_do_assert = val; break; case MRT_PIM: - if (!pimsm_enabled()) { + if (!ipmr_pimsm_enabled()) { ret = -ENOPROTOOPT; break; } @@ -1451,7 +1446,7 @@ int ip_mroute_getsockopt(struct sock *sk, int optname, char __user *optval, int val = 0x0305; break; case MRT_PIM: - if (!pimsm_enabled()) + if (!ipmr_pimsm_enabled()) return -ENOPROTOOPT; val = mrt->mroute_do_pim; break; -- cgit v1.2.3-71-gd317
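The helper being moved is also a small illustration of compile-time config gating: IS_BUILTIN(CONFIG_FOO) expands to a constant 1 only when the option is built in (=y), so dead branches in callers are discarded by the compiler. A hedged sketch with a hypothetical config symbol:

/* Sketch only: CONFIG_FOO stands for any boolean Kconfig symbol.
 * IS_BUILTIN() resolves at compile time, so the error path below
 * vanishes entirely in builds that have the feature built in. */
static inline bool foo_enabled(void)
{
	return IS_BUILTIN(CONFIG_FOO);
}

static int foo_configure(void)
{
	if (!foo_enabled())
		return -EINVAL;
	/* ... feature-specific setup ... */
	return 0;
}

From 9cd3e072b0be17446e37d7414eac8a3499e0601e Mon Sep 17 00:00:00 2001 From: Eric Dumazet Date: Sun, 29 Nov 2015 20:03:10 -0800 Subject: net: rename SOCK_ASYNC_NOSPACE and SOCK_ASYNC_WAITDATA This patch is a cleanup to make the following patch easier to review. The goal is to move SOCK_ASYNC_NOSPACE and SOCK_ASYNC_WAITDATA from (struct socket)->flags to (struct socket_wq)->flags to benefit from RCU protection in sock_wake_async(). To ease backports, we rename both constants. Two new helpers, sk_set_bit(int nr, struct sock *sk) and sk_clear_bit(int nr, struct sock *sk), are added so that the following patch can change their implementation. Signed-off-by: Eric Dumazet Signed-off-by: David S.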
Miller --- crypto/algif_aead.c | 4 ++-- crypto/algif_skcipher.c | 6 +++--- drivers/net/macvtap.c | 4 ++-- drivers/net/tun.c | 4 ++-- fs/dlm/lowcomms.c | 4 ++-- include/linux/net.h | 6 +++--- include/net/sock.h | 10 ++++++++++ net/bluetooth/af_bluetooth.c | 6 +++--- net/caif/caif_socket.c | 4 ++-- net/core/datagram.c | 2 +- net/core/sock.c | 8 ++++---- net/core/stream.c | 4 ++-- net/dccp/proto.c | 3 +-- net/decnet/af_decnet.c | 8 ++++---- net/ipv4/tcp.c | 7 +++---- net/iucv/af_iucv.c | 2 +- net/nfc/llcp_sock.c | 2 +- net/rxrpc/ar-output.c | 2 +- net/sctp/socket.c | 2 +- net/socket.c | 4 ++-- net/sunrpc/xprtsock.c | 14 +++++++------- net/unix/af_unix.c | 6 +++--- 22 files changed, 60 insertions(+), 52 deletions(-) (limited to 'include/linux') diff --git a/crypto/algif_aead.c b/crypto/algif_aead.c index 0aa6fdfb448a..6d4d4569447e 100644 --- a/crypto/algif_aead.c +++ b/crypto/algif_aead.c @@ -125,7 +125,7 @@ static int aead_wait_for_data(struct sock *sk, unsigned flags) if (flags & MSG_DONTWAIT) return -EAGAIN; - set_bit(SOCK_ASYNC_WAITDATA, &sk->sk_socket->flags); + sk_set_bit(SOCKWQ_ASYNC_WAITDATA, sk); for (;;) { if (signal_pending(current)) @@ -139,7 +139,7 @@ static int aead_wait_for_data(struct sock *sk, unsigned flags) } finish_wait(sk_sleep(sk), &wait); - clear_bit(SOCK_ASYNC_WAITDATA, &sk->sk_socket->flags); + sk_clear_bit(SOCKWQ_ASYNC_WAITDATA, sk); return err; } diff --git a/crypto/algif_skcipher.c b/crypto/algif_skcipher.c index af31a0ee4057..ca9efe17db1a 100644 --- a/crypto/algif_skcipher.c +++ b/crypto/algif_skcipher.c @@ -212,7 +212,7 @@ static int skcipher_wait_for_wmem(struct sock *sk, unsigned flags) if (flags & MSG_DONTWAIT) return -EAGAIN; - set_bit(SOCK_ASYNC_NOSPACE, &sk->sk_socket->flags); + sk_set_bit(SOCKWQ_ASYNC_NOSPACE, sk); for (;;) { if (signal_pending(current)) @@ -258,7 +258,7 @@ static int skcipher_wait_for_data(struct sock *sk, unsigned flags) return -EAGAIN; } - set_bit(SOCK_ASYNC_WAITDATA, &sk->sk_socket->flags); + sk_set_bit(SOCKWQ_ASYNC_WAITDATA, sk); for (;;) { if (signal_pending(current)) @@ -272,7 +272,7 @@ static int skcipher_wait_for_data(struct sock *sk, unsigned flags) } finish_wait(sk_sleep(sk), &wait); - clear_bit(SOCK_ASYNC_WAITDATA, &sk->sk_socket->flags); + sk_clear_bit(SOCKWQ_ASYNC_WAITDATA, sk); return err; } diff --git a/drivers/net/macvtap.c b/drivers/net/macvtap.c index 54036ae0a388..0fc521941c71 100644 --- a/drivers/net/macvtap.c +++ b/drivers/net/macvtap.c @@ -498,7 +498,7 @@ static void macvtap_sock_write_space(struct sock *sk) wait_queue_head_t *wqueue; if (!sock_writeable(sk) || - !test_and_clear_bit(SOCK_ASYNC_NOSPACE, &sk->sk_socket->flags)) + !test_and_clear_bit(SOCKWQ_ASYNC_NOSPACE, &sk->sk_socket->flags)) return; wqueue = sk_sleep(sk); @@ -585,7 +585,7 @@ static unsigned int macvtap_poll(struct file *file, poll_table * wait) mask |= POLLIN | POLLRDNORM; if (sock_writeable(&q->sk) || - (!test_and_set_bit(SOCK_ASYNC_NOSPACE, &q->sock.flags) && + (!test_and_set_bit(SOCKWQ_ASYNC_NOSPACE, &q->sock.flags) && sock_writeable(&q->sk))) mask |= POLLOUT | POLLWRNORM; diff --git a/drivers/net/tun.c b/drivers/net/tun.c index b1878faea397..f0db770e8b2f 100644 --- a/drivers/net/tun.c +++ b/drivers/net/tun.c @@ -1040,7 +1040,7 @@ static unsigned int tun_chr_poll(struct file *file, poll_table *wait) mask |= POLLIN | POLLRDNORM; if (sock_writeable(sk) || - (!test_and_set_bit(SOCK_ASYNC_NOSPACE, &sk->sk_socket->flags) && + (!test_and_set_bit(SOCKWQ_ASYNC_NOSPACE, &sk->sk_socket->flags) && sock_writeable(sk))) mask |= POLLOUT | POLLWRNORM; @@ -1488,7 
+1488,7 @@ static void tun_sock_write_space(struct sock *sk) if (!sock_writeable(sk)) return; - if (!test_and_clear_bit(SOCK_ASYNC_NOSPACE, &sk->sk_socket->flags)) + if (!test_and_clear_bit(SOCKWQ_ASYNC_NOSPACE, &sk->sk_socket->flags)) return; wqueue = sk_sleep(sk); diff --git a/fs/dlm/lowcomms.c b/fs/dlm/lowcomms.c index 87e9d796cf7d..3a37bd3f9637 100644 --- a/fs/dlm/lowcomms.c +++ b/fs/dlm/lowcomms.c @@ -421,7 +421,7 @@ static void lowcomms_write_space(struct sock *sk) if (test_and_clear_bit(CF_APP_LIMITED, &con->flags)) { con->sock->sk->sk_write_pending--; - clear_bit(SOCK_ASYNC_NOSPACE, &con->sock->flags); + clear_bit(SOCKWQ_ASYNC_NOSPACE, &con->sock->flags); } if (!test_and_set_bit(CF_WRITE_PENDING, &con->flags)) @@ -1448,7 +1448,7 @@ static void send_to_sock(struct connection *con) msg_flags); if (ret == -EAGAIN || ret == 0) { if (ret == -EAGAIN && - test_bit(SOCK_ASYNC_NOSPACE, &con->sock->flags) && + test_bit(SOCKWQ_ASYNC_NOSPACE, &con->sock->flags) && !test_and_set_bit(CF_APP_LIMITED, &con->flags)) { /* Notify TCP that we're limited by the * application window size. diff --git a/include/linux/net.h b/include/linux/net.h index 70ac5e28e6b7..f514e4dd5521 100644 --- a/include/linux/net.h +++ b/include/linux/net.h @@ -34,8 +34,8 @@ struct inode; struct file; struct net; -#define SOCK_ASYNC_NOSPACE 0 -#define SOCK_ASYNC_WAITDATA 1 +#define SOCKWQ_ASYNC_NOSPACE 0 +#define SOCKWQ_ASYNC_WAITDATA 1 #define SOCK_NOSPACE 2 #define SOCK_PASSCRED 3 #define SOCK_PASSSEC 4 @@ -96,7 +96,7 @@ struct socket_wq { * struct socket - general BSD socket * @state: socket state (%SS_CONNECTED, etc) * @type: socket type (%SOCK_STREAM, etc) - * @flags: socket flags (%SOCK_ASYNC_NOSPACE, etc) + * @flags: socket flags (%SOCK_NOSPACE, etc) * @ops: protocol specific socket operations * @file: File back pointer for gc * @sk: internal networking protocol agnostic socket representation diff --git a/include/net/sock.h b/include/net/sock.h index 7f89e4ba18d1..c155d09d8af4 100644 --- a/include/net/sock.h +++ b/include/net/sock.h @@ -2005,6 +2005,16 @@ static inline unsigned long sock_wspace(struct sock *sk) return amt; } +static inline void sk_set_bit(int nr, struct sock *sk) +{ + set_bit(nr, &sk->sk_socket->flags); +} + +static inline void sk_clear_bit(int nr, struct sock *sk) +{ + clear_bit(nr, &sk->sk_socket->flags); +} + static inline void sk_wake_async(struct sock *sk, int how, int band) { if (sock_flag(sk, SOCK_FASYNC)) diff --git a/net/bluetooth/af_bluetooth.c b/net/bluetooth/af_bluetooth.c index a3bffd1ec2b4..70306cc9d814 100644 --- a/net/bluetooth/af_bluetooth.c +++ b/net/bluetooth/af_bluetooth.c @@ -271,11 +271,11 @@ static long bt_sock_data_wait(struct sock *sk, long timeo) if (signal_pending(current) || !timeo) break; - set_bit(SOCK_ASYNC_WAITDATA, &sk->sk_socket->flags); + sk_set_bit(SOCKWQ_ASYNC_WAITDATA, sk); release_sock(sk); timeo = schedule_timeout(timeo); lock_sock(sk); - clear_bit(SOCK_ASYNC_WAITDATA, &sk->sk_socket->flags); + sk_clear_bit(SOCKWQ_ASYNC_WAITDATA, sk); } __set_current_state(TASK_RUNNING); @@ -441,7 +441,7 @@ unsigned int bt_sock_poll(struct file *file, struct socket *sock, if (!test_bit(BT_SK_SUSPEND, &bt_sk(sk)->flags) && sock_writeable(sk)) mask |= POLLOUT | POLLWRNORM | POLLWRBAND; else - set_bit(SOCK_ASYNC_NOSPACE, &sk->sk_socket->flags); + sk_set_bit(SOCKWQ_ASYNC_NOSPACE, sk); return mask; } diff --git a/net/caif/caif_socket.c b/net/caif/caif_socket.c index cc858919108e..aa209b1066c9 100644 --- a/net/caif/caif_socket.c +++ b/net/caif/caif_socket.c @@ -323,7 +323,7 @@ static 
long caif_stream_data_wait(struct sock *sk, long timeo) !timeo) break; - set_bit(SOCK_ASYNC_WAITDATA, &sk->sk_socket->flags); + sk_set_bit(SOCKWQ_ASYNC_WAITDATA, sk); release_sock(sk); timeo = schedule_timeout(timeo); lock_sock(sk); @@ -331,7 +331,7 @@ static long caif_stream_data_wait(struct sock *sk, long timeo) if (sock_flag(sk, SOCK_DEAD)) break; - clear_bit(SOCK_ASYNC_WAITDATA, &sk->sk_socket->flags); + sk_clear_bit(SOCKWQ_ASYNC_WAITDATA, sk); } finish_wait(sk_sleep(sk), &wait); diff --git a/net/core/datagram.c b/net/core/datagram.c index 617088aee21d..d62af69ad844 100644 --- a/net/core/datagram.c +++ b/net/core/datagram.c @@ -785,7 +785,7 @@ unsigned int datagram_poll(struct file *file, struct socket *sock, if (sock_writeable(sk)) mask |= POLLOUT | POLLWRNORM | POLLWRBAND; else - set_bit(SOCK_ASYNC_NOSPACE, &sk->sk_socket->flags); + sk_set_bit(SOCKWQ_ASYNC_NOSPACE, sk); return mask; } diff --git a/net/core/sock.c b/net/core/sock.c index 1e4dd54bfb5a..9d79569935a3 100644 --- a/net/core/sock.c +++ b/net/core/sock.c @@ -1815,7 +1815,7 @@ static long sock_wait_for_wmem(struct sock *sk, long timeo) { DEFINE_WAIT(wait); - clear_bit(SOCK_ASYNC_NOSPACE, &sk->sk_socket->flags); + sk_clear_bit(SOCKWQ_ASYNC_NOSPACE, sk); for (;;) { if (!timeo) break; @@ -1861,7 +1861,7 @@ struct sk_buff *sock_alloc_send_pskb(struct sock *sk, unsigned long header_len, if (sk_wmem_alloc_get(sk) < sk->sk_sndbuf) break; - set_bit(SOCK_ASYNC_NOSPACE, &sk->sk_socket->flags); + sk_set_bit(SOCKWQ_ASYNC_NOSPACE, sk); set_bit(SOCK_NOSPACE, &sk->sk_socket->flags); err = -EAGAIN; if (!timeo) @@ -2048,9 +2048,9 @@ int sk_wait_data(struct sock *sk, long *timeo, const struct sk_buff *skb) DEFINE_WAIT(wait); prepare_to_wait(sk_sleep(sk), &wait, TASK_INTERRUPTIBLE); - set_bit(SOCK_ASYNC_WAITDATA, &sk->sk_socket->flags); + sk_set_bit(SOCKWQ_ASYNC_WAITDATA, sk); rc = sk_wait_event(sk, timeo, skb_peek_tail(&sk->sk_receive_queue) != skb); - clear_bit(SOCK_ASYNC_WAITDATA, &sk->sk_socket->flags); + sk_clear_bit(SOCKWQ_ASYNC_WAITDATA, sk); finish_wait(sk_sleep(sk), &wait); return rc; } diff --git a/net/core/stream.c b/net/core/stream.c index d70f77a0c889..43309428644d 100644 --- a/net/core/stream.c +++ b/net/core/stream.c @@ -126,7 +126,7 @@ int sk_stream_wait_memory(struct sock *sk, long *timeo_p) current_timeo = vm_wait = (prandom_u32() % (HZ / 5)) + 2; while (1) { - set_bit(SOCK_ASYNC_NOSPACE, &sk->sk_socket->flags); + sk_set_bit(SOCKWQ_ASYNC_NOSPACE, sk); prepare_to_wait(sk_sleep(sk), &wait, TASK_INTERRUPTIBLE); @@ -139,7 +139,7 @@ int sk_stream_wait_memory(struct sock *sk, long *timeo_p) } if (signal_pending(current)) goto do_interrupted; - clear_bit(SOCK_ASYNC_NOSPACE, &sk->sk_socket->flags); + sk_clear_bit(SOCKWQ_ASYNC_NOSPACE, sk); if (sk_stream_memory_free(sk) && !vm_wait) break; diff --git a/net/dccp/proto.c b/net/dccp/proto.c index b5cf13a28009..41e65804ddf5 100644 --- a/net/dccp/proto.c +++ b/net/dccp/proto.c @@ -339,8 +339,7 @@ unsigned int dccp_poll(struct file *file, struct socket *sock, if (sk_stream_is_writeable(sk)) { mask |= POLLOUT | POLLWRNORM; } else { /* send SIGIO later */ - set_bit(SOCK_ASYNC_NOSPACE, - &sk->sk_socket->flags); + sk_set_bit(SOCKWQ_ASYNC_NOSPACE, sk); set_bit(SOCK_NOSPACE, &sk->sk_socket->flags); /* Race breaker. 
If space is freed after diff --git a/net/decnet/af_decnet.c b/net/decnet/af_decnet.c index 675cf94e04f8..eebf5ac8ce18 100644 --- a/net/decnet/af_decnet.c +++ b/net/decnet/af_decnet.c @@ -1747,9 +1747,9 @@ static int dn_recvmsg(struct socket *sock, struct msghdr *msg, size_t size, } prepare_to_wait(sk_sleep(sk), &wait, TASK_INTERRUPTIBLE); - set_bit(SOCK_ASYNC_WAITDATA, &sk->sk_socket->flags); + sk_set_bit(SOCKWQ_ASYNC_WAITDATA, sk); sk_wait_event(sk, &timeo, dn_data_ready(sk, queue, flags, target)); - clear_bit(SOCK_ASYNC_WAITDATA, &sk->sk_socket->flags); + sk_clear_bit(SOCKWQ_ASYNC_WAITDATA, sk); finish_wait(sk_sleep(sk), &wait); } @@ -2004,10 +2004,10 @@ static int dn_sendmsg(struct socket *sock, struct msghdr *msg, size_t size) } prepare_to_wait(sk_sleep(sk), &wait, TASK_INTERRUPTIBLE); - set_bit(SOCK_ASYNC_WAITDATA, &sk->sk_socket->flags); + sk_set_bit(SOCKWQ_ASYNC_WAITDATA, sk); sk_wait_event(sk, &timeo, !dn_queue_too_long(scp, queue, flags)); - clear_bit(SOCK_ASYNC_WAITDATA, &sk->sk_socket->flags); + sk_clear_bit(SOCKWQ_ASYNC_WAITDATA, sk); finish_wait(sk_sleep(sk), &wait); continue; } diff --git a/net/ipv4/tcp.c b/net/ipv4/tcp.c index c1728771cf89..c82cca18c90f 100644 --- a/net/ipv4/tcp.c +++ b/net/ipv4/tcp.c @@ -517,8 +517,7 @@ unsigned int tcp_poll(struct file *file, struct socket *sock, poll_table *wait) if (sk_stream_is_writeable(sk)) { mask |= POLLOUT | POLLWRNORM; } else { /* send SIGIO later */ - set_bit(SOCK_ASYNC_NOSPACE, - &sk->sk_socket->flags); + sk_set_bit(SOCKWQ_ASYNC_NOSPACE, sk); set_bit(SOCK_NOSPACE, &sk->sk_socket->flags); /* Race breaker. If space is freed after @@ -906,7 +905,7 @@ static ssize_t do_tcp_sendpages(struct sock *sk, struct page *page, int offset, goto out_err; } - clear_bit(SOCK_ASYNC_NOSPACE, &sk->sk_socket->flags); + sk_clear_bit(SOCKWQ_ASYNC_NOSPACE, sk); mss_now = tcp_send_mss(sk, &size_goal, flags); copied = 0; @@ -1134,7 +1133,7 @@ int tcp_sendmsg(struct sock *sk, struct msghdr *msg, size_t size) } /* This should be in poll */ - clear_bit(SOCK_ASYNC_NOSPACE, &sk->sk_socket->flags); + sk_clear_bit(SOCKWQ_ASYNC_NOSPACE, sk); mss_now = tcp_send_mss(sk, &size_goal, flags); diff --git a/net/iucv/af_iucv.c b/net/iucv/af_iucv.c index fcb2752419c6..435608c4306d 100644 --- a/net/iucv/af_iucv.c +++ b/net/iucv/af_iucv.c @@ -1483,7 +1483,7 @@ unsigned int iucv_sock_poll(struct file *file, struct socket *sock, if (sock_writeable(sk) && iucv_below_msglim(sk)) mask |= POLLOUT | POLLWRNORM | POLLWRBAND; else - set_bit(SOCK_ASYNC_NOSPACE, &sk->sk_socket->flags); + sk_set_bit(SOCKWQ_ASYNC_NOSPACE, sk); return mask; } diff --git a/net/nfc/llcp_sock.c b/net/nfc/llcp_sock.c index b7de0da46acd..ecf0a0196f18 100644 --- a/net/nfc/llcp_sock.c +++ b/net/nfc/llcp_sock.c @@ -572,7 +572,7 @@ static unsigned int llcp_sock_poll(struct file *file, struct socket *sock, if (sock_writeable(sk) && sk->sk_state == LLCP_CONNECTED) mask |= POLLOUT | POLLWRNORM | POLLWRBAND; else - set_bit(SOCK_ASYNC_NOSPACE, &sk->sk_socket->flags); + sk_set_bit(SOCKWQ_ASYNC_NOSPACE, sk); pr_debug("mask 0x%x\n", mask); diff --git a/net/rxrpc/ar-output.c b/net/rxrpc/ar-output.c index a40d3afe93b7..14c4e12c47b0 100644 --- a/net/rxrpc/ar-output.c +++ b/net/rxrpc/ar-output.c @@ -531,7 +531,7 @@ static int rxrpc_send_data(struct rxrpc_sock *rx, timeo = sock_sndtimeo(sk, msg->msg_flags & MSG_DONTWAIT); /* this should be in poll */ - clear_bit(SOCK_ASYNC_NOSPACE, &sk->sk_socket->flags); + sk_clear_bit(SOCKWQ_ASYNC_NOSPACE, sk); if (sk->sk_err || (sk->sk_shutdown & SEND_SHUTDOWN)) return -EPIPE; diff --git 
a/net/sctp/socket.c b/net/sctp/socket.c index 897c01c029ca..2353985d689c 100644 --- a/net/sctp/socket.c +++ b/net/sctp/socket.c @@ -6458,7 +6458,7 @@ unsigned int sctp_poll(struct file *file, struct socket *sock, poll_table *wait) if (sctp_writeable(sk)) { mask |= POLLOUT | POLLWRNORM; } else { - set_bit(SOCK_ASYNC_NOSPACE, &sk->sk_socket->flags); + sk_set_bit(SOCKWQ_ASYNC_NOSPACE, sk); /* * Since the socket is not locked, the buffer * might be made available after the writeable check and diff --git a/net/socket.c b/net/socket.c index dd2c247c99e3..16be908205fc 100644 --- a/net/socket.c +++ b/net/socket.c @@ -1072,11 +1072,11 @@ int sock_wake_async(struct socket *sock, int how, int band) } switch (how) { case SOCK_WAKE_WAITD: - if (test_bit(SOCK_ASYNC_WAITDATA, &sock->flags)) + if (test_bit(SOCKWQ_ASYNC_WAITDATA, &sock->flags)) break; goto call_kill; case SOCK_WAKE_SPACE: - if (!test_and_clear_bit(SOCK_ASYNC_NOSPACE, &sock->flags)) + if (!test_and_clear_bit(SOCKWQ_ASYNC_NOSPACE, &sock->flags)) break; /* fall through */ case SOCK_WAKE_IO: diff --git a/net/sunrpc/xprtsock.c b/net/sunrpc/xprtsock.c index 1d1a70498910..2ffaf6a79499 100644 --- a/net/sunrpc/xprtsock.c +++ b/net/sunrpc/xprtsock.c @@ -398,7 +398,7 @@ static int xs_sendpages(struct socket *sock, struct sockaddr *addr, int addrlen, if (unlikely(!sock)) return -ENOTSOCK; - clear_bit(SOCK_ASYNC_NOSPACE, &sock->flags); + clear_bit(SOCKWQ_ASYNC_NOSPACE, &sock->flags); if (base != 0) { addr = NULL; addrlen = 0; @@ -442,7 +442,7 @@ static void xs_nospace_callback(struct rpc_task *task) struct sock_xprt *transport = container_of(task->tk_rqstp->rq_xprt, struct sock_xprt, xprt); transport->inet->sk_write_pending--; - clear_bit(SOCK_ASYNC_NOSPACE, &transport->sock->flags); + clear_bit(SOCKWQ_ASYNC_NOSPACE, &transport->sock->flags); } /** @@ -467,7 +467,7 @@ static int xs_nospace(struct rpc_task *task) /* Don't race with disconnect */ if (xprt_connected(xprt)) { - if (test_bit(SOCK_ASYNC_NOSPACE, &transport->sock->flags)) { + if (test_bit(SOCKWQ_ASYNC_NOSPACE, &transport->sock->flags)) { /* * Notify TCP that we're limited by the application * window size @@ -478,7 +478,7 @@ static int xs_nospace(struct rpc_task *task) xprt_wait_for_buffer_space(task, xs_nospace_callback); } } else { - clear_bit(SOCK_ASYNC_NOSPACE, &transport->sock->flags); + clear_bit(SOCKWQ_ASYNC_NOSPACE, &transport->sock->flags); ret = -ENOTCONN; } @@ -626,7 +626,7 @@ process_status: case -EPERM: /* When the server has died, an ICMP port unreachable message * prompts ECONNREFUSED. 
*/ - clear_bit(SOCK_ASYNC_NOSPACE, &transport->sock->flags); + clear_bit(SOCKWQ_ASYNC_NOSPACE, &transport->sock->flags); } return status; @@ -715,7 +715,7 @@ static int xs_tcp_send_request(struct rpc_task *task) case -EADDRINUSE: case -ENOBUFS: case -EPIPE: - clear_bit(SOCK_ASYNC_NOSPACE, &transport->sock->flags); + clear_bit(SOCKWQ_ASYNC_NOSPACE, &transport->sock->flags); } return status; @@ -1618,7 +1618,7 @@ static void xs_write_space(struct sock *sk) if (unlikely(!(xprt = xprt_from_sock(sk)))) return; - if (test_and_clear_bit(SOCK_ASYNC_NOSPACE, &sock->flags) == 0) + if (test_and_clear_bit(SOCKWQ_ASYNC_NOSPACE, &sock->flags) == 0) return; xprt_write_space(xprt); diff --git a/net/unix/af_unix.c b/net/unix/af_unix.c index 6ced74690eee..45aebd966978 100644 --- a/net/unix/af_unix.c +++ b/net/unix/af_unix.c @@ -2191,7 +2191,7 @@ static long unix_stream_data_wait(struct sock *sk, long timeo, !timeo) break; - set_bit(SOCK_ASYNC_WAITDATA, &sk->sk_socket->flags); + sk_set_bit(SOCKWQ_ASYNC_WAITDATA, sk); unix_state_unlock(sk); timeo = freezable_schedule_timeout(timeo); unix_state_lock(sk); @@ -2199,7 +2199,7 @@ static long unix_stream_data_wait(struct sock *sk, long timeo, if (sock_flag(sk, SOCK_DEAD)) break; - clear_bit(SOCK_ASYNC_WAITDATA, &sk->sk_socket->flags); + sk_clear_bit(SOCKWQ_ASYNC_WAITDATA, sk); } finish_wait(sk_sleep(sk), &wait); @@ -2683,7 +2683,7 @@ static unsigned int unix_dgram_poll(struct file *file, struct socket *sock, if (writable) mask |= POLLOUT | POLLWRNORM | POLLWRBAND; else - set_bit(SOCK_ASYNC_NOSPACE, &sk->sk_socket->flags); + sk_set_bit(SOCKWQ_ASYNC_NOSPACE, sk); return mask; } -- cgit v1.2.3-71-gd317
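Routing every call site through sk_set_bit()/sk_clear_bit() pays off in the next patch: once callers stop naming the backing field, its location can be changed in exactly one place. A minimal sketch of the accessor-indirection pattern, using hypothetical types rather than the socket structures:

/* Hypothetical types; only the helpers know where the flag word lives. */
struct conn_wq {
	unsigned long flags;
};

struct conn {
	struct conn_wq *wq;
	unsigned long flags;	/* current home of the async bits */
};

static inline void conn_set_bit(int nr, struct conn *c)
{
	set_bit(nr, &c->flags);	/* a follow-up can switch to &c->wq->flags */
}

static inline void conn_clear_bit(int nr, struct conn *c)
{
	clear_bit(nr, &c->flags);
}

From ceb5d58b217098a657f3850b7a2640f995032e62 Mon Sep 17 00:00:00 2001 From: Eric Dumazet Date: Sun, 29 Nov 2015 20:03:11 -0800 Subject: net: fix sock_wake_async() rcu protection Dmitry provided a syzkaller (http://github.com/google/syzkaller) program triggering a fault in sock_wake_async() when async IO is requested. Said program stressed af_unix sockets, but the issue is generic and should be addressed in the core networking stack. The problem is that by the time sock_wake_async() is called, we should not access the @flags field of 'struct socket', as the inode containing this socket might be freed without further notice, and without an RCU grace period. We already maintain an RCU protected structure, "struct socket_wq", so moving SOCKWQ_ASYNC_NOSPACE & SOCKWQ_ASYNC_WAITDATA into it is the safe route. It also reduces the number of cache lines that need dirtying, so it might provide a performance improvement as well. In followup patches, we might move the remaining flags (SOCK_NOSPACE, SOCK_PASSCRED, SOCK_PASSSEC) to save 8 bytes and let 'struct socket' be mostly read, allowing it to be shared between cpus. Reported-by: Dmitry Vyukov Signed-off-by: Eric Dumazet Signed-off-by: David S. Miller --- include/linux/net.h | 7 ++++++- include/net/sock.h | 23 ++++++++++++++++------- net/core/stream.c | 2 +- net/sctp/socket.c | 24 ++++++++++++++---------- net/socket.c | 21 +++++++-------------- 5 files changed, 44 insertions(+), 33 deletions(-) (limited to 'include/linux') diff --git a/include/linux/net.h b/include/linux/net.h index f514e4dd5521..0b4ac7da583a 100644 --- a/include/linux/net.h +++ b/include/linux/net.h @@ -34,6 +34,10 @@ struct inode; struct file; struct net; +/* Historically, SOCKWQ_ASYNC_NOSPACE & SOCKWQ_ASYNC_WAITDATA were located + * in sock->flags, but moved into sk->sk_wq->flags to be RCU protected. + * Eventually all flags will be in sk->sk_wq_flags.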
+ */ #define SOCKWQ_ASYNC_NOSPACE 0 #define SOCKWQ_ASYNC_WAITDATA 1 #define SOCK_NOSPACE 2 @@ -89,6 +93,7 @@ struct socket_wq { /* Note: wait MUST be first field of socket_wq */ wait_queue_head_t wait; struct fasync_struct *fasync_list; + unsigned long flags; /* %SOCKWQ_ASYNC_NOSPACE, etc */ struct rcu_head rcu; } ____cacheline_aligned_in_smp; @@ -202,7 +207,7 @@ enum { SOCK_WAKE_URG, }; -int sock_wake_async(struct socket *sk, int how, int band); +int sock_wake_async(struct socket_wq *sk_wq, int how, int band); int sock_register(const struct net_proto_family *fam); void sock_unregister(int family); int __sock_create(struct net *net, int family, int type, int proto, diff --git a/include/net/sock.h b/include/net/sock.h index c155d09d8af4..0434138c5f95 100644 --- a/include/net/sock.h +++ b/include/net/sock.h @@ -384,8 +384,10 @@ struct sock { int sk_rcvbuf; struct sk_filter __rcu *sk_filter; - struct socket_wq __rcu *sk_wq; - + union { + struct socket_wq __rcu *sk_wq; + struct socket_wq *sk_wq_raw; + }; #ifdef CONFIG_XFRM struct xfrm_policy *sk_policy[2]; #endif @@ -2005,20 +2007,27 @@ static inline unsigned long sock_wspace(struct sock *sk) return amt; } +/* Note: + * We use sk->sk_wq_raw, from contexts knowing this + * pointer is not NULL and cannot disappear/change. + */ static inline void sk_set_bit(int nr, struct sock *sk) { - set_bit(nr, &sk->sk_socket->flags); + set_bit(nr, &sk->sk_wq_raw->flags); } static inline void sk_clear_bit(int nr, struct sock *sk) { - clear_bit(nr, &sk->sk_socket->flags); + clear_bit(nr, &sk->sk_wq_raw->flags); } -static inline void sk_wake_async(struct sock *sk, int how, int band) +static inline void sk_wake_async(const struct sock *sk, int how, int band) { - if (sock_flag(sk, SOCK_FASYNC)) - sock_wake_async(sk->sk_socket, how, band); + if (sock_flag(sk, SOCK_FASYNC)) { + rcu_read_lock(); + sock_wake_async(rcu_dereference(sk->sk_wq), how, band); + rcu_read_unlock(); + } } /* Since sk_{r,w}mem_alloc sums skb->truesize, even a small frame might diff --git a/net/core/stream.c b/net/core/stream.c index 43309428644d..b96f7a79e544 100644 --- a/net/core/stream.c +++ b/net/core/stream.c @@ -39,7 +39,7 @@ void sk_stream_write_space(struct sock *sk) wake_up_interruptible_poll(&wq->wait, POLLOUT | POLLWRNORM | POLLWRBAND); if (wq && wq->fasync_list && !(sk->sk_shutdown & SEND_SHUTDOWN)) - sock_wake_async(sock, SOCK_WAKE_SPACE, POLL_OUT); + sock_wake_async(wq, SOCK_WAKE_SPACE, POLL_OUT); rcu_read_unlock(); } } diff --git a/net/sctp/socket.c b/net/sctp/socket.c index 2353985d689c..5e35ef34008b 100644 --- a/net/sctp/socket.c +++ b/net/sctp/socket.c @@ -6801,26 +6801,30 @@ no_packet: static void __sctp_write_space(struct sctp_association *asoc) { struct sock *sk = asoc->base.sk; - struct socket *sock = sk->sk_socket; - if ((sctp_wspace(asoc) > 0) && sock) { - if (waitqueue_active(&asoc->wait)) - wake_up_interruptible(&asoc->wait); + if (sctp_wspace(asoc) <= 0) + return; + + if (waitqueue_active(&asoc->wait)) + wake_up_interruptible(&asoc->wait); - if (sctp_writeable(sk)) { - wait_queue_head_t *wq = sk_sleep(sk); + if (sctp_writeable(sk)) { + struct socket_wq *wq; - if (wq && waitqueue_active(wq)) - wake_up_interruptible(wq); + rcu_read_lock(); + wq = rcu_dereference(sk->sk_wq); + if (wq) { + if (waitqueue_active(&wq->wait)) + wake_up_interruptible(&wq->wait); /* Note that we try to include the Async I/O support * here by modeling from the current TCP/UDP code. * We have not tested with it yet. 
*/ if (!(sk->sk_shutdown & SEND_SHUTDOWN)) - sock_wake_async(sock, - SOCK_WAKE_SPACE, POLL_OUT); + sock_wake_async(wq, SOCK_WAKE_SPACE, POLL_OUT); } + rcu_read_unlock(); } } diff --git a/net/socket.c b/net/socket.c index 16be908205fc..456fadb3d819 100644 --- a/net/socket.c +++ b/net/socket.c @@ -1056,27 +1056,20 @@ static int sock_fasync(int fd, struct file *filp, int on) return 0; } -/* This function may be called only under socket lock or callback_lock or rcu_lock */ +/* This function may be called only under rcu_lock */ -int sock_wake_async(struct socket *sock, int how, int band) +int sock_wake_async(struct socket_wq *wq, int how, int band) { - struct socket_wq *wq; - - if (!sock) - return -1; - rcu_read_lock(); - wq = rcu_dereference(sock->wq); - if (!wq || !wq->fasync_list) { - rcu_read_unlock(); + if (!wq || !wq->fasync_list) return -1; - } + switch (how) { case SOCK_WAKE_WAITD: - if (test_bit(SOCKWQ_ASYNC_WAITDATA, &sock->flags)) + if (test_bit(SOCKWQ_ASYNC_WAITDATA, &wq->flags)) break; goto call_kill; case SOCK_WAKE_SPACE: - if (!test_and_clear_bit(SOCKWQ_ASYNC_NOSPACE, &sock->flags)) + if (!test_and_clear_bit(SOCKWQ_ASYNC_NOSPACE, &wq->flags)) break; /* fall through */ case SOCK_WAKE_IO: @@ -1086,7 +1079,7 @@ call_kill: case SOCK_WAKE_URG: kill_fasync(&wq->fasync_list, SIGURG, band); } - rcu_read_unlock(); + return 0; } EXPORT_SYMBOL(sock_wake_async); -- cgit v1.2.3-71-gd317
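The rewritten sock_wake_async() shows the usual shape of an RCU read-side fix: the caller takes the read lock, snapshots the protected pointer once with rcu_dereference(), and passes the snapshot down instead of letting the callee re-derive it. A reduced sketch of the pattern, with hypothetical types:

/* Hypothetical RCU read-side pattern: dereference once, pass the snapshot. */
struct waiter {
	unsigned long flags;
};

struct conn {
	struct waiter __rcu *w;
};

static void waiter_wake(struct waiter *w)
{
	clear_bit(0, &w->flags);	/* operates only on the snapshot */
}

static void conn_notify(struct conn *c)
{
	struct waiter *w;

	rcu_read_lock();
	w = rcu_dereference(c->w);	/* may become stale after unlock */
	if (w)
		waiter_wake(w);
	rcu_read_unlock();
}

From 91420b83baa046ada1a899c97f3b2c52a9045705 Mon Sep 17 00:00:00 2001 From: Sudarsana Kalluru Date: Mon, 30 Nov 2015 12:25:03 +0200 Subject: qed: Add support for changing LED state Physical LEDs are controlled by the management FW. This adds the qed functionality required to request the management FW to change the LED configuration, as well as the necessary APIs for this functionality to later be used by the protocol drivers. Signed-off-by: Sudarsana Kalluru Signed-off-by: Yuval Mintz Signed-off-by: David S.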
Miller --- drivers/net/ethernet/qlogic/qed/qed_hsi.h | 6 ++++++ drivers/net/ethernet/qlogic/qed/qed_main.c | 18 ++++++++++++++++++ drivers/net/ethernet/qlogic/qed/qed_mcp.c | 27 +++++++++++++++++++++++++++ drivers/net/ethernet/qlogic/qed/qed_mcp.h | 13 +++++++++++++ include/linux/qed/qed_if.h | 17 +++++++++++++++++ 5 files changed, 81 insertions(+) (limited to 'include/linux') diff --git a/drivers/net/ethernet/qlogic/qed/qed_hsi.h b/drivers/net/ethernet/qlogic/qed/qed_hsi.h index b2f8e854dfd1..264e954675d1 100644 --- a/drivers/net/ethernet/qlogic/qed/qed_hsi.h +++ b/drivers/net/ethernet/qlogic/qed/qed_hsi.h @@ -3993,6 +3993,8 @@ struct public_drv_mb { #define DRV_MSG_CODE_PHY_CORE_WRITE 0x000e0000 #define DRV_MSG_CODE_SET_VERSION 0x000f0000 +#define DRV_MSG_CODE_SET_LED_MODE 0x00200000 + #define DRV_MSG_SEQ_NUMBER_MASK 0x0000ffff u32 drv_mb_param; @@ -4044,6 +4046,10 @@ struct public_drv_mb { #define DRV_MB_PARAM_CFG_VF_MSIX_SB_NUM_SHIFT 8 #define DRV_MB_PARAM_CFG_VF_MSIX_SB_NUM_MASK 0x0000FF00 +#define DRV_MB_PARAM_SET_LED_MODE_OPER 0x0 +#define DRV_MB_PARAM_SET_LED_MODE_ON 0x1 +#define DRV_MB_PARAM_SET_LED_MODE_OFF 0x2 + u32 fw_mb_header; #define FW_MSG_CODE_MASK 0xffff0000 #define FW_MSG_CODE_DRV_LOAD_ENGINE 0x10100000 diff --git a/drivers/net/ethernet/qlogic/qed/qed_main.c b/drivers/net/ethernet/qlogic/qed/qed_main.c index 947c7af72b25..6b02e1134360 100644 --- a/drivers/net/ethernet/qlogic/qed/qed_main.c +++ b/drivers/net/ethernet/qlogic/qed/qed_main.c @@ -1135,6 +1135,23 @@ static int qed_drain(struct qed_dev *cdev) return 0; } +static int qed_set_led(struct qed_dev *cdev, enum qed_led_mode mode) +{ + struct qed_hwfn *hwfn = QED_LEADING_HWFN(cdev); + struct qed_ptt *ptt; + int status = 0; + + ptt = qed_ptt_acquire(hwfn); + if (!ptt) + return -EAGAIN; + + status = qed_mcp_set_led(hwfn, ptt, mode); + + qed_ptt_release(hwfn, ptt); + + return status; +} + const struct qed_common_ops qed_common_ops_pass = { .probe = &qed_probe, .remove = &qed_remove, @@ -1155,6 +1172,7 @@ const struct qed_common_ops qed_common_ops_pass = { .update_msglvl = &qed_init_dp, .chain_alloc = &qed_chain_alloc, .chain_free = &qed_chain_free, + .set_led = &qed_set_led, }; u32 qed_get_protocol_version(enum qed_protocol protocol) diff --git a/drivers/net/ethernet/qlogic/qed/qed_mcp.c b/drivers/net/ethernet/qlogic/qed/qed_mcp.c index 20d048cdcb88..ba1b1f1ef789 100644 --- a/drivers/net/ethernet/qlogic/qed/qed_mcp.c +++ b/drivers/net/ethernet/qlogic/qed/qed_mcp.c @@ -858,3 +858,30 @@ qed_mcp_send_drv_version(struct qed_hwfn *p_hwfn, return 0; } + +int qed_mcp_set_led(struct qed_hwfn *p_hwfn, struct qed_ptt *p_ptt, + enum qed_led_mode mode) +{ + u32 resp = 0, param = 0, drv_mb_param; + int rc; + + switch (mode) { + case QED_LED_MODE_ON: + drv_mb_param = DRV_MB_PARAM_SET_LED_MODE_ON; + break; + case QED_LED_MODE_OFF: + drv_mb_param = DRV_MB_PARAM_SET_LED_MODE_OFF; + break; + case QED_LED_MODE_RESTORE: + drv_mb_param = DRV_MB_PARAM_SET_LED_MODE_OPER; + break; + default: + DP_NOTICE(p_hwfn, "Invalid LED mode %d\n", mode); + return -EINVAL; + } + + rc = qed_mcp_cmd(p_hwfn, p_ptt, DRV_MSG_CODE_SET_LED_MODE, + drv_mb_param, &resp, &param); + + return rc; +} diff --git a/drivers/net/ethernet/qlogic/qed/qed_mcp.h b/drivers/net/ethernet/qlogic/qed/qed_mcp.h index dbaae586b4a7..506197d5c3dd 100644 --- a/drivers/net/ethernet/qlogic/qed/qed_mcp.h +++ b/drivers/net/ethernet/qlogic/qed/qed_mcp.h @@ -224,6 +224,19 @@ qed_mcp_send_drv_version(struct qed_hwfn *p_hwfn, struct qed_ptt *p_ptt, struct qed_mcp_drv_version *p_ver); +/** + * @brief
Set LED status + * + * @param p_hwfn + * @param p_ptt + * @param mode - LED mode + * + * @return int - 0 - operation was successful. + */ +int qed_mcp_set_led(struct qed_hwfn *p_hwfn, + struct qed_ptt *p_ptt, + enum qed_led_mode mode); + /* Using hwfn number (and not pf_num) is required since in CMT mode, * same pf_num may be used by two different hwfn * TODO - this shouldn't really be in .h file, but until all fields diff --git a/include/linux/qed/qed_if.h b/include/linux/qed/qed_if.h index dc9a1353f971..d4a32e878180 100644 --- a/include/linux/qed/qed_if.h +++ b/include/linux/qed/qed_if.h @@ -25,6 +25,12 @@ #include #include +enum qed_led_mode { + QED_LED_MODE_OFF, + QED_LED_MODE_ON, + QED_LED_MODE_RESTORE +}; + #define DIRECT_REG_WR(reg_addr, val) writel((u32)val, \ (void __iomem *)(reg_addr)) @@ -252,6 +258,17 @@ struct qed_common_ops { void (*chain_free)(struct qed_dev *cdev, struct qed_chain *p_chain); + +/** + * @brief set_led - Configure LED mode + * + * @param cdev + * @param mode - LED mode + * + * @return 0 on success, error otherwise. + */ + int (*set_led)(struct qed_dev *cdev, + enum qed_led_mode mode); }; /** -- cgit v1.2.3-71-gd317
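The qed patch does not show a consumer of the new op; a hedged sketch of one plausible call site, an ethtool set_phys_id handler in a protocol driver built on top of qed (the structures and the ethtool mapping here are hypothetical, not part of this patch):

/* Hypothetical ethtool set_phys_id handler layered on the new set_led op. */
static int edev_set_phys_id(struct net_device *ndev,
			    enum ethtool_phys_id_state state)
{
	struct example_dev *edev = netdev_priv(ndev);	/* hypothetical priv */

	switch (state) {
	case ETHTOOL_ID_ACTIVE:
		return 1;	/* ask the core to cycle ON/OFF once a second */
	case ETHTOOL_ID_ON:
		return edev->common_ops->set_led(edev->cdev, QED_LED_MODE_ON);
	case ETHTOOL_ID_OFF:
		return edev->common_ops->set_led(edev->cdev, QED_LED_MODE_OFF);
	case ETHTOOL_ID_INACTIVE:
		return edev->common_ops->set_led(edev->cdev,
						 QED_LED_MODE_RESTORE);
	}
	return -EINVAL;
}

From 45f6fad84cc305103b28d73482b344d7f5b76f39 Mon Sep 17 00:00:00 2001 From: Eric Dumazet Date: Sun, 29 Nov 2015 19:37:57 -0800 Subject: ipv6: add complete rcu protection around np->opt This patch addresses multiple problems: UDP/RAW sendmsg() need to get a stable struct ipv6_txoptions while the socket is not locked: other threads can change np->opt concurrently. Dmitry posted a syzkaller (http://github.com/google/syzkaller) program demonstrating a use-after-free. Starting with TCP/DCCP lockless listeners, tcp_v6_syn_recv_sock() and dccp_v6_request_recv_sock() also need to use RCU protection to dereference np->opt once (before calling ipv6_dup_options()). This patch adds full RCU protection to np->opt. Reported-by: Dmitry Vyukov Signed-off-by: Eric Dumazet Acked-by: Hannes Frederic Sowa Signed-off-by: David S. Miller --- include/linux/ipv6.h | 2 +- include/net/ipv6.h | 21 ++++++++++++++++++++- net/dccp/ipv6.c | 33 +++++++++++++++++++++------------ net/ipv6/af_inet6.c | 13 +++++++++---- net/ipv6/datagram.c | 4 +++- net/ipv6/exthdrs.c | 3 ++- net/ipv6/inet6_connection_sock.c | 11 ++++++++--- net/ipv6/ipv6_sockglue.c | 33 ++++++++++++++++++++----------- net/ipv6/raw.c | 8 ++++++-- net/ipv6/syncookies.c | 2 +- net/ipv6/tcp_ipv6.c | 28 +++++++++++++++++----------- net/ipv6/udp.c | 8 ++++++-- net/l2tp/l2tp_ip6.c | 8 ++++++-- 13 files changed, 122 insertions(+), 52 deletions(-) (limited to 'include/linux') diff --git a/include/linux/ipv6.h b/include/linux/ipv6.h index 0ef2a97ccdb5..402753bccafa 100644 --- a/include/linux/ipv6.h +++ b/include/linux/ipv6.h @@ -227,7 +227,7 @@ struct ipv6_pinfo { struct ipv6_ac_socklist *ipv6_ac_list; struct ipv6_fl_socklist __rcu *ipv6_fl_list; - struct ipv6_txoptions *opt; + struct ipv6_txoptions __rcu *opt; struct sk_buff *pktoptions; struct sk_buff *rxpmtu; struct inet6_cork cork; diff --git a/include/net/ipv6.h b/include/net/ipv6.h index ea5a13ef85a6..9a5c9f013784 100644 --- a/include/net/ipv6.h +++ b/include/net/ipv6.h @@ -205,6 +205,7 @@ extern rwlock_t ip6_ra_lock; */ struct ipv6_txoptions { + atomic_t refcnt; /* Length of this structure */ int tot_len; @@ -217,7 +218,7 @@ struct ipv6_txoptions { struct ipv6_opt_hdr *dst0opt; struct ipv6_rt_hdr *srcrt; /* Routing Header */ struct ipv6_opt_hdr *dst1opt; - + struct rcu_head rcu; /* Option buffer, as read by IPV6_PKTOPTIONS, starts here.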
*/ }; @@ -252,6 +253,24 @@ struct ipv6_fl_socklist { struct rcu_head rcu; }; +static inline struct ipv6_txoptions *txopt_get(const struct ipv6_pinfo *np) +{ + struct ipv6_txoptions *opt; + + rcu_read_lock(); + opt = rcu_dereference(np->opt); + if (opt && !atomic_inc_not_zero(&opt->refcnt)) + opt = NULL; + rcu_read_unlock(); + return opt; +} + +static inline void txopt_put(struct ipv6_txoptions *opt) +{ + if (opt && atomic_dec_and_test(&opt->refcnt)) + kfree_rcu(opt, rcu); +} + struct ip6_flowlabel *fl6_sock_lookup(struct sock *sk, __be32 label); struct ipv6_txoptions *fl6_merge_options(struct ipv6_txoptions *opt_space, struct ip6_flowlabel *fl, diff --git a/net/dccp/ipv6.c b/net/dccp/ipv6.c index db5fc2440a23..e7e0b9bc2a43 100644 --- a/net/dccp/ipv6.c +++ b/net/dccp/ipv6.c @@ -202,7 +202,9 @@ static int dccp_v6_send_response(const struct sock *sk, struct request_sock *req security_req_classify_flow(req, flowi6_to_flowi(&fl6)); - final_p = fl6_update_dst(&fl6, np->opt, &final); + rcu_read_lock(); + final_p = fl6_update_dst(&fl6, rcu_dereference(np->opt), &final); + rcu_read_unlock(); dst = ip6_dst_lookup_flow(sk, &fl6, final_p); if (IS_ERR(dst)) { @@ -219,7 +221,10 @@ static int dccp_v6_send_response(const struct sock *sk, struct request_sock *req &ireq->ir_v6_loc_addr, &ireq->ir_v6_rmt_addr); fl6.daddr = ireq->ir_v6_rmt_addr; - err = ip6_xmit(sk, skb, &fl6, np->opt, np->tclass); + rcu_read_lock(); + err = ip6_xmit(sk, skb, &fl6, rcu_dereference(np->opt), + np->tclass); + rcu_read_unlock(); err = net_xmit_eval(err); } @@ -387,6 +392,7 @@ static struct sock *dccp_v6_request_recv_sock(const struct sock *sk, struct inet_request_sock *ireq = inet_rsk(req); struct ipv6_pinfo *newnp; const struct ipv6_pinfo *np = inet6_sk(sk); + struct ipv6_txoptions *opt; struct inet_sock *newinet; struct dccp6_sock *newdp6; struct sock *newsk; @@ -488,13 +494,15 @@ static struct sock *dccp_v6_request_recv_sock(const struct sock *sk, * Yes, keeping reference count would be much more clever, but we make * one more one thing there: reattach optmem to newsk. 
*/ - if (np->opt != NULL) - newnp->opt = ipv6_dup_options(newsk, np->opt); - + opt = rcu_dereference(np->opt); + if (opt) { + opt = ipv6_dup_options(newsk, opt); + RCU_INIT_POINTER(newnp->opt, opt); + } inet_csk(newsk)->icsk_ext_hdr_len = 0; - if (newnp->opt != NULL) - inet_csk(newsk)->icsk_ext_hdr_len = (newnp->opt->opt_nflen + - newnp->opt->opt_flen); + if (opt) + inet_csk(newsk)->icsk_ext_hdr_len = opt->opt_nflen + + opt->opt_flen; dccp_sync_mss(newsk, dst_mtu(dst)); @@ -757,6 +765,7 @@ static int dccp_v6_connect(struct sock *sk, struct sockaddr *uaddr, struct ipv6_pinfo *np = inet6_sk(sk); struct dccp_sock *dp = dccp_sk(sk); struct in6_addr *saddr = NULL, *final_p, final; + struct ipv6_txoptions *opt; struct flowi6 fl6; struct dst_entry *dst; int addr_type; @@ -856,7 +865,8 @@ static int dccp_v6_connect(struct sock *sk, struct sockaddr *uaddr, fl6.fl6_sport = inet->inet_sport; security_sk_classify_flow(sk, flowi6_to_flowi(&fl6)); - final_p = fl6_update_dst(&fl6, np->opt, &final); + opt = rcu_dereference_protected(np->opt, sock_owned_by_user(sk)); + final_p = fl6_update_dst(&fl6, opt, &final); dst = ip6_dst_lookup_flow(sk, &fl6, final_p); if (IS_ERR(dst)) { @@ -876,9 +886,8 @@ static int dccp_v6_connect(struct sock *sk, struct sockaddr *uaddr, __ip6_dst_store(sk, dst, NULL, NULL); icsk->icsk_ext_hdr_len = 0; - if (np->opt != NULL) - icsk->icsk_ext_hdr_len = (np->opt->opt_flen + - np->opt->opt_nflen); + if (opt) + icsk->icsk_ext_hdr_len = opt->opt_flen + opt->opt_nflen; inet->inet_dport = usin->sin6_port; diff --git a/net/ipv6/af_inet6.c b/net/ipv6/af_inet6.c index 44bb66bde0e2..38d66ddfb937 100644 --- a/net/ipv6/af_inet6.c +++ b/net/ipv6/af_inet6.c @@ -428,9 +428,11 @@ void inet6_destroy_sock(struct sock *sk) /* Free tx options */ - opt = xchg(&np->opt, NULL); - if (opt) - sock_kfree_s(sk, opt, opt->tot_len); + opt = xchg((__force struct ipv6_txoptions **)&np->opt, NULL); + if (opt) { + atomic_sub(opt->tot_len, &sk->sk_omem_alloc); + txopt_put(opt); + } } EXPORT_SYMBOL_GPL(inet6_destroy_sock); @@ -659,7 +661,10 @@ int inet6_sk_rebuild_header(struct sock *sk) fl6.fl6_sport = inet->inet_sport; security_sk_classify_flow(sk, flowi6_to_flowi(&fl6)); - final_p = fl6_update_dst(&fl6, np->opt, &final); + rcu_read_lock(); + final_p = fl6_update_dst(&fl6, rcu_dereference(np->opt), + &final); + rcu_read_unlock(); dst = ip6_dst_lookup_flow(sk, &fl6, final_p); if (IS_ERR(dst)) { diff --git a/net/ipv6/datagram.c b/net/ipv6/datagram.c index d70b0238f468..517c55b01ba8 100644 --- a/net/ipv6/datagram.c +++ b/net/ipv6/datagram.c @@ -167,8 +167,10 @@ ipv4_connected: security_sk_classify_flow(sk, flowi6_to_flowi(&fl6)); - opt = flowlabel ? flowlabel->opt : np->opt; + rcu_read_lock(); + opt = flowlabel ? 
flowlabel->opt : rcu_dereference(np->opt); final_p = fl6_update_dst(&fl6, opt, &final); + rcu_read_unlock(); dst = ip6_dst_lookup_flow(sk, &fl6, final_p); err = 0; diff --git a/net/ipv6/exthdrs.c b/net/ipv6/exthdrs.c index ce203b0402be..ea7c4d64a00a 100644 --- a/net/ipv6/exthdrs.c +++ b/net/ipv6/exthdrs.c @@ -727,6 +727,7 @@ ipv6_dup_options(struct sock *sk, struct ipv6_txoptions *opt) *((char **)&opt2->dst1opt) += dif; if (opt2->srcrt) *((char **)&opt2->srcrt) += dif; + atomic_set(&opt2->refcnt, 1); } return opt2; } @@ -790,7 +791,7 @@ ipv6_renew_options(struct sock *sk, struct ipv6_txoptions *opt, return ERR_PTR(-ENOBUFS); memset(opt2, 0, tot_len); - + atomic_set(&opt2->refcnt, 1); opt2->tot_len = tot_len; p = (char *)(opt2 + 1); diff --git a/net/ipv6/inet6_connection_sock.c b/net/ipv6/inet6_connection_sock.c index 5d1c7cee2cb2..3ff5208772bb 100644 --- a/net/ipv6/inet6_connection_sock.c +++ b/net/ipv6/inet6_connection_sock.c @@ -78,7 +78,9 @@ struct dst_entry *inet6_csk_route_req(const struct sock *sk, memset(fl6, 0, sizeof(*fl6)); fl6->flowi6_proto = proto; fl6->daddr = ireq->ir_v6_rmt_addr; - final_p = fl6_update_dst(fl6, np->opt, &final); + rcu_read_lock(); + final_p = fl6_update_dst(fl6, rcu_dereference(np->opt), &final); + rcu_read_unlock(); fl6->saddr = ireq->ir_v6_loc_addr; fl6->flowi6_oif = ireq->ir_iif; fl6->flowi6_mark = ireq->ir_mark; @@ -142,7 +144,9 @@ static struct dst_entry *inet6_csk_route_socket(struct sock *sk, fl6->fl6_dport = inet->inet_dport; security_sk_classify_flow(sk, flowi6_to_flowi(fl6)); - final_p = fl6_update_dst(fl6, np->opt, &final); + rcu_read_lock(); + final_p = fl6_update_dst(fl6, rcu_dereference(np->opt), &final); + rcu_read_unlock(); dst = __inet6_csk_dst_check(sk, np->dst_cookie); if (!dst) { @@ -175,7 +179,8 @@ int inet6_csk_xmit(struct sock *sk, struct sk_buff *skb, struct flowi *fl_unused /* Restore final destination back after routing done */ fl6.daddr = sk->sk_v6_daddr; - res = ip6_xmit(sk, skb, &fl6, np->opt, np->tclass); + res = ip6_xmit(sk, skb, &fl6, rcu_dereference(np->opt), + np->tclass); rcu_read_unlock(); return res; } diff --git a/net/ipv6/ipv6_sockglue.c b/net/ipv6/ipv6_sockglue.c index 63e6956917c9..4449ad1f8114 100644 --- a/net/ipv6/ipv6_sockglue.c +++ b/net/ipv6/ipv6_sockglue.c @@ -111,7 +111,8 @@ struct ipv6_txoptions *ipv6_update_options(struct sock *sk, icsk->icsk_sync_mss(sk, icsk->icsk_pmtu_cookie); } } - opt = xchg(&inet6_sk(sk)->opt, opt); + opt = xchg((__force struct ipv6_txoptions **)&inet6_sk(sk)->opt, + opt); sk_dst_reset(sk); return opt; @@ -231,9 +232,12 @@ static int do_ipv6_setsockopt(struct sock *sk, int level, int optname, sk->sk_socket->ops = &inet_dgram_ops; sk->sk_family = PF_INET; } - opt = xchg(&np->opt, NULL); - if (opt) - sock_kfree_s(sk, opt, opt->tot_len); + opt = xchg((__force struct ipv6_txoptions **)&np->opt, + NULL); + if (opt) { + atomic_sub(opt->tot_len, &sk->sk_omem_alloc); + txopt_put(opt); + } pktopt = xchg(&np->pktoptions, NULL); kfree_skb(pktopt); @@ -403,7 +407,8 @@ static int do_ipv6_setsockopt(struct sock *sk, int level, int optname, if (optname != IPV6_RTHDR && !ns_capable(net->user_ns, CAP_NET_RAW)) break; - opt = ipv6_renew_options(sk, np->opt, optname, + opt = rcu_dereference_protected(np->opt, sock_owned_by_user(sk)); + opt = ipv6_renew_options(sk, opt, optname, (struct ipv6_opt_hdr __user *)optval, optlen); if (IS_ERR(opt)) { @@ -432,8 +437,10 @@ static int do_ipv6_setsockopt(struct sock *sk, int level, int optname, retv = 0; opt = ipv6_update_options(sk, opt); sticky_done: - if (opt) - 
sock_kfree_s(sk, opt, opt->tot_len); + if (opt) { + atomic_sub(opt->tot_len, &sk->sk_omem_alloc); + txopt_put(opt); + } break; } @@ -486,6 +493,7 @@ sticky_done: break; memset(opt, 0, sizeof(*opt)); + atomic_set(&opt->refcnt, 1); opt->tot_len = sizeof(*opt) + optlen; retv = -EFAULT; if (copy_from_user(opt+1, optval, optlen)) @@ -502,8 +510,10 @@ update: retv = 0; opt = ipv6_update_options(sk, opt); done: - if (opt) - sock_kfree_s(sk, opt, opt->tot_len); + if (opt) { + atomic_sub(opt->tot_len, &sk->sk_omem_alloc); + txopt_put(opt); + } break; } case IPV6_UNICAST_HOPS: @@ -1110,10 +1120,11 @@ static int do_ipv6_getsockopt(struct sock *sk, int level, int optname, case IPV6_RTHDR: case IPV6_DSTOPTS: { + struct ipv6_txoptions *opt; lock_sock(sk); - len = ipv6_getsockopt_sticky(sk, np->opt, - optname, optval, len); + opt = rcu_dereference_protected(np->opt, sock_owned_by_user(sk)); + len = ipv6_getsockopt_sticky(sk, opt, optname, optval, len); release_sock(sk); /* check if ipv6_getsockopt_sticky() returns err code */ if (len < 0) diff --git a/net/ipv6/raw.c b/net/ipv6/raw.c index dc65ec198f7c..99140986e887 100644 --- a/net/ipv6/raw.c +++ b/net/ipv6/raw.c @@ -733,6 +733,7 @@ static int raw6_getfrag(void *from, char *to, int offset, int len, int odd, static int rawv6_sendmsg(struct sock *sk, struct msghdr *msg, size_t len) { + struct ipv6_txoptions *opt_to_free = NULL; struct ipv6_txoptions opt_space; DECLARE_SOCKADDR(struct sockaddr_in6 *, sin6, msg->msg_name); struct in6_addr *daddr, *final_p, final; @@ -839,8 +840,10 @@ static int rawv6_sendmsg(struct sock *sk, struct msghdr *msg, size_t len) if (!(opt->opt_nflen|opt->opt_flen)) opt = NULL; } - if (!opt) - opt = np->opt; + if (!opt) { + opt = txopt_get(np); + opt_to_free = opt; + } if (flowlabel) opt = fl6_merge_options(&opt_space, flowlabel, opt); opt = ipv6_fixup_options(&opt_space, opt); @@ -906,6 +909,7 @@ done: dst_release(dst); out: fl6_sock_release(flowlabel); + txopt_put(opt_to_free); return err < 0 ? 
err : len; do_confirm: dst_confirm(dst); diff --git a/net/ipv6/syncookies.c b/net/ipv6/syncookies.c index bb8f2fa1c7fb..eaf7ac496d50 100644 --- a/net/ipv6/syncookies.c +++ b/net/ipv6/syncookies.c @@ -222,7 +222,7 @@ struct sock *cookie_v6_check(struct sock *sk, struct sk_buff *skb) memset(&fl6, 0, sizeof(fl6)); fl6.flowi6_proto = IPPROTO_TCP; fl6.daddr = ireq->ir_v6_rmt_addr; - final_p = fl6_update_dst(&fl6, np->opt, &final); + final_p = fl6_update_dst(&fl6, rcu_dereference(np->opt), &final); fl6.saddr = ireq->ir_v6_loc_addr; fl6.flowi6_oif = sk->sk_bound_dev_if; fl6.flowi6_mark = ireq->ir_mark; diff --git a/net/ipv6/tcp_ipv6.c b/net/ipv6/tcp_ipv6.c index c5429a636f1a..6a50bb4a0dae 100644 --- a/net/ipv6/tcp_ipv6.c +++ b/net/ipv6/tcp_ipv6.c @@ -120,6 +120,7 @@ static int tcp_v6_connect(struct sock *sk, struct sockaddr *uaddr, struct ipv6_pinfo *np = inet6_sk(sk); struct tcp_sock *tp = tcp_sk(sk); struct in6_addr *saddr = NULL, *final_p, final; + struct ipv6_txoptions *opt; struct flowi6 fl6; struct dst_entry *dst; int addr_type; @@ -235,7 +236,8 @@ static int tcp_v6_connect(struct sock *sk, struct sockaddr *uaddr, fl6.fl6_dport = usin->sin6_port; fl6.fl6_sport = inet->inet_sport; - final_p = fl6_update_dst(&fl6, np->opt, &final); + opt = rcu_dereference_protected(np->opt, sock_owned_by_user(sk)); + final_p = fl6_update_dst(&fl6, opt, &final); security_sk_classify_flow(sk, flowi6_to_flowi(&fl6)); @@ -263,9 +265,9 @@ static int tcp_v6_connect(struct sock *sk, struct sockaddr *uaddr, tcp_fetch_timewait_stamp(sk, dst); icsk->icsk_ext_hdr_len = 0; - if (np->opt) - icsk->icsk_ext_hdr_len = (np->opt->opt_flen + - np->opt->opt_nflen); + if (opt) + icsk->icsk_ext_hdr_len = opt->opt_flen + + opt->opt_nflen; tp->rx_opt.mss_clamp = IPV6_MIN_MTU - sizeof(struct tcphdr) - sizeof(struct ipv6hdr); @@ -461,7 +463,8 @@ static int tcp_v6_send_synack(const struct sock *sk, struct dst_entry *dst, if (np->repflow && ireq->pktopts) fl6->flowlabel = ip6_flowlabel(ipv6_hdr(ireq->pktopts)); - err = ip6_xmit(sk, skb, fl6, np->opt, np->tclass); + err = ip6_xmit(sk, skb, fl6, rcu_dereference(np->opt), + np->tclass); err = net_xmit_eval(err); } @@ -972,6 +975,7 @@ static struct sock *tcp_v6_syn_recv_sock(const struct sock *sk, struct sk_buff * struct inet_request_sock *ireq; struct ipv6_pinfo *newnp; const struct ipv6_pinfo *np = inet6_sk(sk); + struct ipv6_txoptions *opt; struct tcp6_sock *newtcp6sk; struct inet_sock *newinet; struct tcp_sock *newtp; @@ -1098,13 +1102,15 @@ static struct sock *tcp_v6_syn_recv_sock(const struct sock *sk, struct sk_buff * but we make one more one thing there: reattach optmem to newsk. 
*/ - if (np->opt) - newnp->opt = ipv6_dup_options(newsk, np->opt); - + opt = rcu_dereference(np->opt); + if (opt) { + opt = ipv6_dup_options(newsk, opt); + RCU_INIT_POINTER(newnp->opt, opt); + } inet_csk(newsk)->icsk_ext_hdr_len = 0; - if (newnp->opt) - inet_csk(newsk)->icsk_ext_hdr_len = (newnp->opt->opt_nflen + - newnp->opt->opt_flen); + if (opt) + inet_csk(newsk)->icsk_ext_hdr_len = opt->opt_nflen + + opt->opt_flen; tcp_ca_openreq_child(newsk, dst); diff --git a/net/ipv6/udp.c b/net/ipv6/udp.c index 01bcb49619ee..9da3287a3923 100644 --- a/net/ipv6/udp.c +++ b/net/ipv6/udp.c @@ -1110,6 +1110,7 @@ int udpv6_sendmsg(struct sock *sk, struct msghdr *msg, size_t len) DECLARE_SOCKADDR(struct sockaddr_in6 *, sin6, msg->msg_name); struct in6_addr *daddr, *final_p, final; struct ipv6_txoptions *opt = NULL; + struct ipv6_txoptions *opt_to_free = NULL; struct ip6_flowlabel *flowlabel = NULL; struct flowi6 fl6; struct dst_entry *dst; @@ -1263,8 +1264,10 @@ do_udp_sendmsg: opt = NULL; connected = 0; } - if (!opt) - opt = np->opt; + if (!opt) { + opt = txopt_get(np); + opt_to_free = opt; + } if (flowlabel) opt = fl6_merge_options(&opt_space, flowlabel, opt); opt = ipv6_fixup_options(&opt_space, opt); @@ -1373,6 +1376,7 @@ release_dst: out: dst_release(dst); fl6_sock_release(flowlabel); + txopt_put(opt_to_free); if (!err) return len; /* diff --git a/net/l2tp/l2tp_ip6.c b/net/l2tp/l2tp_ip6.c index aca38d8aed8e..a2c8747d2936 100644 --- a/net/l2tp/l2tp_ip6.c +++ b/net/l2tp/l2tp_ip6.c @@ -486,6 +486,7 @@ static int l2tp_ip6_sendmsg(struct sock *sk, struct msghdr *msg, size_t len) DECLARE_SOCKADDR(struct sockaddr_l2tpip6 *, lsa, msg->msg_name); struct in6_addr *daddr, *final_p, final; struct ipv6_pinfo *np = inet6_sk(sk); + struct ipv6_txoptions *opt_to_free = NULL; struct ipv6_txoptions *opt = NULL; struct ip6_flowlabel *flowlabel = NULL; struct dst_entry *dst = NULL; @@ -575,8 +576,10 @@ static int l2tp_ip6_sendmsg(struct sock *sk, struct msghdr *msg, size_t len) opt = NULL; } - if (opt == NULL) - opt = np->opt; + if (!opt) { + opt = txopt_get(np); + opt_to_free = opt; + } if (flowlabel) opt = fl6_merge_options(&opt_space, flowlabel, opt); opt = ipv6_fixup_options(&opt_space, opt); @@ -631,6 +634,7 @@ done: dst_release(dst); out: fl6_sock_release(flowlabel); + txopt_put(opt_to_free); return err < 0 ? err : len; -- cgit v1.2.3-71-gd317
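The txopt_get()/txopt_put() pair added to include/net/ipv6.h gives all of the sendmsg paths the same shape; reduced to a skeleton (hypothetical function, transmit logic elided), the pattern is:

/* Skeleton of the refcounted-snapshot pattern used by the sendmsg paths. */
static int example_sendmsg(struct sock *sk, struct ipv6_txoptions *opt)
{
	struct ipv6_pinfo *np = inet6_sk(sk);
	struct ipv6_txoptions *opt_to_free = NULL;

	if (!opt) {
		opt = txopt_get(np);	/* reference taken under rcu_read_lock() */
		opt_to_free = opt;	/* remember which reference is ours */
	}

	/* ... build and transmit the packet using opt ... */

	txopt_put(opt_to_free);		/* kfree_rcu() once refcnt reaches zero */
	return 0;
}

From c0eb454034aab783dc602739237a63b30867f5bd Mon Sep 17 00:00:00 2001 From: KY Srinivasan Date: Tue, 1 Dec 2015 16:43:10 -0800 Subject: hv_netvsc: Don't ask for additional head room in the skb The rndis header is 116 bytes in size and can be placed in the default head room that will be available in the skb. Since the netvsc packet is less than 48 bytes, we can use the skb control buffer for the netvsc packet. With these changes we don't need to ask for additional head room. Signed-off-by: K. Y. Srinivasan Reviewed-by: Haiyang Zhang Signed-off-by: David S.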
Miller --- drivers/net/hyperv/hyperv_net.h | 3 +++ drivers/net/hyperv/netvsc_drv.c | 30 +++++++++++------------------- include/linux/netdevice.h | 4 +++- 3 files changed, 17 insertions(+), 20 deletions(-) (limited to 'include/linux') diff --git a/drivers/net/hyperv/hyperv_net.h b/drivers/net/hyperv/hyperv_net.h index fc6d0c6de741..731054ef6da5 100644 --- a/drivers/net/hyperv/hyperv_net.h +++ b/drivers/net/hyperv/hyperv_net.h @@ -124,6 +124,9 @@ struct ndis_tcp_ip_checksum_info; /* * Represent netvsc packet which contains 1 RNDIS and 1 ethernet frame * within the RNDIS + * + * The size of this structure is less than 48 bytes and we can now + * place this structure in the skb->cb field. */ struct hv_netvsc_packet { /* Bookkeeping stuff */ diff --git a/drivers/net/hyperv/netvsc_drv.c b/drivers/net/hyperv/netvsc_drv.c index 7e356a11c1d7..b820888409bc 100644 --- a/drivers/net/hyperv/netvsc_drv.c +++ b/drivers/net/hyperv/netvsc_drv.c @@ -433,7 +433,6 @@ static int netvsc_start_xmit(struct sk_buff *skb, struct net_device *net) u32 net_trans_info; u32 hash; u32 skb_length; - u32 pkt_sz; struct hv_page_buffer page_buf[MAX_PAGE_BUFFER_COUNT]; struct netvsc_stats *tx_stats = this_cpu_ptr(net_device_ctx->tx_stats); @@ -461,16 +460,21 @@ check_size: goto check_size; } - pkt_sz = sizeof(struct hv_netvsc_packet) + RNDIS_AND_PPI_SIZE; - - ret = skb_cow_head(skb, pkt_sz); + /* + * Place the rndis header in the skb head room and + * the skb->cb will be used for hv_netvsc_packet + * structure. + */ + ret = skb_cow_head(skb, RNDIS_AND_PPI_SIZE); if (ret) { netdev_err(net, "unable to alloc hv_netvsc_packet\n"); ret = -ENOMEM; goto drop; } - /* Use the headroom for building up the packet */ - packet = (struct hv_netvsc_packet *)skb->head; + /* Use the skb control buffer for building up the packet */ + BUILD_BUG_ON(sizeof(struct hv_netvsc_packet) > + FIELD_SIZEOF(struct sk_buff, cb)); + packet = (struct hv_netvsc_packet *)skb->cb; packet->status = 0; packet->xmit_more = skb->xmit_more; @@ -483,8 +487,7 @@ check_size: packet->is_data_pkt = true; packet->total_data_buflen = skb->len; - rndis_msg = (struct rndis_message *)((unsigned long)packet + - sizeof(struct hv_netvsc_packet)); + rndis_msg = (struct rndis_message *)skb->head; memset(rndis_msg, 0, RNDIS_AND_PPI_SIZE); @@ -1118,16 +1121,12 @@ static int netvsc_probe(struct hv_device *dev, struct netvsc_device_info device_info; struct netvsc_device *nvdev; int ret; - u32 max_needed_headroom; net = alloc_etherdev_mq(sizeof(struct net_device_context), num_online_cpus()); if (!net) return -ENOMEM; - max_needed_headroom = sizeof(struct hv_netvsc_packet) + - RNDIS_AND_PPI_SIZE; - netif_carrier_off(net); net_device_ctx = netdev_priv(net); @@ -1166,13 +1165,6 @@ static int netvsc_probe(struct hv_device *dev, net->ethtool_ops = &ethtool_ops; SET_NETDEV_DEV(net, &dev->device); - /* - * Request additional head room in the skb. - * We will use this space to build the rndis - * heaser and other state we need to maintain. - */ - net->needed_headroom = max_needed_headroom; - /* Notify the netvsc driver of the new device */ memset(&device_info, 0, sizeof(device_info)); device_info.ring_size = ring_size; diff --git a/include/linux/netdevice.h b/include/linux/netdevice.h index 7d2d1d7aaec7..fcbc5259c630 100644 --- a/include/linux/netdevice.h +++ b/include/linux/netdevice.h @@ -132,7 +132,9 @@ static inline bool dev_xmit_complete(int rc) * used.
*/ -#if defined(CONFIG_WLAN) || IS_ENABLED(CONFIG_AX25) +#if defined(CONFIG_HYPERV_NET) +# define LL_MAX_HEADER 128 +#elif defined(CONFIG_WLAN) || IS_ENABLED(CONFIG_AX25) # if defined(CONFIG_MAC80211_MESH) # define LL_MAX_HEADER 128 # else -- cgit v1.2.3-71-gd317 From c981e4213e9d2d4ec79501bd607722ec712742a2 Mon Sep 17 00:00:00 2001 From: Jiri Pirko Date: Thu, 3 Dec 2015 12:12:06 +0100 Subject: net: add netif_is_team_master helper Similar to other helpers, caller can use this to find out if device is team master. Signed-off-by: Jiri Pirko Signed-off-by: David S. Miller --- drivers/net/team/team.c | 1 + include/linux/netdevice.h | 8 ++++++++ 2 files changed, 9 insertions(+) (limited to 'include/linux') diff --git a/drivers/net/team/team.c b/drivers/net/team/team.c index 651d35ea22c5..d2f3ee832c47 100644 --- a/drivers/net/team/team.c +++ b/drivers/net/team/team.c @@ -2054,6 +2054,7 @@ static void team_setup(struct net_device *dev) dev->flags |= IFF_MULTICAST; dev->priv_flags &= ~(IFF_XMIT_DST_RELEASE | IFF_TX_SKB_SHARING); dev->priv_flags |= IFF_NO_QUEUE; + dev->priv_flags |= IFF_TEAM; /* * Indicate we support unicast address filtering. That way core won't diff --git a/include/linux/netdevice.h b/include/linux/netdevice.h index fcbc5259c630..2b889be65d88 100644 --- a/include/linux/netdevice.h +++ b/include/linux/netdevice.h @@ -1273,6 +1273,7 @@ struct net_device_ops { * @IFF_NO_QUEUE: device can run without qdisc attached * @IFF_OPENVSWITCH: device is a Open vSwitch master * @IFF_L3MDEV_SLAVE: device is enslaved to an L3 master device + * @IFF_TEAM: device is a team device */ enum netdev_priv_flags { IFF_802_1Q_VLAN = 1<<0, @@ -1299,6 +1300,7 @@ enum netdev_priv_flags { IFF_NO_QUEUE = 1<<21, IFF_OPENVSWITCH = 1<<22, IFF_L3MDEV_SLAVE = 1<<23, + IFF_TEAM = 1<<24, }; #define IFF_802_1Q_VLAN IFF_802_1Q_VLAN @@ -1325,6 +1327,7 @@ enum netdev_priv_flags { #define IFF_NO_QUEUE IFF_NO_QUEUE #define IFF_OPENVSWITCH IFF_OPENVSWITCH #define IFF_L3MDEV_SLAVE IFF_L3MDEV_SLAVE +#define IFF_TEAM IFF_TEAM /** * struct net_device - The DEVICE structure. @@ -3889,6 +3892,11 @@ static inline bool netif_is_ovs_master(const struct net_device *dev) return dev->priv_flags & IFF_OPENVSWITCH; } +static inline bool netif_is_team_master(struct net_device *dev) +{ + return dev->priv_flags & IFF_TEAM; +} + /* This device needs to keep skb dst for qdisc enqueue or ndo_start_xmit() */ static inline void netif_keep_dst(struct net_device *dev) { -- cgit v1.2.3-71-gd317 From f7f019ee6d117de5007d0b10e7960696bbf111eb Mon Sep 17 00:00:00 2001 From: Jiri Pirko Date: Thu, 3 Dec 2015 12:12:07 +0100 Subject: net: add netif_is_team_port helper Similar to other helpers, caller can use this to find out if device is team port. Signed-off-by: Jiri Pirko Signed-off-by: David S. 
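Miller

For context, a short hypothetical caller showing how such a priv_flags helper is typically used (illustrative only, not part of the patch):

#include <linux/netdevice.h>

static int my_handle_master(struct net_device *dev)
{
	/* cheap priv_flags test, valid under RTNL or RCU */
	if (!netif_is_team_master(dev))
		return -EOPNOTSUPP;

	/* team-specific handling would go here */
	return 0;
}

Signed-off-by: David S.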
Miller --- include/linux/netdevice.h | 5 +++++ 1 file changed, 5 insertions(+) (limited to 'include/linux') diff --git a/include/linux/netdevice.h b/include/linux/netdevice.h index 2b889be65d88..b3601f8a9b42 100644 --- a/include/linux/netdevice.h +++ b/include/linux/netdevice.h @@ -3897,6 +3897,11 @@ static inline bool netif_is_team_master(struct net_device *dev) return dev->priv_flags & IFF_TEAM; } +static inline bool netif_is_team_port(struct net_device *dev) +{ + return dev->priv_flags & IFF_TEAM_PORT; +} + /* This device needs to keep skb dst for qdisc enqueue or ndo_start_xmit() */ static inline void netif_keep_dst(struct net_device *dev) { -- cgit v1.2.3-71-gd317 From 7be61833042e7757745345eedc7b0efee240c189 Mon Sep 17 00:00:00 2001 From: Jiri Pirko Date: Thu, 3 Dec 2015 12:12:08 +0100 Subject: net: add netif_is_lag_master helper Some code does not mind if the master is bond or team and treats them the same, as generic LAG. Signed-off-by: Jiri Pirko Signed-off-by: David S. Miller --- include/linux/netdevice.h | 5 +++++ 1 file changed, 5 insertions(+) (limited to 'include/linux') diff --git a/include/linux/netdevice.h b/include/linux/netdevice.h index b3601f8a9b42..3ca083efa560 100644 --- a/include/linux/netdevice.h +++ b/include/linux/netdevice.h @@ -3902,6 +3902,11 @@ static inline bool netif_is_team_port(struct net_device *dev) return dev->priv_flags & IFF_TEAM_PORT; } +static inline bool netif_is_lag_master(struct net_device *dev) +{ + return netif_is_bond_master(dev) || netif_is_team_master(dev); +} + /* This device needs to keep skb dst for qdisc enqueue or ndo_start_xmit() */ static inline void netif_keep_dst(struct net_device *dev) { -- cgit v1.2.3-71-gd317 From e0ba1414f310c83bf425fe26fa2cd5f1befcd6dc Mon Sep 17 00:00:00 2001 From: Jiri Pirko Date: Thu, 3 Dec 2015 12:12:09 +0100 Subject: net: add netif_is_lag_port helper Some code does not mind if a device is bond slave or team port and treats them the same, as generic LAG ports. Signed-off-by: Jiri Pirko Signed-off-by: David S. Miller --- include/linux/netdevice.h | 5 +++++ 1 file changed, 5 insertions(+) (limited to 'include/linux') diff --git a/include/linux/netdevice.h b/include/linux/netdevice.h index 3ca083efa560..1506be58c59a 100644 --- a/include/linux/netdevice.h +++ b/include/linux/netdevice.h @@ -3907,6 +3907,11 @@ static inline bool netif_is_lag_master(struct net_device *dev) return netif_is_bond_master(dev) || netif_is_team_master(dev); } +static inline bool netif_is_lag_port(struct net_device *dev) +{ + return netif_is_bond_slave(dev) || netif_is_team_port(dev); +} + /* This device needs to keep skb dst for qdisc enqueue or ndo_start_xmit() */ static inline void netif_keep_dst(struct net_device *dev) { -- cgit v1.2.3-71-gd317 From 6dffb0447c25476f499d205dfceb1972e8dae919 Mon Sep 17 00:00:00 2001 From: Jiri Pirko Date: Thu, 3 Dec 2015 12:12:10 +0100 Subject: net: propagate upper priv via netdev_master_upper_dev_link Eliminate netdev_master_upper_dev_link_private and pass priv directly as a parameter of netdev_master_upper_dev_link. Signed-off-by: Jiri Pirko Signed-off-by: David S. 
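A hedged sketch of the intended use: offload code that treats bond and team masters identically (my_join_lag and the hardware programming step are hypothetical):

#include <linux/netdevice.h>

static int my_join_lag(struct net_device *port_dev,
		       struct net_device *upper_dev)
{
	/* accept either a bonding master or a team master */
	if (!netif_is_lag_master(upper_dev))
		return -EOPNOTSUPP;

	/* ... program the aggregate for port_dev in hardware ... */
	return 0;
}

Signed-off-by: David S.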
Miller --- drivers/net/bonding/bond_main.c | 2 +- drivers/net/team/team.c | 2 +- drivers/net/vrf.c | 2 +- include/linux/netdevice.h | 6 ++---- net/batman-adv/hard-interface.c | 3 ++- net/bridge/br_if.c | 2 +- net/core/dev.c | 18 ++++++------------ net/openvswitch/vport-netdev.c | 2 +- 8 files changed, 15 insertions(+), 22 deletions(-) (limited to 'include/linux') diff --git a/drivers/net/bonding/bond_main.c b/drivers/net/bonding/bond_main.c index 9e0f8a7ef8b1..924015729b2d 100644 --- a/drivers/net/bonding/bond_main.c +++ b/drivers/net/bonding/bond_main.c @@ -1204,7 +1204,7 @@ static int bond_master_upper_dev_link(struct net_device *bond_dev, { int err; - err = netdev_master_upper_dev_link_private(slave_dev, bond_dev, slave); + err = netdev_master_upper_dev_link(slave_dev, bond_dev, slave); if (err) return err; slave_dev->flags |= IFF_SLAVE; diff --git a/drivers/net/team/team.c b/drivers/net/team/team.c index d2f3ee832c47..b37f8d14dca0 100644 --- a/drivers/net/team/team.c +++ b/drivers/net/team/team.c @@ -1083,7 +1083,7 @@ static int team_upper_dev_link(struct net_device *dev, { int err; - err = netdev_master_upper_dev_link(port_dev, dev); + err = netdev_master_upper_dev_link(port_dev, dev, NULL); if (err) return err; port_dev->priv_flags |= IFF_TEAM_PORT; diff --git a/drivers/net/vrf.c b/drivers/net/vrf.c index c2d54c4ed556..59c5bddeaedd 100644 --- a/drivers/net/vrf.c +++ b/drivers/net/vrf.c @@ -624,7 +624,7 @@ static int do_vrf_add_slave(struct net_device *dev, struct net_device *port_dev) goto out_fail; } - ret = netdev_master_upper_dev_link(port_dev, dev); + ret = netdev_master_upper_dev_link(port_dev, dev, NULL); if (ret < 0) goto out_unregister; diff --git a/include/linux/netdevice.h b/include/linux/netdevice.h index 1506be58c59a..939b8f3de810 100644 --- a/include/linux/netdevice.h +++ b/include/linux/netdevice.h @@ -3619,10 +3619,8 @@ struct net_device *netdev_master_upper_dev_get(struct net_device *dev); struct net_device *netdev_master_upper_dev_get_rcu(struct net_device *dev); int netdev_upper_dev_link(struct net_device *dev, struct net_device *upper_dev); int netdev_master_upper_dev_link(struct net_device *dev, - struct net_device *upper_dev); -int netdev_master_upper_dev_link_private(struct net_device *dev, - struct net_device *upper_dev, - void *private); + struct net_device *upper_dev, + void *upper_priv); void netdev_upper_dev_unlink(struct net_device *dev, struct net_device *upper_dev); void netdev_adjacent_rename_links(struct net_device *dev, char *oldname); diff --git a/net/batman-adv/hard-interface.c b/net/batman-adv/hard-interface.c index f11345e163d7..a7f4f1085dbb 100644 --- a/net/batman-adv/hard-interface.c +++ b/net/batman-adv/hard-interface.c @@ -464,7 +464,8 @@ int batadv_hardif_enable_interface(struct batadv_hard_iface *hard_iface, hard_iface->soft_iface = soft_iface; bat_priv = netdev_priv(hard_iface->soft_iface); - ret = netdev_master_upper_dev_link(hard_iface->net_dev, soft_iface); + ret = netdev_master_upper_dev_link(hard_iface->net_dev, + soft_iface, NULL); if (ret) goto err_dev; diff --git a/net/bridge/br_if.c b/net/bridge/br_if.c index ec02f5869a78..781abc34667a 100644 --- a/net/bridge/br_if.c +++ b/net/bridge/br_if.c @@ -493,7 +493,7 @@ int br_add_if(struct net_bridge *br, struct net_device *dev) dev->priv_flags |= IFF_BRIDGE_PORT; - err = netdev_master_upper_dev_link(dev, br->dev); + err = netdev_master_upper_dev_link(dev, br->dev, NULL); if (err) goto err5; diff --git a/net/core/dev.c b/net/core/dev.c index 939cd1b1da15..27d052bb78bc 100644 --- 
a/net/core/dev.c +++ b/net/core/dev.c @@ -5421,7 +5421,7 @@ static void __netdev_adjacent_dev_unlink_neighbour(struct net_device *dev, static int __netdev_upper_dev_link(struct net_device *dev, struct net_device *upper_dev, bool master, - void *private) + void *upper_priv) { struct netdev_notifier_changeupper_info changeupper_info; struct netdev_adjacent *i, *j, *to_i, *to_j; @@ -5452,7 +5452,7 @@ static int __netdev_upper_dev_link(struct net_device *dev, if (ret) return ret; - ret = __netdev_adjacent_dev_link_neighbour(dev, upper_dev, private, + ret = __netdev_adjacent_dev_link_neighbour(dev, upper_dev, upper_priv, master); if (ret) return ret; @@ -5557,6 +5557,7 @@ EXPORT_SYMBOL(netdev_upper_dev_link); * netdev_master_upper_dev_link - Add a master link to the upper device * @dev: device * @upper_dev: new upper device + * @upper_priv: upper device private * * Adds a link to device which is upper to this one. In this case, only * one master upper device can be linked, although other non-master devices @@ -5565,20 +5566,13 @@ EXPORT_SYMBOL(netdev_upper_dev_link); * counts are adjusted and the function returns zero. */ int netdev_master_upper_dev_link(struct net_device *dev, - struct net_device *upper_dev) + struct net_device *upper_dev, + void *upper_priv) { - return __netdev_upper_dev_link(dev, upper_dev, true, NULL); + return __netdev_upper_dev_link(dev, upper_dev, true, upper_priv); } EXPORT_SYMBOL(netdev_master_upper_dev_link); -int netdev_master_upper_dev_link_private(struct net_device *dev, - struct net_device *upper_dev, - void *private) -{ - return __netdev_upper_dev_link(dev, upper_dev, true, private); -} -EXPORT_SYMBOL(netdev_master_upper_dev_link_private); - /** * netdev_upper_dev_unlink - Removes a link to upper device * @dev: device diff --git a/net/openvswitch/vport-netdev.c b/net/openvswitch/vport-netdev.c index b327368a3848..3ee3df1edeae 100644 --- a/net/openvswitch/vport-netdev.c +++ b/net/openvswitch/vport-netdev.c @@ -105,7 +105,7 @@ struct vport *ovs_netdev_link(struct vport *vport, const char *name) rtnl_lock(); err = netdev_master_upper_dev_link(vport->dev, - get_dpdev(vport->dp)); + get_dpdev(vport->dp), NULL); if (err) goto error_unlock; -- cgit v1.2.3-71-gd317 From 29bf24afb29042f568fa67b1b0eee46796725ed2 Mon Sep 17 00:00:00 2001 From: Jiri Pirko Date: Thu, 3 Dec 2015 12:12:11 +0100 Subject: net: add possibility to pass information about upper device via notifier Sometimes the drivers and other code would find it handy to know some internal information about upper device being changed. So allow upper-code to pass information down to notifier listeners during linking. Signed-off-by: Jiri Pirko Signed-off-by: David S. 
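As a sketch of how a listener might consume this, a hypothetical notifier callback (the concrete layout of upper_info is defined by the upper device; the LAG info struct is introduced later in this series):

#include <linux/netdevice.h>

static int my_netdev_event(struct notifier_block *nb,
			   unsigned long event, void *ptr)
{
	struct net_device *dev = netdev_notifier_info_to_dev(ptr);
	struct netdev_notifier_changeupper_info *info = ptr;

	if (event != NETDEV_CHANGEUPPER)
		return NOTIFY_DONE;

	if (info->linking && info->master && info->upper_info)
		/* interpret upper_info per the upper device's contract,
		 * e.g. as struct netdev_lag_upper_info for bond/team */
		netdev_dbg(dev, "enslaved to %s with upper info\n",
			   info->upper_dev->name);

	return NOTIFY_DONE;
}

Signed-off-by: David S.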
Miller --- drivers/net/bonding/bond_main.c | 2 +- drivers/net/team/team.c | 2 +- drivers/net/vrf.c | 2 +- include/linux/netdevice.h | 3 ++- net/batman-adv/hard-interface.c | 2 +- net/bridge/br_if.c | 2 +- net/core/dev.c | 11 +++++++---- net/openvswitch/vport-netdev.c | 2 +- 8 files changed, 15 insertions(+), 11 deletions(-) (limited to 'include/linux') diff --git a/drivers/net/bonding/bond_main.c b/drivers/net/bonding/bond_main.c index 924015729b2d..fa3ed1d8a12d 100644 --- a/drivers/net/bonding/bond_main.c +++ b/drivers/net/bonding/bond_main.c @@ -1204,7 +1204,7 @@ static int bond_master_upper_dev_link(struct net_device *bond_dev, { int err; - err = netdev_master_upper_dev_link(slave_dev, bond_dev, slave); + err = netdev_master_upper_dev_link(slave_dev, bond_dev, slave, NULL); if (err) return err; slave_dev->flags |= IFF_SLAVE; diff --git a/drivers/net/team/team.c b/drivers/net/team/team.c index b37f8d14dca0..f7b6ff7948b8 100644 --- a/drivers/net/team/team.c +++ b/drivers/net/team/team.c @@ -1083,7 +1083,7 @@ static int team_upper_dev_link(struct net_device *dev, { int err; - err = netdev_master_upper_dev_link(port_dev, dev, NULL); + err = netdev_master_upper_dev_link(port_dev, dev, NULL, NULL); if (err) return err; port_dev->priv_flags |= IFF_TEAM_PORT; diff --git a/drivers/net/vrf.c b/drivers/net/vrf.c index 59c5bddeaedd..8944a49cda15 100644 --- a/drivers/net/vrf.c +++ b/drivers/net/vrf.c @@ -624,7 +624,7 @@ static int do_vrf_add_slave(struct net_device *dev, struct net_device *port_dev) goto out_fail; } - ret = netdev_master_upper_dev_link(port_dev, dev, NULL); + ret = netdev_master_upper_dev_link(port_dev, dev, NULL, NULL); if (ret < 0) goto out_unregister; diff --git a/include/linux/netdevice.h b/include/linux/netdevice.h index 939b8f3de810..aea556c64f2c 100644 --- a/include/linux/netdevice.h +++ b/include/linux/netdevice.h @@ -2163,6 +2163,7 @@ struct netdev_notifier_changeupper_info { struct net_device *upper_dev; /* new upper dev */ bool master; /* is upper dev master */ bool linking; /* is the nofication for link or unlink */ + void *upper_info; /* upper dev info */ }; static inline void netdev_notifier_info_init(struct netdev_notifier_info *info, @@ -3620,7 +3621,7 @@ struct net_device *netdev_master_upper_dev_get_rcu(struct net_device *dev); int netdev_upper_dev_link(struct net_device *dev, struct net_device *upper_dev); int netdev_master_upper_dev_link(struct net_device *dev, struct net_device *upper_dev, - void *upper_priv); + void *upper_priv, void *upper_info); void netdev_upper_dev_unlink(struct net_device *dev, struct net_device *upper_dev); void netdev_adjacent_rename_links(struct net_device *dev, char *oldname); diff --git a/net/batman-adv/hard-interface.c b/net/batman-adv/hard-interface.c index a7f4f1085dbb..aa8867e1d983 100644 --- a/net/batman-adv/hard-interface.c +++ b/net/batman-adv/hard-interface.c @@ -465,7 +465,7 @@ int batadv_hardif_enable_interface(struct batadv_hard_iface *hard_iface, bat_priv = netdev_priv(hard_iface->soft_iface); ret = netdev_master_upper_dev_link(hard_iface->net_dev, - soft_iface, NULL); + soft_iface, NULL, NULL); if (ret) goto err_dev; diff --git a/net/bridge/br_if.c b/net/bridge/br_if.c index 781abc34667a..8d1d4a22c50d 100644 --- a/net/bridge/br_if.c +++ b/net/bridge/br_if.c @@ -493,7 +493,7 @@ int br_add_if(struct net_bridge *br, struct net_device *dev) dev->priv_flags |= IFF_BRIDGE_PORT; - err = netdev_master_upper_dev_link(dev, br->dev, NULL); + err = netdev_master_upper_dev_link(dev, br->dev, NULL, NULL); if (err) goto err5; diff --git 
a/net/core/dev.c b/net/core/dev.c index 27d052bb78bc..8ed886663c6d 100644 --- a/net/core/dev.c +++ b/net/core/dev.c @@ -5421,7 +5421,7 @@ static void __netdev_adjacent_dev_unlink_neighbour(struct net_device *dev, static int __netdev_upper_dev_link(struct net_device *dev, struct net_device *upper_dev, bool master, - void *upper_priv) + void *upper_priv, void *upper_info) { struct netdev_notifier_changeupper_info changeupper_info; struct netdev_adjacent *i, *j, *to_i, *to_j; @@ -5445,6 +5445,7 @@ static int __netdev_upper_dev_link(struct net_device *dev, changeupper_info.upper_dev = upper_dev; changeupper_info.master = master; changeupper_info.linking = true; + changeupper_info.upper_info = upper_info; ret = call_netdevice_notifiers_info(NETDEV_PRECHANGEUPPER, dev, &changeupper_info.info); @@ -5549,7 +5550,7 @@ rollback_mesh: int netdev_upper_dev_link(struct net_device *dev, struct net_device *upper_dev) { - return __netdev_upper_dev_link(dev, upper_dev, false, NULL); + return __netdev_upper_dev_link(dev, upper_dev, false, NULL, NULL); } EXPORT_SYMBOL(netdev_upper_dev_link); @@ -5558,6 +5559,7 @@ EXPORT_SYMBOL(netdev_upper_dev_link); * @dev: device * @upper_dev: new upper device * @upper_priv: upper device private + * @upper_info: upper info to be passed down via notifier * * Adds a link to device which is upper to this one. In this case, only * one master upper device can be linked, although other non-master devices @@ -5567,9 +5569,10 @@ EXPORT_SYMBOL(netdev_upper_dev_link); */ int netdev_master_upper_dev_link(struct net_device *dev, struct net_device *upper_dev, - void *upper_priv) + void *upper_priv, void *upper_info) { - return __netdev_upper_dev_link(dev, upper_dev, true, upper_priv); + return __netdev_upper_dev_link(dev, upper_dev, true, + upper_priv, upper_info); } EXPORT_SYMBOL(netdev_master_upper_dev_link); diff --git a/net/openvswitch/vport-netdev.c b/net/openvswitch/vport-netdev.c index 3ee3df1edeae..8f4dd4c39bfe 100644 --- a/net/openvswitch/vport-netdev.c +++ b/net/openvswitch/vport-netdev.c @@ -105,7 +105,7 @@ struct vport *ovs_netdev_link(struct vport *vport, const char *name) rtnl_lock(); err = netdev_master_upper_dev_link(vport->dev, - get_dpdev(vport->dp), NULL); + get_dpdev(vport->dp), NULL, NULL); if (err) goto error_unlock; -- cgit v1.2.3-71-gd317 From 764f5e544118508add420724789f46e04dba91eb Mon Sep 17 00:00:00 2001 From: Jiri Pirko Date: Thu, 3 Dec 2015 12:12:12 +0100 Subject: net: add info struct for LAG changeupper This struct will be shared by bonding and team to pass internal information to notifier listeners. Signed-off-by: Jiri Pirko Signed-off-by: David S. Miller --- include/linux/netdevice.h | 13 +++++++++++++ 1 file changed, 13 insertions(+) (limited to 'include/linux') diff --git a/include/linux/netdevice.h b/include/linux/netdevice.h index aea556c64f2c..3ab90ea0ed03 100644 --- a/include/linux/netdevice.h +++ b/include/linux/netdevice.h @@ -2110,6 +2110,19 @@ struct pcpu_sw_netstats { #define netdev_alloc_pcpu_stats(type) \ __netdev_alloc_pcpu_stats(type, GFP_KERNEL); +enum netdev_lag_tx_type { + NETDEV_LAG_TX_TYPE_UNKNOWN, + NETDEV_LAG_TX_TYPE_RANDOM, + NETDEV_LAG_TX_TYPE_BROADCAST, + NETDEV_LAG_TX_TYPE_ROUNDROBIN, + NETDEV_LAG_TX_TYPE_ACTIVEBACKUP, + NETDEV_LAG_TX_TYPE_HASH, +}; + +struct netdev_lag_upper_info { + enum netdev_lag_tx_type tx_type; +}; + #include /* netdevice notifier chain. 
Please remember to update the rtnetlink -- cgit v1.2.3-71-gd317 From 8fd728566a354f7bc9cb6e781f185b8c39cf505b Mon Sep 17 00:00:00 2001 From: Jiri Pirko Date: Thu, 3 Dec 2015 12:12:13 +0100 Subject: team: fill-up LAG changeupper info struct and pass it along Initialize netdev_lag_upper_info structure by TX type according to current team mode and pass it along via netdev_master_upper_dev_link. Signed-off-by: Jiri Pirko Signed-off-by: David S. Miller --- drivers/net/team/team.c | 23 ++++++++++++----------- drivers/net/team/team_mode_activebackup.c | 1 + drivers/net/team/team_mode_broadcast.c | 1 + drivers/net/team/team_mode_loadbalance.c | 1 + drivers/net/team/team_mode_random.c | 1 + drivers/net/team/team_mode_roundrobin.c | 1 + include/linux/if_team.h | 1 + 7 files changed, 18 insertions(+), 11 deletions(-) (limited to 'include/linux') diff --git a/drivers/net/team/team.c b/drivers/net/team/team.c index f7b6ff7948b8..dd1504bbb4a7 100644 --- a/drivers/net/team/team.c +++ b/drivers/net/team/team.c @@ -1078,23 +1078,24 @@ static void team_port_disable_netpoll(struct team_port *port) } #endif -static int team_upper_dev_link(struct net_device *dev, - struct net_device *port_dev) +static int team_upper_dev_link(struct team *team, struct team_port *port) { + struct netdev_lag_upper_info lag_upper_info; int err; - err = netdev_master_upper_dev_link(port_dev, dev, NULL, NULL); + lag_upper_info.tx_type = team->mode->lag_tx_type; + err = netdev_master_upper_dev_link(port->dev, team->dev, NULL, + &lag_upper_info); if (err) return err; - port_dev->priv_flags |= IFF_TEAM_PORT; + port->dev->priv_flags |= IFF_TEAM_PORT; return 0; } -static void team_upper_dev_unlink(struct net_device *dev, - struct net_device *port_dev) +static void team_upper_dev_unlink(struct team *team, struct team_port *port) { - netdev_upper_dev_unlink(port_dev, dev); - port_dev->priv_flags &= ~IFF_TEAM_PORT; + netdev_upper_dev_unlink(port->dev, team->dev); + port->dev->priv_flags &= ~IFF_TEAM_PORT; } static void __team_port_change_port_added(struct team_port *port, bool linkup); @@ -1194,7 +1195,7 @@ static int team_port_add(struct team *team, struct net_device *port_dev) goto err_handler_register; } - err = team_upper_dev_link(dev, port_dev); + err = team_upper_dev_link(team, port); if (err) { netdev_err(dev, "Device %s failed to set upper link\n", portname); @@ -1220,7 +1221,7 @@ static int team_port_add(struct team *team, struct net_device *port_dev) return 0; err_option_port_add: - team_upper_dev_unlink(dev, port_dev); + team_upper_dev_unlink(team, port); err_set_upper_link: netdev_rx_handler_unregister(port_dev); @@ -1264,7 +1265,7 @@ static int team_port_del(struct team *team, struct net_device *port_dev) team_port_disable(team, port); list_del_rcu(&port->list); - team_upper_dev_unlink(dev, port_dev); + team_upper_dev_unlink(team, port); netdev_rx_handler_unregister(port_dev); team_port_disable_netpoll(port); vlan_vids_del_by_dev(port_dev, dev); diff --git a/drivers/net/team/team_mode_activebackup.c b/drivers/net/team/team_mode_activebackup.c index 40fd3381b693..3f189823ba3b 100644 --- a/drivers/net/team/team_mode_activebackup.c +++ b/drivers/net/team/team_mode_activebackup.c @@ -127,6 +127,7 @@ static const struct team_mode ab_mode = { .owner = THIS_MODULE, .priv_size = sizeof(struct ab_priv), .ops = &ab_mode_ops, + .lag_tx_type = NETDEV_LAG_TX_TYPE_ACTIVEBACKUP, }; static int __init ab_init_module(void) diff --git a/drivers/net/team/team_mode_broadcast.c b/drivers/net/team/team_mode_broadcast.c index 
c366cd299c06..302ff35b0cbc 100644 --- a/drivers/net/team/team_mode_broadcast.c +++ b/drivers/net/team/team_mode_broadcast.c @@ -56,6 +56,7 @@ static const struct team_mode bc_mode = { .kind = "broadcast", .owner = THIS_MODULE, .ops = &bc_mode_ops, + .lag_tx_type = NETDEV_LAG_TX_TYPE_BROADCAST, }; static int __init bc_init_module(void) diff --git a/drivers/net/team/team_mode_loadbalance.c b/drivers/net/team/team_mode_loadbalance.c index a1536d0d83a9..cdb19b385d42 100644 --- a/drivers/net/team/team_mode_loadbalance.c +++ b/drivers/net/team/team_mode_loadbalance.c @@ -661,6 +661,7 @@ static const struct team_mode lb_mode = { .priv_size = sizeof(struct lb_priv), .port_priv_size = sizeof(struct lb_port_priv), .ops = &lb_mode_ops, + .lag_tx_type = NETDEV_LAG_TX_TYPE_HASH, }; static int __init lb_init_module(void) diff --git a/drivers/net/team/team_mode_random.c b/drivers/net/team/team_mode_random.c index cd2f692b8074..215f845782db 100644 --- a/drivers/net/team/team_mode_random.c +++ b/drivers/net/team/team_mode_random.c @@ -46,6 +46,7 @@ static const struct team_mode rnd_mode = { .kind = "random", .owner = THIS_MODULE, .ops = &rnd_mode_ops, + .lag_tx_type = NETDEV_LAG_TX_TYPE_RANDOM, }; static int __init rnd_init_module(void) diff --git a/drivers/net/team/team_mode_roundrobin.c b/drivers/net/team/team_mode_roundrobin.c index 53665850b59e..0aa234118c03 100644 --- a/drivers/net/team/team_mode_roundrobin.c +++ b/drivers/net/team/team_mode_roundrobin.c @@ -58,6 +58,7 @@ static const struct team_mode rr_mode = { .owner = THIS_MODULE, .priv_size = sizeof(struct rr_priv), .ops = &rr_mode_ops, + .lag_tx_type = NETDEV_LAG_TX_TYPE_ROUNDROBIN, }; static int __init rr_init_module(void) diff --git a/include/linux/if_team.h b/include/linux/if_team.h index a6aa970758a2..b84e49c3a738 100644 --- a/include/linux/if_team.h +++ b/include/linux/if_team.h @@ -164,6 +164,7 @@ struct team_mode { size_t priv_size; size_t port_priv_size; const struct team_mode_ops *ops; + enum netdev_lag_tx_type lag_tx_type; }; #define TEAM_PORT_HASHBITS 4 -- cgit v1.2.3-71-gd317 From 04d482660a07039fc4e9a42bb3517db236d98f96 Mon Sep 17 00:00:00 2001 From: Jiri Pirko Date: Thu, 3 Dec 2015 12:12:15 +0100 Subject: net: introduce change lower state notifier When a lower device like a bonding slave or a team/bridge port changes its state, it is useful for others to notice this change. Currently this is implemented specifically for bonding as the NETDEV_BONDING_INFO notifier. This patch aims to replace this specific usage and make it generic, to be used for all upper-lower devices. Introduce the NETDEV_CHANGELOWERSTATE netdev notifier type and the netdev_lower_state_changed() helper. Signed-off-by: Jiri Pirko Signed-off-by: David S.
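A minimal sketch of a listener for the new event (hypothetical callback; lower_state_info is opaque and its layout is defined by the upper device, e.g. the LAG struct added in the following patch):

#include <linux/netdevice.h>

static int my_lower_event(struct notifier_block *nb,
			  unsigned long event, void *ptr)
{
	struct net_device *lower_dev = netdev_notifier_info_to_dev(ptr);
	struct netdev_notifier_changelowerstate_info *info = ptr;

	if (event != NETDEV_CHANGELOWERSTATE)
		return NOTIFY_DONE;

	/* only touch lower_state_info if you know who the upper is */
	if (info->lower_state_info)
		netdev_dbg(lower_dev, "lower state changed\n");

	return NOTIFY_DONE;
}

Signed-off-by: David S.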
Miller --- include/linux/netdevice.h | 8 ++++++++ net/core/dev.c | 20 ++++++++++++++++++++ 2 files changed, 28 insertions(+) (limited to 'include/linux') diff --git a/include/linux/netdevice.h b/include/linux/netdevice.h index 3ab90ea0ed03..ad69f237aa78 100644 --- a/include/linux/netdevice.h +++ b/include/linux/netdevice.h @@ -2158,6 +2158,7 @@ struct netdev_lag_upper_info { #define NETDEV_CHANGEINFODATA 0x0018 #define NETDEV_BONDING_INFO 0x0019 #define NETDEV_PRECHANGEUPPER 0x001A +#define NETDEV_CHANGELOWERSTATE 0x001B int register_netdevice_notifier(struct notifier_block *nb); int unregister_netdevice_notifier(struct notifier_block *nb); @@ -2179,6 +2180,11 @@ struct netdev_notifier_changeupper_info { void *upper_info; /* upper dev info */ }; +struct netdev_notifier_changelowerstate_info { + struct netdev_notifier_info info; /* must be first */ + void *lower_state_info; /* is lower dev state */ +}; + static inline void netdev_notifier_info_init(struct netdev_notifier_info *info, struct net_device *dev) { @@ -3640,6 +3646,8 @@ void netdev_upper_dev_unlink(struct net_device *dev, void netdev_adjacent_rename_links(struct net_device *dev, char *oldname); void *netdev_lower_dev_get_private(struct net_device *dev, struct net_device *lower_dev); +void netdev_lower_state_changed(struct net_device *lower_dev, + void *lower_state_info); /* RSS keys are 40 or 52 bytes long */ #define NETDEV_RSS_KEY_LEN 52 diff --git a/net/core/dev.c b/net/core/dev.c index 8ed886663c6d..d1706e88fbeb 100644 --- a/net/core/dev.c +++ b/net/core/dev.c @@ -5756,6 +5756,26 @@ int dev_get_nest_level(struct net_device *dev, } EXPORT_SYMBOL(dev_get_nest_level); +/** + * netdev_lower_change - Dispatch event about lower device state change + * @lower_dev: device + * @lower_state_info: state to dispatch + * + * Send NETDEV_CHANGELOWERSTATE to netdev notifiers with info. + * The caller must hold the RTNL lock. + */ +void netdev_lower_state_changed(struct net_device *lower_dev, + void *lower_state_info) +{ + struct netdev_notifier_changelowerstate_info changelowerstate_info; + + ASSERT_RTNL(); + changelowerstate_info.lower_state_info = lower_state_info; + call_netdevice_notifiers_info(NETDEV_CHANGELOWERSTATE, lower_dev, + &changelowerstate_info.info); +} +EXPORT_SYMBOL(netdev_lower_state_changed); + static void dev_change_rx_flags(struct net_device *dev, int flags) { const struct net_device_ops *ops = dev->netdev_ops; -- cgit v1.2.3-71-gd317 From fb1b2e3ce53aef80b3cef71f3885d584cdbdc6b8 Mon Sep 17 00:00:00 2001 From: Jiri Pirko Date: Thu, 3 Dec 2015 12:12:16 +0100 Subject: net: introduce lower state changed info structure for LAG lowers This is shared info structure for bonding and team. Serves to pass down info about link state and port activity to notification listeners. Signed-off-by: Jiri Pirko Signed-off-by: David S. Miller --- include/linux/netdevice.h | 5 +++++ 1 file changed, 5 insertions(+) (limited to 'include/linux') diff --git a/include/linux/netdevice.h b/include/linux/netdevice.h index ad69f237aa78..fa84b59eb197 100644 --- a/include/linux/netdevice.h +++ b/include/linux/netdevice.h @@ -2123,6 +2123,11 @@ struct netdev_lag_upper_info { enum netdev_lag_tx_type tx_type; }; +struct netdev_lag_lower_state_info { + u8 link_up : 1, + tx_enabled : 1; +}; + #include /* netdevice notifier chain. 
Please remember to update the rtnetlink -- cgit v1.2.3-71-gd317 From fc50db98ff872372f266695858f87a12eb1b4f05 Mon Sep 17 00:00:00 2001 From: Eli Cohen Date: Tue, 1 Dec 2015 18:03:09 +0200 Subject: net/mlx5_core: Add base sriov support This patch adds SR-IOV base support for mlx5 supported devices. The same driver is used for both PFs and VFs; VFs are identified by the driver through the flag MLX5_PCI_DEV_IS_VF added to the pci table entries. Virtual functions are created as usual by writing a value to the sriov_numvfs sysfs file of the PF device. Upon instantiating VFs, they will all be probed by the driver on the hypervisor. One can gracefully unbind them through /sys/bus/pci/drivers/mlx5_core/unbind. mlx5_wait_for_vf_pages() was added to ensure that when a VF dies without executing proper teardown, the hypervisor driver waits until all of the pages that were allocated at the hypervisor to maintain its operation are returned. In order for a VF to be operational, the PF needs to call enable_hca for it. This can be done before the VFs are created through a call to pci_enable_sriov. If there are VFs assigned to VMs when the PF driver is unloaded, all the VFs will experience a system error and the PF driver unloads cleanly; in this case pci_disable_sriov is not called and the devices will still show when running lspci. Once the PF driver is reloaded, it will sync its data structures which maintain state on its VFs. Signed-off-by: Eli Cohen Signed-off-by: Or Gerlitz Signed-off-by: David S. Miller --- drivers/net/ethernet/mellanox/mlx5/core/Makefile | 2 +- drivers/net/ethernet/mellanox/mlx5/core/main.c | 36 +++- .../net/ethernet/mellanox/mlx5/core/mlx5_core.h | 2 + .../net/ethernet/mellanox/mlx5/core/pagealloc.c | 38 ++++ drivers/net/ethernet/mellanox/mlx5/core/sriov.c | 221 +++++++++++++++++++++ include/linux/mlx5/driver.h | 24 +++ include/linux/mlx5/mlx5_ifc.h | 4 +- 7 files changed, 318 insertions(+), 9 deletions(-) create mode 100644 drivers/net/ethernet/mellanox/mlx5/core/sriov.c (limited to 'include/linux') diff --git a/drivers/net/ethernet/mellanox/mlx5/core/Makefile b/drivers/net/ethernet/mellanox/mlx5/core/Makefile index 26a68b8af2c5..4d5103911527 100644 --- a/drivers/net/ethernet/mellanox/mlx5/core/Makefile +++ b/drivers/net/ethernet/mellanox/mlx5/core/Makefile @@ -2,7 +2,7 @@ obj-$(CONFIG_MLX5_CORE) += mlx5_core.o mlx5_core-y := main.o cmd.o debugfs.o fw.o eq.o uar.o pagealloc.o \ health.o mcg.o cq.o srq.o alloc.o qp.o port.o mr.o pd.o \ - mad.o transobj.o vport.o + mad.o transobj.o vport.o sriov.o mlx5_core-$(CONFIG_MLX5_CORE_EN) += wq.o flow_table.o \ en_main.o en_flow_table.o en_ethtool.o en_tx.o en_rx.o \ en_txrx.o diff --git a/drivers/net/ethernet/mellanox/mlx5/core/main.c b/drivers/net/ethernet/mellanox/mlx5/core/main.c index f2e64dc1a443..66e2b37cfbbf 100644 --- a/drivers/net/ethernet/mellanox/mlx5/core/main.c +++ b/drivers/net/ethernet/mellanox/mlx5/core/main.c @@ -454,6 +454,9 @@ static int set_hca_ctrl(struct mlx5_core_dev *dev) struct mlx5_reg_host_endianess he_out; int err; + if (!mlx5_core_is_pf(dev)) + return 0; + memset(&he_in, 0, sizeof(he_in)); he_in.he = MLX5_SET_HOST_ENDIANNESS; err = mlx5_core_access_reg(dev, &he_in, sizeof(he_in), @@ -1049,6 +1052,12 @@ static int mlx5_load_one(struct mlx5_core_dev *dev, struct mlx5_priv *priv) mlx5_init_srq_table(dev); mlx5_init_mr_table(dev); + err = mlx5_sriov_init(dev); + if (err) { + dev_err(&pdev->dev, "sriov init failed %d\n", err); + goto err_sriov; + } + err = mlx5_register_device(dev); if (err) { dev_err(&pdev->dev,
"mlx5_register_device failed %d\n", err); @@ -1065,6 +1074,10 @@ out: return 0; +err_sriov: + if (mlx5_sriov_cleanup(dev)) + dev_err(&dev->pdev->dev, "sriov cleanup failed\n"); + err_reg_dev: mlx5_cleanup_mr_table(dev); mlx5_cleanup_srq_table(dev); @@ -1120,6 +1133,13 @@ static int mlx5_unload_one(struct mlx5_core_dev *dev, struct mlx5_priv *priv) { int err = 0; + err = mlx5_sriov_cleanup(dev); + if (err) { + dev_warn(&dev->pdev->dev, "%s: sriov cleanup failed - abort\n", + __func__); + return err; + } + mutex_lock(&dev->intf_state_mutex); if (dev->interface_state == MLX5_INTERFACE_STATE_DOWN) { dev_warn(&dev->pdev->dev, "%s: interface is down, NOP\n", @@ -1192,6 +1212,7 @@ static int init_one(struct pci_dev *pdev, return -ENOMEM; } priv = &dev->priv; + priv->pci_dev_data = id->driver_data; pci_set_drvdata(pdev, dev); @@ -1362,12 +1383,12 @@ static const struct pci_error_handlers mlx5_err_handler = { }; static const struct pci_device_id mlx5_core_pci_table[] = { - { PCI_VDEVICE(MELLANOX, 0x1011) }, /* Connect-IB */ - { PCI_VDEVICE(MELLANOX, 0x1012) }, /* Connect-IB VF */ - { PCI_VDEVICE(MELLANOX, 0x1013) }, /* ConnectX-4 */ - { PCI_VDEVICE(MELLANOX, 0x1014) }, /* ConnectX-4 VF */ - { PCI_VDEVICE(MELLANOX, 0x1015) }, /* ConnectX-4LX */ - { PCI_VDEVICE(MELLANOX, 0x1016) }, /* ConnectX-4LX VF */ + { PCI_VDEVICE(MELLANOX, 0x1011) }, /* Connect-IB */ + { PCI_VDEVICE(MELLANOX, 0x1012), MLX5_PCI_DEV_IS_VF}, /* Connect-IB VF */ + { PCI_VDEVICE(MELLANOX, 0x1013) }, /* ConnectX-4 */ + { PCI_VDEVICE(MELLANOX, 0x1014), MLX5_PCI_DEV_IS_VF}, /* ConnectX-4 VF */ + { PCI_VDEVICE(MELLANOX, 0x1015) }, /* ConnectX-4LX */ + { PCI_VDEVICE(MELLANOX, 0x1016), MLX5_PCI_DEV_IS_VF}, /* ConnectX-4LX VF */ { 0, } }; @@ -1378,7 +1399,8 @@ static struct pci_driver mlx5_core_driver = { .id_table = mlx5_core_pci_table, .probe = init_one, .remove = remove_one, - .err_handler = &mlx5_err_handler + .err_handler = &mlx5_err_handler, + .sriov_configure = mlx5_core_sriov_configure, }; static int __init init(void) diff --git a/drivers/net/ethernet/mellanox/mlx5/core/mlx5_core.h b/drivers/net/ethernet/mellanox/mlx5/core/mlx5_core.h index 1ed2239a6a6d..1649d5cf9e29 100644 --- a/drivers/net/ethernet/mellanox/mlx5/core/mlx5_core.h +++ b/drivers/net/ethernet/mellanox/mlx5/core/mlx5_core.h @@ -90,8 +90,10 @@ void mlx5_core_event(struct mlx5_core_dev *dev, enum mlx5_dev_event event, unsigned long param); void mlx5_enter_error_state(struct mlx5_core_dev *dev); void mlx5_disable_device(struct mlx5_core_dev *dev); +int mlx5_core_sriov_configure(struct pci_dev *dev, int num_vfs); int mlx5_core_enable_hca(struct mlx5_core_dev *dev, u16 func_id); int mlx5_core_disable_hca(struct mlx5_core_dev *dev, u16 func_id); +int mlx5_wait_for_vf_pages(struct mlx5_core_dev *dev); void mlx5e_init(void); void mlx5e_cleanup(void); diff --git a/drivers/net/ethernet/mellanox/mlx5/core/pagealloc.c b/drivers/net/ethernet/mellanox/mlx5/core/pagealloc.c index 4d3377b12657..9eeee0545f1c 100644 --- a/drivers/net/ethernet/mellanox/mlx5/core/pagealloc.c +++ b/drivers/net/ethernet/mellanox/mlx5/core/pagealloc.c @@ -33,6 +33,7 @@ #include #include #include +#include #include #include #include "mlx5_core.h" @@ -95,6 +96,7 @@ struct mlx5_manage_pages_outbox { enum { MAX_RECLAIM_TIME_MSECS = 5000, + MAX_RECLAIM_VFS_PAGES_TIME_MSECS = 2 * 1000 * 60, }; enum { @@ -352,6 +354,10 @@ retry: goto out_4k; } + dev->priv.fw_pages += npages; + if (func_id) + dev->priv.vfs_pages += npages; + mlx5_core_dbg(dev, "err %d\n", err); kvfree(in); @@ -405,6 +411,12 @@ static int 
reclaim_pages(struct mlx5_core_dev *dev, u32 func_id, int npages, } num_claimed = be32_to_cpu(out->num_entries); + if (num_claimed > npages) { + mlx5_core_warn(dev, "fw returned %d, driver asked %d => corruption\n", + num_claimed, npages); + err = -EINVAL; + goto out_free; + } if (nclaimed) *nclaimed = num_claimed; @@ -412,6 +424,9 @@ static int reclaim_pages(struct mlx5_core_dev *dev, u32 func_id, int npages, addr = be64_to_cpu(out->pas[i]); free_4k(dev, addr); } + dev->priv.fw_pages -= num_claimed; + if (func_id) + dev->priv.vfs_pages -= num_claimed; out_free: kvfree(out); @@ -548,3 +563,26 @@ void mlx5_pagealloc_stop(struct mlx5_core_dev *dev) { destroy_workqueue(dev->priv.pg_wq); } + +int mlx5_wait_for_vf_pages(struct mlx5_core_dev *dev) +{ + unsigned long end = jiffies + msecs_to_jiffies(MAX_RECLAIM_VFS_PAGES_TIME_MSECS); + int prev_vfs_pages = dev->priv.vfs_pages; + + mlx5_core_dbg(dev, "Waiting for %d pages from %s\n", prev_vfs_pages, + dev->priv.name); + while (dev->priv.vfs_pages) { + if (time_after(jiffies, end)) { + mlx5_core_warn(dev, "aborting while there are %d pending pages\n", dev->priv.vfs_pages); + return -ETIMEDOUT; + } + if (dev->priv.vfs_pages < prev_vfs_pages) { + end = jiffies + msecs_to_jiffies(MAX_RECLAIM_VFS_PAGES_TIME_MSECS); + prev_vfs_pages = dev->priv.vfs_pages; + } + msleep(50); + } + + mlx5_core_dbg(dev, "All pages received from %s\n", dev->priv.name); + return 0; +} diff --git a/drivers/net/ethernet/mellanox/mlx5/core/sriov.c b/drivers/net/ethernet/mellanox/mlx5/core/sriov.c new file mode 100644 index 000000000000..19a43240e359 --- /dev/null +++ b/drivers/net/ethernet/mellanox/mlx5/core/sriov.c @@ -0,0 +1,221 @@ +/* + * Copyright (c) 2014, Mellanox Technologies inc. All rights reserved. + * + * This software is available to you under a choice of one of two + * licenses. You may choose to be licensed under the terms of the GNU + * General Public License (GPL) Version 2, available from the file + * COPYING in the main directory of this source tree, or the + * OpenIB.org BSD license below: + * + * Redistribution and use in source and binary forms, with or + * without modification, are permitted provided that the following + * conditions are met: + * + * - Redistributions of source code must retain the above + * copyright notice, this list of conditions and the following + * disclaimer. + * + * - Redistributions in binary form must reproduce the above + * copyright notice, this list of conditions and the following + * disclaimer in the documentation and/or other materials + * provided with the distribution. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, + * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF + * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND + * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS + * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN + * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN + * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. 
+ */ + +#include +#include +#include "mlx5_core.h" + +static void enable_vfs(struct mlx5_core_dev *dev, int num_vfs) +{ + struct mlx5_core_sriov *sriov = &dev->priv.sriov; + int err; + int vf; + + for (vf = 1; vf <= num_vfs; vf++) { + err = mlx5_core_enable_hca(dev, vf); + if (err) { + mlx5_core_warn(dev, "failed to enable VF %d\n", vf - 1); + } else { + sriov->vfs_ctx[vf - 1].enabled = 1; + mlx5_core_dbg(dev, "successfully enabled VF %d\n", vf - 1); + } + } +} + +static void disable_vfs(struct mlx5_core_dev *dev, int num_vfs) +{ + struct mlx5_core_sriov *sriov = &dev->priv.sriov; + int vf; + + for (vf = 1; vf <= num_vfs; vf++) { + if (sriov->vfs_ctx[vf - 1].enabled) { + if (mlx5_core_disable_hca(dev, vf)) + mlx5_core_warn(dev, "failed to disable VF %d\n", vf - 1); + else + sriov->vfs_ctx[vf - 1].enabled = 0; + } + } +} + +static int mlx5_core_create_vfs(struct pci_dev *pdev, int num_vfs) +{ + struct mlx5_core_dev *dev = pci_get_drvdata(pdev); + int err; + + if (pci_num_vf(pdev)) + pci_disable_sriov(pdev); + + enable_vfs(dev, num_vfs); + + err = pci_enable_sriov(pdev, num_vfs); + if (err) { + dev_warn(&pdev->dev, "enable sriov failed %d\n", err); + goto ex; + } + + return 0; + +ex: + disable_vfs(dev, num_vfs); + return err; +} + +static int mlx5_core_sriov_enable(struct pci_dev *pdev, int num_vfs) +{ + struct mlx5_core_dev *dev = pci_get_drvdata(pdev); + struct mlx5_core_sriov *sriov = &dev->priv.sriov; + int err; + + kfree(sriov->vfs_ctx); + sriov->vfs_ctx = kcalloc(num_vfs, sizeof(*sriov->vfs_ctx), GFP_ATOMIC); + if (!sriov->vfs_ctx) + return -ENOMEM; + + sriov->enabled_vfs = num_vfs; + err = mlx5_core_create_vfs(pdev, num_vfs); + if (err) { + kfree(sriov->vfs_ctx); + sriov->vfs_ctx = NULL; + return err; + } + + return 0; +} + +static void mlx5_core_init_vfs(struct mlx5_core_dev *dev, int num_vfs) +{ + struct mlx5_core_sriov *sriov = &dev->priv.sriov; + + sriov->num_vfs = num_vfs; +} + +static void mlx5_core_cleanup_vfs(struct mlx5_core_dev *dev) +{ + struct mlx5_core_sriov *sriov; + + sriov = &dev->priv.sriov; + disable_vfs(dev, sriov->num_vfs); + + if (mlx5_wait_for_vf_pages(dev)) + mlx5_core_warn(dev, "timeout claiming VFs pages\n"); + + sriov->num_vfs = 0; +} + +int mlx5_core_sriov_configure(struct pci_dev *pdev, int num_vfs) +{ + struct mlx5_core_dev *dev = pci_get_drvdata(pdev); + struct mlx5_core_sriov *sriov = &dev->priv.sriov; + int err; + + mlx5_core_dbg(dev, "requsted num_vfs %d\n", num_vfs); + if (!mlx5_core_is_pf(dev)) + return -EPERM; + + mlx5_core_cleanup_vfs(dev); + + if (!num_vfs) { + kfree(sriov->vfs_ctx); + sriov->vfs_ctx = NULL; + if (!pci_vfs_assigned(pdev)) + pci_disable_sriov(pdev); + else + pr_info("unloading PF driver while leaving orphan VFs\n"); + + return 0; + } + + err = mlx5_core_sriov_enable(pdev, num_vfs); + if (err) { + dev_warn(&pdev->dev, "mlx5_core_sriov_enable failed %d\n", err); + return err; + } + + mlx5_core_init_vfs(dev, num_vfs); + + return num_vfs; +} + +static int sync_required(struct pci_dev *pdev) +{ + struct mlx5_core_dev *dev = pci_get_drvdata(pdev); + struct mlx5_core_sriov *sriov = &dev->priv.sriov; + int cur_vfs = pci_num_vf(pdev); + + if (cur_vfs != sriov->num_vfs) { + pr_info("current VFs %d, registered %d - sync needed\n", cur_vfs, sriov->num_vfs); + return 1; + } + + return 0; +} + +int mlx5_sriov_init(struct mlx5_core_dev *dev) +{ + struct mlx5_core_sriov *sriov = &dev->priv.sriov; + struct pci_dev *pdev = dev->pdev; + int cur_vfs; + + if (!mlx5_core_is_pf(dev)) + return 0; + + if (!sync_required(dev->pdev)) + return 0; + + cur_vfs 
= pci_num_vf(pdev); + sriov->vfs_ctx = kcalloc(cur_vfs, sizeof(*sriov->vfs_ctx), GFP_KERNEL); + if (!sriov->vfs_ctx) + return -ENOMEM; + + sriov->enabled_vfs = cur_vfs; + + mlx5_core_init_vfs(dev, cur_vfs); + + enable_vfs(dev, cur_vfs); + + return 0; +} + +int mlx5_sriov_cleanup(struct mlx5_core_dev *dev) +{ + struct pci_dev *pdev = dev->pdev; + int err; + + if (!mlx5_core_is_pf(dev)) + return 0; + + err = mlx5_core_sriov_configure(pdev, 0); + if (err) + return err; + + return 0; +} diff --git a/include/linux/mlx5/driver.h b/include/linux/mlx5/driver.h index 5c857f2a20d7..efebb87163c8 100644 --- a/include/linux/mlx5/driver.h +++ b/include/linux/mlx5/driver.h @@ -426,6 +426,16 @@ struct mlx5_mr_table { struct radix_tree_root tree; }; +struct mlx5_vf_context { + int enabled; +}; + +struct mlx5_core_sriov { + struct mlx5_vf_context *vfs_ctx; + int num_vfs; + int enabled_vfs; +}; + struct mlx5_irq_info { cpumask_var_t mask; char name[MLX5_MAX_IRQ_NAME]; @@ -447,6 +457,7 @@ struct mlx5_priv { int fw_pages; atomic_t reg_pages; struct list_head free_list; + int vfs_pages; struct mlx5_core_health health; @@ -485,6 +496,8 @@ struct mlx5_priv { struct list_head dev_list; struct list_head ctx_list; spinlock_t ctx_lock; + struct mlx5_core_sriov sriov; + unsigned long pci_dev_data; }; enum mlx5_device_state { @@ -739,6 +752,8 @@ void mlx5_pagealloc_init(struct mlx5_core_dev *dev); void mlx5_pagealloc_cleanup(struct mlx5_core_dev *dev); int mlx5_pagealloc_start(struct mlx5_core_dev *dev); void mlx5_pagealloc_stop(struct mlx5_core_dev *dev); +int mlx5_sriov_init(struct mlx5_core_dev *dev); +int mlx5_sriov_cleanup(struct mlx5_core_dev *dev); void mlx5_core_req_pages_handler(struct mlx5_core_dev *dev, u16 func_id, s32 npages); int mlx5_satisfy_startup_pages(struct mlx5_core_dev *dev, int boot); @@ -884,6 +899,15 @@ struct mlx5_profile { } mr_cache[MAX_MR_CACHE_ENTRIES]; }; +enum { + MLX5_PCI_DEV_IS_VF = 1 << 0, +}; + +static inline int mlx5_core_is_pf(struct mlx5_core_dev *dev) +{ + return !(dev->priv.pci_dev_data & MLX5_PCI_DEV_IS_VF); +} + static inline int mlx5_get_gid_table_len(u16 param) { if (param > 4) { diff --git a/include/linux/mlx5/mlx5_ifc.h b/include/linux/mlx5/mlx5_ifc.h index 1565324eb620..9b76fddd696b 100644 --- a/include/linux/mlx5/mlx5_ifc.h +++ b/include/linux/mlx5/mlx5_ifc.h @@ -665,7 +665,9 @@ struct mlx5_ifc_cmd_hca_cap_bits { u8 reserved_17[0x1]; u8 ets[0x1]; u8 nic_flow_table[0x1]; - u8 reserved_18[0x4]; + u8 reserved_18_0; + u8 early_vf_enable; + u8 reserved_18[0x2]; u8 local_ca_ack_delay[0x5]; u8 reserved_19[0x6]; u8 port_type[0x2]; -- cgit v1.2.3-71-gd317 From 54f0a411ec72cb437d57d0c9654dcbd0f198ff3a Mon Sep 17 00:00:00 2001 From: Saeed Mahameed Date: Tue, 1 Dec 2015 18:03:10 +0200 Subject: net/mlx5: Add HW capabilities and structs for SR-IOV E-Switch Update HCA capabilities and HW struct to include needed capabilities for upcoming Ethernet Switch (SR-IOV E-Switch). Signed-off-by: Saeed Mahameed Signed-off-by: Or Gerlitz Signed-off-by: David S. 
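Miller

For illustration, a brief sketch of how such capability bits are typically read, assuming the standard MLX5_CAP_GEN accessor over mlx5_ifc_cmd_hca_cap_bits (hypothetical helper, not part of the patch):

#include <linux/mlx5/driver.h>

static void my_log_eswitch_caps(struct mlx5_core_dev *dev)
{
	/* each name mirrors a field in mlx5_ifc_cmd_hca_cap_bits */
	if (MLX5_CAP_GEN(dev, eswitch_flow_table))
		pr_debug("e-switch flow table supported\n");

	pr_debug("max current UC list size: %d entries\n",
		 1 << MLX5_CAP_GEN(dev, log_max_current_uc_list));
}

Signed-off-by: David S.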
Miller --- include/linux/mlx5/mlx5_ifc.h | 26 +++++++++++++++----------- 1 file changed, 15 insertions(+), 11 deletions(-) (limited to 'include/linux') diff --git a/include/linux/mlx5/mlx5_ifc.h b/include/linux/mlx5/mlx5_ifc.h index 9b76fddd696b..836cf0e43174 100644 --- a/include/linux/mlx5/mlx5_ifc.h +++ b/include/linux/mlx5/mlx5_ifc.h @@ -665,7 +665,7 @@ struct mlx5_ifc_cmd_hca_cap_bits { u8 reserved_17[0x1]; u8 ets[0x1]; u8 nic_flow_table[0x1]; - u8 reserved_18_0; + u8 eswitch_flow_table[0x1]; u8 early_vf_enable; u8 reserved_18[0x2]; u8 local_ca_ack_delay[0x5]; @@ -789,22 +789,30 @@ struct mlx5_ifc_cmd_hca_cap_bits { u8 reserved_60[0x1b]; u8 log_max_wq_sz[0x5]; - u8 reserved_61[0xa0]; - + u8 nic_vport_change_event[0x1]; + u8 reserved_61[0xa]; + u8 log_max_vlan_list[0x5]; u8 reserved_62[0x3]; + u8 log_max_current_mc_list[0x5]; + u8 reserved_63[0x3]; + u8 log_max_current_uc_list[0x5]; + + u8 reserved_64[0x80]; + + u8 reserved_65[0x3]; u8 log_max_l2_table[0x5]; - u8 reserved_63[0x8]; + u8 reserved_66[0x8]; u8 log_uar_page_sz[0x10]; - u8 reserved_64[0x100]; + u8 reserved_67[0xe0]; - u8 reserved_65[0x1f]; + u8 reserved_68[0x1f]; u8 cqe_zip[0x1]; u8 cqe_zip_timeout[0x10]; u8 cqe_zip_max_num[0x10]; - u8 reserved_66[0x220]; + u8 reserved_69[0x220]; }; enum { @@ -2135,10 +2143,6 @@ struct mlx5_ifc_rmpc_bits { struct mlx5_ifc_wq_bits wq; }; -enum { - MLX5_NIC_VPORT_CONTEXT_ALLOWED_LIST_TYPE_CURRENT_UC_MAC_ADDRESS = 0x0, -}; - struct mlx5_ifc_nic_vport_context_bits { u8 reserved_0[0x1f]; u8 roce_en[0x1]; -- cgit v1.2.3-71-gd317 From e1d7d349c69d12721c420f1fe673ce9aa462aadd Mon Sep 17 00:00:00 2001 From: Saeed Mahameed Date: Tue, 1 Dec 2015 18:03:11 +0200 Subject: net/mlx5: Update access functions to Query/Modify vport MAC address In preparation for SR-IOV we add here an API to enable each e-switch client (PF/VF) to configure its L2 MAC addresses and for the e-switch manager (usually the PF) to access them in order to be able to configure them into the e-switch. Therefore we now pass a vport num parameter to mlx5_query_nic_vport_context, so the PF can access other vports' contexts. Preparation for Ethernet SR-IOV and L2 table management. Signed-off-by: Saeed Mahameed Signed-off-by: Or Gerlitz Signed-off-by: David S.
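For illustration, a hypothetical PF-side helper built on the updated API (vport 1 is used here to stand for the first VF; not part of the patch):

#include <linux/mlx5/vport.h>
#include <linux/etherdevice.h>

static int my_set_vf0_mac(struct mlx5_core_dev *mdev, u8 *new_mac)
{
	u8 cur_mac[ETH_ALEN];
	int err;

	/* PF reads the VF's current permanent address ... */
	err = mlx5_query_nic_vport_mac_address(mdev, 1, cur_mac);
	if (err)
		return err;

	if (ether_addr_equal(cur_mac, new_mac))
		return 0;	/* already programmed */

	/* ... and programs the new one into the VF's vport context */
	return mlx5_modify_nic_vport_mac_address(mdev, 1, new_mac);
}

Signed-off-by: David S.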
Miller --- drivers/net/ethernet/mellanox/mlx5/core/en_main.c | 2 +- drivers/net/ethernet/mellanox/mlx5/core/vport.c | 87 ++++++++++++++++++++--- include/linux/mlx5/vport.h | 5 +- 3 files changed, 81 insertions(+), 13 deletions(-) (limited to 'include/linux') diff --git a/drivers/net/ethernet/mellanox/mlx5/core/en_main.c b/drivers/net/ethernet/mellanox/mlx5/core/en_main.c index f6a8cc787603..2ef717f98b0d 100644 --- a/drivers/net/ethernet/mellanox/mlx5/core/en_main.c +++ b/drivers/net/ethernet/mellanox/mlx5/core/en_main.c @@ -2028,7 +2028,7 @@ static void mlx5e_set_netdev_dev_addr(struct net_device *netdev) { struct mlx5e_priv *priv = netdev_priv(netdev); - mlx5_query_nic_vport_mac_address(priv->mdev, netdev->dev_addr); + mlx5_query_nic_vport_mac_address(priv->mdev, 0, netdev->dev_addr); } static void mlx5e_build_netdev(struct net_device *netdev) diff --git a/drivers/net/ethernet/mellanox/mlx5/core/vport.c b/drivers/net/ethernet/mellanox/mlx5/core/vport.c index b94177ebcf3a..442916e98724 100644 --- a/drivers/net/ethernet/mellanox/mlx5/core/vport.c +++ b/drivers/net/ethernet/mellanox/mlx5/core/vport.c @@ -57,33 +57,98 @@ u8 mlx5_query_vport_state(struct mlx5_core_dev *mdev, u8 opmod) } EXPORT_SYMBOL(mlx5_query_vport_state); -void mlx5_query_nic_vport_mac_address(struct mlx5_core_dev *mdev, u8 *addr) +static int mlx5_query_nic_vport_context(struct mlx5_core_dev *mdev, u16 vport, + u32 *out, int outlen) +{ + u32 in[MLX5_ST_SZ_DW(query_nic_vport_context_in)]; + + memset(in, 0, sizeof(in)); + + MLX5_SET(query_nic_vport_context_in, in, opcode, + MLX5_CMD_OP_QUERY_NIC_VPORT_CONTEXT); + + MLX5_SET(query_nic_vport_context_in, in, vport_number, vport); + if (vport) + MLX5_SET(query_nic_vport_context_in, in, other_vport, 1); + + return mlx5_cmd_exec_check_status(mdev, in, sizeof(in), out, outlen); +} + +static int mlx5_modify_nic_vport_context(struct mlx5_core_dev *mdev, void *in, + int inlen) +{ + u32 out[MLX5_ST_SZ_DW(modify_nic_vport_context_out)]; + + MLX5_SET(modify_nic_vport_context_in, in, opcode, + MLX5_CMD_OP_MODIFY_NIC_VPORT_CONTEXT); + + memset(out, 0, sizeof(out)); + return mlx5_cmd_exec_check_status(mdev, in, inlen, out, sizeof(out)); +} + +int mlx5_query_nic_vport_mac_address(struct mlx5_core_dev *mdev, + u16 vport, u8 *addr) { - u32 in[MLX5_ST_SZ_DW(query_nic_vport_context_in)]; u32 *out; int outlen = MLX5_ST_SZ_BYTES(query_nic_vport_context_out); u8 *out_addr; + int err; out = mlx5_vzalloc(outlen); if (!out) - return; + return -ENOMEM; out_addr = MLX5_ADDR_OF(query_nic_vport_context_out, out, nic_vport_context.permanent_address); - memset(in, 0, sizeof(in)); - - MLX5_SET(query_nic_vport_context_in, in, opcode, - MLX5_CMD_OP_QUERY_NIC_VPORT_CONTEXT); - - memset(out, 0, outlen); - mlx5_cmd_exec_check_status(mdev, in, sizeof(in), out, outlen); + err = mlx5_query_nic_vport_context(mdev, vport, out, outlen); + if (err) + goto out; ether_addr_copy(addr, &out_addr[2]); +out: kvfree(out); + return err; +} +EXPORT_SYMBOL_GPL(mlx5_query_nic_vport_mac_address); + +int mlx5_modify_nic_vport_mac_address(struct mlx5_core_dev *mdev, + u16 vport, u8 *addr) +{ + void *in; + int inlen = MLX5_ST_SZ_BYTES(modify_nic_vport_context_in); + int err; + void *nic_vport_ctx; + u8 *perm_mac; + + in = mlx5_vzalloc(inlen); + if (!in) { + mlx5_core_warn(mdev, "failed to allocate inbox\n"); + return -ENOMEM; + } + + MLX5_SET(modify_nic_vport_context_in, in, + field_select.permanent_address, 1); + MLX5_SET(modify_nic_vport_context_in, in, vport_number, vport); + + if (vport) + MLX5_SET(modify_nic_vport_context_in, in, 
other_vport, 1); + + nic_vport_ctx = MLX5_ADDR_OF(modify_nic_vport_context_in, + in, nic_vport_context); + perm_mac = MLX5_ADDR_OF(nic_vport_context, nic_vport_ctx, + permanent_address); + + ether_addr_copy(&perm_mac[2], addr); + + err = mlx5_modify_nic_vport_context(mdev, in, inlen); + + kvfree(in); + + return err; } -EXPORT_SYMBOL(mlx5_query_nic_vport_mac_address); +EXPORT_SYMBOL(mlx5_modify_nic_vport_mac_address); int mlx5_query_hca_vport_gid(struct mlx5_core_dev *dev, u8 other_vport, u8 port_num, u16 vf_num, u16 gid_index, diff --git a/include/linux/mlx5/vport.h b/include/linux/mlx5/vport.h index 967e0fd06e89..43e82d9f5463 100644 --- a/include/linux/mlx5/vport.h +++ b/include/linux/mlx5/vport.h @@ -36,7 +36,10 @@ #include u8 mlx5_query_vport_state(struct mlx5_core_dev *mdev, u8 opmod); -void mlx5_query_nic_vport_mac_address(struct mlx5_core_dev *mdev, u8 *addr); +int mlx5_query_nic_vport_mac_address(struct mlx5_core_dev *mdev, + u16 vport, u8 *addr); +int mlx5_modify_nic_vport_mac_address(struct mlx5_core_dev *dev, + u16 vport, u8 *addr); int mlx5_query_hca_vport_gid(struct mlx5_core_dev *dev, u8 other_vport, u8 port_num, u16 vf_num, u16 gid_index, union ib_gid *gid); -- cgit v1.2.3-71-gd317 From e16aea2744abea612c27ee0eef606c6a6a8204de Mon Sep 17 00:00:00 2001 From: Saeed Mahameed Date: Tue, 1 Dec 2015 18:03:12 +0200 Subject: net/mlx5: Introduce access functions to modify/query vport mac lists These functions are needed to notify the upcoming L2 table and SR-IOV E-Switch (FDB) manager (PF) of NIC vport (VF) UC/MC MAC list changes. Preparation for Ethernet SR-IOV and L2 table management. Signed-off-by: Saeed Mahameed Signed-off-by: Or Gerlitz Signed-off-by: David S. Miller --- drivers/net/ethernet/mellanox/mlx5/core/vport.c | 119 ++++++++++++++++++++++++ include/linux/mlx5/device.h | 6 ++ include/linux/mlx5/vport.h | 10 ++ 3 files changed, 135 insertions(+) (limited to 'include/linux') diff --git a/drivers/net/ethernet/mellanox/mlx5/core/vport.c b/drivers/net/ethernet/mellanox/mlx5/core/vport.c index 442916e98724..986d0d364df7 100644 --- a/drivers/net/ethernet/mellanox/mlx5/core/vport.c +++ b/drivers/net/ethernet/mellanox/mlx5/core/vport.c @@ -150,6 +150,125 @@ int mlx5_modify_nic_vport_mac_address(struct mlx5_core_dev *mdev, } EXPORT_SYMBOL(mlx5_modify_nic_vport_mac_address); +int mlx5_query_nic_vport_mac_list(struct mlx5_core_dev *dev, + u32 vport, + enum mlx5_list_type list_type, + u8 addr_list[][ETH_ALEN], + int *list_size) +{ + u32 in[MLX5_ST_SZ_DW(query_nic_vport_context_in)]; + void *nic_vport_ctx; + int max_list_size; + int req_list_size; + int out_sz; + void *out; + int err; + int i; + + req_list_size = *list_size; + + max_list_size = list_type == MLX5_NVPRT_LIST_TYPE_UC ?
+ 1 << MLX5_CAP_GEN(dev, log_max_current_uc_list) : + 1 << MLX5_CAP_GEN(dev, log_max_current_mc_list); + + if (req_list_size > max_list_size) { + mlx5_core_warn(dev, "Requested list size (%d) > (%d) max_list_size\n", + req_list_size, max_list_size); + req_list_size = max_list_size; + } + + out_sz = MLX5_ST_SZ_BYTES(modify_nic_vport_context_in) + + req_list_size * MLX5_ST_SZ_BYTES(mac_address_layout); + + memset(in, 0, sizeof(in)); + out = kzalloc(out_sz, GFP_KERNEL); + if (!out) + return -ENOMEM; + + MLX5_SET(query_nic_vport_context_in, in, opcode, + MLX5_CMD_OP_QUERY_NIC_VPORT_CONTEXT); + MLX5_SET(query_nic_vport_context_in, in, allowed_list_type, list_type); + MLX5_SET(query_nic_vport_context_in, in, vport_number, vport); + + if (vport) + MLX5_SET(query_nic_vport_context_in, in, other_vport, 1); + + err = mlx5_cmd_exec_check_status(dev, in, sizeof(in), out, out_sz); + if (err) + goto out; + + nic_vport_ctx = MLX5_ADDR_OF(query_nic_vport_context_out, out, + nic_vport_context); + req_list_size = MLX5_GET(nic_vport_context, nic_vport_ctx, + allowed_list_size); + + *list_size = req_list_size; + for (i = 0; i < req_list_size; i++) { + u8 *mac_addr = MLX5_ADDR_OF(nic_vport_context, + nic_vport_ctx, + current_uc_mac_address[i]) + 2; + ether_addr_copy(addr_list[i], mac_addr); + } +out: + kfree(out); + return err; +} +EXPORT_SYMBOL_GPL(mlx5_query_nic_vport_mac_list); + +int mlx5_modify_nic_vport_mac_list(struct mlx5_core_dev *dev, + enum mlx5_list_type list_type, + u8 addr_list[][ETH_ALEN], + int list_size) +{ + u32 out[MLX5_ST_SZ_DW(modify_nic_vport_context_out)]; + void *nic_vport_ctx; + int max_list_size; + int in_sz; + void *in; + int err; + int i; + + max_list_size = list_type == MLX5_NVPRT_LIST_TYPE_UC ? + 1 << MLX5_CAP_GEN(dev, log_max_current_uc_list) : + 1 << MLX5_CAP_GEN(dev, log_max_current_mc_list); + + if (list_size > max_list_size) + return -ENOSPC; + + in_sz = MLX5_ST_SZ_BYTES(modify_nic_vport_context_in) + + list_size * MLX5_ST_SZ_BYTES(mac_address_layout); + + memset(out, 0, sizeof(out)); + in = kzalloc(in_sz, GFP_KERNEL); + if (!in) + return -ENOMEM; + + MLX5_SET(modify_nic_vport_context_in, in, opcode, + MLX5_CMD_OP_MODIFY_NIC_VPORT_CONTEXT); + MLX5_SET(modify_nic_vport_context_in, in, + field_select.addresses_list, 1); + + nic_vport_ctx = MLX5_ADDR_OF(modify_nic_vport_context_in, in, + nic_vport_context); + + MLX5_SET(nic_vport_context, nic_vport_ctx, + allowed_list_type, list_type); + MLX5_SET(nic_vport_context, nic_vport_ctx, + allowed_list_size, list_size); + + for (i = 0; i < list_size; i++) { + u8 *curr_mac = MLX5_ADDR_OF(nic_vport_context, + nic_vport_ctx, + current_uc_mac_address[i]) + 2; + ether_addr_copy(curr_mac, addr_list[i]); + } + + err = mlx5_cmd_exec_check_status(dev, in, in_sz, out, sizeof(out)); + kfree(in); + return err; +} +EXPORT_SYMBOL_GPL(mlx5_modify_nic_vport_mac_list); + int mlx5_query_hca_vport_gid(struct mlx5_core_dev *dev, u8 other_vport, u8 port_num, u16 vf_num, u16 gid_index, union ib_gid *gid) diff --git a/include/linux/mlx5/device.h b/include/linux/mlx5/device.h index 0b473cbfa7ef..0d2f0435a9f0 100644 --- a/include/linux/mlx5/device.h +++ b/include/linux/mlx5/device.h @@ -1102,6 +1102,12 @@ enum { MLX5_FLOW_CONTEXT_DEST_TYPE_TIR = 2, }; +enum mlx5_list_type { + MLX5_NVPRT_LIST_TYPE_UC = 0x0, + MLX5_NVPRT_LIST_TYPE_MC = 0x1, + MLX5_NVPRT_LIST_TYPE_VLAN = 0x2, +}; + enum { MLX5_RQC_RQ_TYPE_MEMORY_RQ_INLINE = 0x0, MLX5_RQC_RQ_TYPE_MEMORY_RQ_RPM = 0x1, diff --git a/include/linux/mlx5/vport.h b/include/linux/mlx5/vport.h index 
43e82d9f5463..00bbec8d9527 100644 --- a/include/linux/mlx5/vport.h +++ b/include/linux/mlx5/vport.h @@ -34,6 +34,7 @@ #define __MLX5_VPORT_H__ #include +#include u8 mlx5_query_vport_state(struct mlx5_core_dev *mdev, u8 opmod); int mlx5_query_nic_vport_mac_address(struct mlx5_core_dev *mdev, @@ -54,5 +55,14 @@ int mlx5_query_hca_vport_system_image_guid(struct mlx5_core_dev *dev, u64 *sys_image_guid); int mlx5_query_hca_vport_node_guid(struct mlx5_core_dev *dev, u64 *node_guid); +int mlx5_query_nic_vport_mac_list(struct mlx5_core_dev *dev, + u32 vport, + enum mlx5_list_type list_type, + u8 addr_list[][ETH_ALEN], + int *list_size); +int mlx5_modify_nic_vport_mac_list(struct mlx5_core_dev *dev, + enum mlx5_list_type list_type, + u8 addr_list[][ETH_ALEN], + int list_size); #endif /* __MLX5_VPORT_H__ */ -- cgit v1.2.3-71-gd317
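For orientation, a minimal caller sketch of the new MAC list API (illustrative only, not part of the patch; the esw_sync_uc_addrs() name and the 8-entry bound are assumptions). mlx5_query_nic_vport_mac_list() takes the buffer capacity in *list_size and returns the number of valid entries there; mlx5_modify_nic_vport_mac_list() programs the caller's own vport list:

    /* Hypothetical helper: read a vport's UC MAC list, then mirror it back. */
    static int esw_sync_uc_addrs(struct mlx5_core_dev *dev, u32 vport)
    {
            u8 addrs[8][ETH_ALEN];
            int size = 8; /* in: capacity, out: valid entries */
            int err;

            err = mlx5_query_nic_vport_mac_list(dev, vport,
                                                MLX5_NVPRT_LIST_TYPE_UC,
                                                addrs, &size);
            if (err)
                    return err;

            return mlx5_modify_nic_vport_mac_list(dev, MLX5_NVPRT_LIST_TYPE_UC,
                                                  addrs, size);
    }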
From e75465148b7df7f2796c75bf98bf33f171edeb2b Mon Sep 17 00:00:00 2001 From: Saeed Mahameed Date: Tue, 1 Dec 2015 18:03:13 +0200 Subject: net/mlx5: Introduce access functions to modify/query vport state In preparation for ethernet SR-IOV, add an API that enables each e-switch manager (PF) to configure the link state of its VFs in the e-switch. Signed-off-by: Saeed Mahameed Signed-off-by: Or Gerlitz Signed-off-by: David S. Miller --- drivers/net/ethernet/mellanox/mlx5/core/en_main.c | 2 +- drivers/net/ethernet/mellanox/mlx5/core/vport.c | 61 ++++++++++++++++++++--- include/linux/mlx5/mlx5_ifc.h | 1 + include/linux/mlx5/vport.h | 6 ++- 4 files changed, 62 insertions(+), 8 deletions(-) (limited to 'include/linux') diff --git a/drivers/net/ethernet/mellanox/mlx5/core/en_main.c b/drivers/net/ethernet/mellanox/mlx5/core/en_main.c index 2ef717f98b0d..007e464b3e58 100644 --- a/drivers/net/ethernet/mellanox/mlx5/core/en_main.c +++ b/drivers/net/ethernet/mellanox/mlx5/core/en_main.c @@ -63,7 +63,7 @@ static void mlx5e_update_carrier(struct mlx5e_priv *priv) u8 port_state; port_state = mlx5_query_vport_state(mdev, - MLX5_QUERY_VPORT_STATE_IN_OP_MOD_VNIC_VPORT); + MLX5_QUERY_VPORT_STATE_IN_OP_MOD_VNIC_VPORT, 0); if (port_state == VPORT_STATE_UP) netif_carrier_on(priv->netdev); diff --git a/drivers/net/ethernet/mellanox/mlx5/core/vport.c b/drivers/net/ethernet/mellanox/mlx5/core/vport.c index 986d0d364df7..b017a7e68b28 100644 --- a/drivers/net/ethernet/mellanox/mlx5/core/vport.c +++ b/drivers/net/ethernet/mellanox/mlx5/core/vport.c @@ -36,26 +36,75 @@ #include #include "mlx5_core.h" -u8 mlx5_query_vport_state(struct mlx5_core_dev *mdev, u8 opmod) +static int _mlx5_query_vport_state(struct mlx5_core_dev *mdev, u8 opmod, + u16 vport, u32 *out, int outlen) { - u32 in[MLX5_ST_SZ_DW(query_vport_state_in)]; - u32 out[MLX5_ST_SZ_DW(query_vport_state_out)]; int err; + u32 in[MLX5_ST_SZ_DW(query_vport_state_in)]; memset(in, 0, sizeof(in)); MLX5_SET(query_vport_state_in, in, opcode, MLX5_CMD_OP_QUERY_VPORT_STATE); MLX5_SET(query_vport_state_in, in, op_mod, opmod); + MLX5_SET(query_vport_state_in, in, vport_number, vport); + if (vport) + MLX5_SET(query_vport_state_in, in, other_vport, 1); - err = mlx5_cmd_exec_check_status(mdev, in, sizeof(in), out, - sizeof(out)); + err = mlx5_cmd_exec_check_status(mdev, in, sizeof(in), out, outlen); if (err) mlx5_core_warn(mdev, "MLX5_CMD_OP_QUERY_VPORT_STATE failed\n"); + return err; +} + +u8 mlx5_query_vport_state(struct mlx5_core_dev *mdev, u8 opmod, u16 vport) +{ + u32 out[MLX5_ST_SZ_DW(query_vport_state_out)] = {0}; + + _mlx5_query_vport_state(mdev, opmod, vport, out, sizeof(out)); + return MLX5_GET(query_vport_state_out, out, state); } -EXPORT_SYMBOL(mlx5_query_vport_state); +EXPORT_SYMBOL_GPL(mlx5_query_vport_state); + +u8 mlx5_query_vport_admin_state(struct mlx5_core_dev *mdev, u8 opmod, u16 vport) +{ + u32 out[MLX5_ST_SZ_DW(query_vport_state_out)] = {0}; + + _mlx5_query_vport_state(mdev, opmod, vport, out, sizeof(out)); + + return MLX5_GET(query_vport_state_out, out, admin_state); +} +EXPORT_SYMBOL(mlx5_query_vport_admin_state); + +int mlx5_modify_vport_admin_state(struct mlx5_core_dev *mdev, u8 opmod, + u16 vport, u8 state) +{ + u32 in[MLX5_ST_SZ_DW(modify_vport_state_in)]; + u32 out[MLX5_ST_SZ_DW(modify_vport_state_out)]; + int err; + + memset(in, 0, sizeof(in)); + + MLX5_SET(modify_vport_state_in, in, opcode, + MLX5_CMD_OP_MODIFY_VPORT_STATE); + MLX5_SET(modify_vport_state_in, in, op_mod, opmod); + MLX5_SET(modify_vport_state_in, in, vport_number, vport); + + if (vport) + MLX5_SET(modify_vport_state_in, in, other_vport, 1); + + MLX5_SET(modify_vport_state_in, in, admin_state, state); + + err = mlx5_cmd_exec_check_status(mdev, in, sizeof(in), out, + sizeof(out)); + if (err) + mlx5_core_warn(mdev, "MLX5_CMD_OP_MODIFY_VPORT_STATE failed\n"); + + return err; +} +EXPORT_SYMBOL(mlx5_modify_vport_admin_state); static int mlx5_query_nic_vport_context(struct mlx5_core_dev *mdev, u16 vport, u32 *out, int outlen) diff --git a/include/linux/mlx5/mlx5_ifc.h b/include/linux/mlx5/mlx5_ifc.h index 836cf0e43174..655184702ea2 100644 --- a/include/linux/mlx5/mlx5_ifc.h +++ b/include/linux/mlx5/mlx5_ifc.h @@ -2946,6 +2946,7 @@ struct mlx5_ifc_query_vport_state_out_bits { enum { MLX5_QUERY_VPORT_STATE_IN_OP_MOD_VNIC_VPORT = 0x0, + MLX5_QUERY_VPORT_STATE_IN_OP_MOD_ESW_VPORT = 0x1, }; struct mlx5_ifc_query_vport_state_in_bits { diff --git a/include/linux/mlx5/vport.h b/include/linux/mlx5/vport.h index 00bbec8d9527..c1bba5948851 100644 --- a/include/linux/mlx5/vport.h +++ b/include/linux/mlx5/vport.h @@ -36,7 +36,11 @@ #include #include -u8 mlx5_query_vport_state(struct mlx5_core_dev *mdev, u8 opmod); +u8 mlx5_query_vport_state(struct mlx5_core_dev *mdev, u8 opmod, u16 vport); +u8 mlx5_query_vport_admin_state(struct mlx5_core_dev *mdev, u8 opmod, + u16 vport); +int mlx5_modify_vport_admin_state(struct mlx5_core_dev *mdev, u8 opmod, + u16 vport, u8 state); int mlx5_query_nic_vport_mac_address(struct mlx5_core_dev *mdev, u16 vport, u8 *addr); int mlx5_modify_nic_vport_mac_address(struct mlx5_core_dev *dev, -- cgit v1.2.3-71-gd317
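A short usage sketch of the new state API (illustrative only; MLX5_ESW_VPORT_ADMIN_STATE_AUTO is only introduced later in this series, so its use here is an assumption):

    /* Hypothetical wrapper: set a VF's e-switch vport admin state to AUTO,
     * then read back its operational state.
     */
    static u8 esw_vport_refresh_state(struct mlx5_core_dev *mdev, u16 vport)
    {
            mlx5_modify_vport_admin_state(mdev,
                                          MLX5_QUERY_VPORT_STATE_IN_OP_MOD_ESW_VPORT,
                                          vport, MLX5_ESW_VPORT_ADMIN_STATE_AUTO);

            return mlx5_query_vport_state(mdev,
                                          MLX5_QUERY_VPORT_STATE_IN_OP_MOD_ESW_VPORT,
                                          vport);
    }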
From d82b73186dab70d6d332dd2afdb48608be2e5230 Mon Sep 17 00:00:00 2001 From: Saeed Mahameed Date: Tue, 1 Dec 2015 18:03:14 +0200 Subject: net/mlx5: Introduce access functions to modify/query vport promisc mode Those functions are needed to notify the upcoming SR-IOV E-Switch (FDB) manager (PF) of the NIC vport (VF) promisc mode changes. Preparation for ethernet SR-IOV and L2 table management. Signed-off-by: Saeed Mahameed Signed-off-by: Or Gerlitz Signed-off-by: David S. Miller --- drivers/net/ethernet/mellanox/mlx5/core/vport.c | 62 +++++++++++++++++++++++ include/linux/mlx5/mlx5_ifc.h | 28 +++++++++-- include/linux/mlx5/vport.h | 9 ++++ 3 files changed, 94 insertions(+), 5 deletions(-) (limited to 'include/linux') diff --git a/drivers/net/ethernet/mellanox/mlx5/core/vport.c b/drivers/net/ethernet/mellanox/mlx5/core/vport.c index b017a7e68b28..68aa51df29c1 100644 --- a/drivers/net/ethernet/mellanox/mlx5/core/vport.c +++ b/drivers/net/ethernet/mellanox/mlx5/core/vport.c @@ -576,3 +576,65 @@ int mlx5_query_hca_vport_node_guid(struct mlx5_core_dev *dev, return err; } EXPORT_SYMBOL_GPL(mlx5_query_hca_vport_node_guid); + +int mlx5_query_nic_vport_promisc(struct mlx5_core_dev *mdev, + u32 vport, + int *promisc_uc, + int *promisc_mc, + int *promisc_all) +{ + u32 *out; + int outlen = MLX5_ST_SZ_BYTES(query_nic_vport_context_out); + int err; + + out = kzalloc(outlen, GFP_KERNEL); + if (!out) + return -ENOMEM; + + err = mlx5_query_nic_vport_context(mdev, vport, out, outlen); + if (err) + goto out; + + *promisc_uc = MLX5_GET(query_nic_vport_context_out, out, + nic_vport_context.promisc_uc); + *promisc_mc = MLX5_GET(query_nic_vport_context_out, out, + nic_vport_context.promisc_mc); + *promisc_all = MLX5_GET(query_nic_vport_context_out, out, + nic_vport_context.promisc_all); + +out: + kfree(out); + return err; +} +EXPORT_SYMBOL_GPL(mlx5_query_nic_vport_promisc); + +int mlx5_modify_nic_vport_promisc(struct mlx5_core_dev *mdev, + int promisc_uc, + int promisc_mc, + int promisc_all) +{ + void *in; + int inlen = MLX5_ST_SZ_BYTES(modify_nic_vport_context_in); + int err; + + in = mlx5_vzalloc(inlen); + if (!in) { + mlx5_core_err(mdev, "failed to allocate inbox\n"); + return -ENOMEM; + } + + MLX5_SET(modify_nic_vport_context_in, in, field_select.promisc, 1); + MLX5_SET(modify_nic_vport_context_in, in, + nic_vport_context.promisc_uc, promisc_uc); + MLX5_SET(modify_nic_vport_context_in, in, + nic_vport_context.promisc_mc, promisc_mc); + MLX5_SET(modify_nic_vport_context_in, in, + nic_vport_context.promisc_all, promisc_all); + + err = mlx5_modify_nic_vport_context(mdev, in, inlen); + + kvfree(in); + + return err; +} +EXPORT_SYMBOL_GPL(mlx5_modify_nic_vport_promisc); diff --git a/include/linux/mlx5/mlx5_ifc.h b/include/linux/mlx5/mlx5_ifc.h index 655184702ea2..2728b5f6c017 100644 --- a/include/linux/mlx5/mlx5_ifc.h +++ b/include/linux/mlx5/mlx5_ifc.h @@ -2147,16 +2147,31 @@ struct mlx5_ifc_nic_vport_context_bits { u8 reserved_0[0x1f]; u8 roce_en[0x1]; - u8 reserved_1[0x760]; + u8 arm_change_event[0x1]; + u8 reserved_1[0x1a]; + u8 event_on_mtu[0x1]; + u8 event_on_promisc_change[0x1]; + u8 event_on_vlan_change[0x1]; + u8 event_on_mc_address_change[0x1]; + u8 event_on_uc_address_change[0x1]; - u8 reserved_2[0x5]; + u8 reserved_2[0xf0]; + + u8 mtu[0x10]; + + u8 reserved_3[0x640]; + + u8 promisc_uc[0x1]; + u8 promisc_mc[0x1]; + u8 promisc_all[0x1]; + u8 reserved_4[0x2]; u8 allowed_list_type[0x3]; - u8 reserved_3[0xc]; + u8 reserved_5[0xc]; u8 allowed_list_size[0xc]; struct mlx5_ifc_mac_address_layout_bits permanent_address; - u8 reserved_4[0x20]; + u8 reserved_6[0x20]; u8 current_uc_mac_address[0][0x40]; }; @@ -4235,7 +4250,10 @@ struct mlx5_ifc_modify_nic_vport_context_out_bits { }; struct mlx5_ifc_modify_nic_vport_field_select_bits { - u8 reserved_0[0x1c]; + u8 reserved_0[0x19]; + u8 mtu[0x1]; + u8 change_event[0x1]; + u8 promisc[0x1]; u8 permanent_address[0x1]; u8 addresses_list[0x1]; u8 roce_en[0x1]; diff --git a/include/linux/mlx5/vport.h b/include/linux/mlx5/vport.h index
c1bba5948851..dbbaed9f975a 100644 --- a/include/linux/mlx5/vport.h +++ b/include/linux/mlx5/vport.h @@ -68,5 +68,14 @@ int mlx5_modify_nic_vport_mac_list(struct mlx5_core_dev *dev, enum mlx5_list_type list_type, u8 addr_list[][ETH_ALEN], int list_size); +int mlx5_query_nic_vport_promisc(struct mlx5_core_dev *mdev, + u32 vport, + int *promisc_uc, + int *promisc_mc, + int *promisc_all); +int mlx5_modify_nic_vport_promisc(struct mlx5_core_dev *mdev, + int promisc_uc, + int promisc_mc, + int promisc_all); #endif /* __MLX5_VPORT_H__ */ -- cgit v1.2.3-71-gd317
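A usage sketch for the promisc accessors (illustrative only; note that the modify function always acts on the caller's own vport, while the query can target another vport):

    /* Hypothetical helper: enable promisc-all locally, then read vport 0's
     * resulting flags.
     */
    static int nic_vport_enable_promisc_all(struct mlx5_core_dev *mdev)
    {
            int uc, mc, all;
            int err;

            err = mlx5_modify_nic_vport_promisc(mdev, 0, 0, 1);
            if (err)
                    return err;

            return mlx5_query_nic_vport_promisc(mdev, 0, &uc, &mc, &all);
    }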
From c0046cf7b81ac55b8bf056c71918ec04edd99379 Mon Sep 17 00:00:00 2001 From: Saeed Mahameed Date: Tue, 1 Dec 2015 18:03:15 +0200 Subject: net/mlx5: Introduce access functions to modify/query vport vlans Those functions are needed to notify the upcoming L2 table and SR-IOV E-Switch (FDB) manager (PF) of the NIC vport (VF) vlan table changes. Preparation for ethernet SR-IOV and L2 table management. Signed-off-by: Saeed Mahameed Signed-off-by: Or Gerlitz Signed-off-by: David S. Miller --- drivers/net/ethernet/mellanox/mlx5/core/vport.c | 112 ++++++++++++++++++++++++ include/linux/mlx5/mlx5_ifc.h | 7 ++ include/linux/mlx5/vport.h | 7 ++ 3 files changed, 126 insertions(+) (limited to 'include/linux') diff --git a/drivers/net/ethernet/mellanox/mlx5/core/vport.c b/drivers/net/ethernet/mellanox/mlx5/core/vport.c index 68aa51df29c1..076197efea9b 100644 --- a/drivers/net/ethernet/mellanox/mlx5/core/vport.c +++ b/drivers/net/ethernet/mellanox/mlx5/core/vport.c @@ -318,6 +318,118 @@ int mlx5_modify_nic_vport_mac_list(struct mlx5_core_dev *dev, } EXPORT_SYMBOL_GPL(mlx5_modify_nic_vport_mac_list); +int mlx5_query_nic_vport_vlans(struct mlx5_core_dev *dev, + u32 vport, + u16 vlans[], + int *size) +{ + u32 in[MLX5_ST_SZ_DW(query_nic_vport_context_in)]; + void *nic_vport_ctx; + int req_list_size; + int max_list_size; + int out_sz; + void *out; + int err; + int i; + + req_list_size = *size; + max_list_size = 1 << MLX5_CAP_GEN(dev, log_max_vlan_list); + if (req_list_size > max_list_size) { + mlx5_core_warn(dev, "Requested list size (%d) > (%d) max list size\n", + req_list_size, max_list_size); + req_list_size = max_list_size; + } + + out_sz = MLX5_ST_SZ_BYTES(modify_nic_vport_context_in) + + req_list_size * MLX5_ST_SZ_BYTES(vlan_layout); + + memset(in, 0, sizeof(in)); + out = kzalloc(out_sz, GFP_KERNEL); + if (!out) + return -ENOMEM; + + MLX5_SET(query_nic_vport_context_in, in, opcode, + MLX5_CMD_OP_QUERY_NIC_VPORT_CONTEXT); + MLX5_SET(query_nic_vport_context_in, in, allowed_list_type, + MLX5_NVPRT_LIST_TYPE_VLAN); + MLX5_SET(query_nic_vport_context_in, in, vport_number, vport); + + if (vport) + MLX5_SET(query_nic_vport_context_in, in, other_vport, 1); + + err = mlx5_cmd_exec_check_status(dev, in, sizeof(in), out, out_sz); + if (err) + goto out; + + nic_vport_ctx = MLX5_ADDR_OF(query_nic_vport_context_out, out, + nic_vport_context); + req_list_size = MLX5_GET(nic_vport_context, nic_vport_ctx, + allowed_list_size); + + *size = req_list_size; + for (i = 0; i < req_list_size; i++) { + void *vlan_addr = MLX5_ADDR_OF(nic_vport_context, + nic_vport_ctx, + current_uc_mac_address[i]); + vlans[i] = MLX5_GET(vlan_layout, vlan_addr, vlan); + } +out: + kfree(out); + return err; +} +EXPORT_SYMBOL_GPL(mlx5_query_nic_vport_vlans); + +int mlx5_modify_nic_vport_vlans(struct mlx5_core_dev *dev, + u16 vlans[], + int list_size) +{ + u32 out[MLX5_ST_SZ_DW(modify_nic_vport_context_out)]; + void *nic_vport_ctx; + int max_list_size; + int in_sz; + void *in; + int err; + int i; + + max_list_size = 1 << MLX5_CAP_GEN(dev, log_max_vlan_list); + + if (list_size > max_list_size) + return -ENOSPC; + + in_sz = MLX5_ST_SZ_BYTES(modify_nic_vport_context_in) + + list_size * MLX5_ST_SZ_BYTES(vlan_layout); + + memset(out, 0, sizeof(out)); + in = kzalloc(in_sz, GFP_KERNEL); + if (!in) + return -ENOMEM; + + MLX5_SET(modify_nic_vport_context_in, in, opcode, + MLX5_CMD_OP_MODIFY_NIC_VPORT_CONTEXT); + MLX5_SET(modify_nic_vport_context_in, in, + field_select.addresses_list, 1); + + nic_vport_ctx = MLX5_ADDR_OF(modify_nic_vport_context_in, in, + nic_vport_context); + + MLX5_SET(nic_vport_context, nic_vport_ctx, + allowed_list_type, MLX5_NVPRT_LIST_TYPE_VLAN); + MLX5_SET(nic_vport_context, nic_vport_ctx, + allowed_list_size, list_size); + + for (i = 0; i < list_size; i++) { + void *vlan_addr = MLX5_ADDR_OF(nic_vport_context, + nic_vport_ctx, + current_uc_mac_address[i]); + MLX5_SET(vlan_layout, vlan_addr, vlan, vlans[i]); + } + + err = mlx5_cmd_exec_check_status(dev, in, in_sz, out, sizeof(out)); + kfree(in); + return err; +} +EXPORT_SYMBOL_GPL(mlx5_modify_nic_vport_vlans); + int mlx5_query_hca_vport_gid(struct mlx5_core_dev *dev, u8 other_vport, u8 port_num, u16 vf_num, u16 gid_index, union ib_gid *gid) diff --git a/include/linux/mlx5/mlx5_ifc.h b/include/linux/mlx5/mlx5_ifc.h index 2728b5f6c017..39487d0c305d 100644 --- a/include/linux/mlx5/mlx5_ifc.h +++ b/include/linux/mlx5/mlx5_ifc.h @@ -910,6 +910,13 @@ struct mlx5_ifc_mac_address_layout_bits { u8 mac_addr_31_0[0x20]; }; +struct mlx5_ifc_vlan_layout_bits { + u8 reserved_0[0x14]; + u8 vlan[0x0c]; + + u8 reserved_1[0x20]; +}; + struct mlx5_ifc_cong_control_r_roce_ecn_np_bits { u8 reserved_0[0xa0]; diff --git a/include/linux/mlx5/vport.h b/include/linux/mlx5/vport.h index dbbaed9f975a..638f2ca7a527 100644 --- a/include/linux/mlx5/vport.h +++ b/include/linux/mlx5/vport.h @@ -77,5 +77,12 @@ int mlx5_modify_nic_vport_promisc(struct mlx5_core_dev *mdev, int promisc_uc, int promisc_mc, int promisc_all); +int mlx5_query_nic_vport_vlans(struct mlx5_core_dev *dev, + u32 vport, + u16 vlans[], + int *size); +int mlx5_modify_nic_vport_vlans(struct mlx5_core_dev *dev, + u16 vlans[], + int list_size); #endif /* __MLX5_VPORT_H__ */ -- cgit v1.2.3-71-gd317
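A usage sketch for the VLAN accessors (illustrative only; the VLAN IDs are arbitrary examples). As with the MAC list API, *size carries the buffer capacity in and the entry count out:

    /* Hypothetical helper: allow two VLANs on the local vport, then read
     * back vport 0's current VLAN list.
     */
    static int nic_vport_allow_two_vlans(struct mlx5_core_dev *dev)
    {
            u16 vlans[2] = { 10, 20 };
            int size = 2;
            int err;

            err = mlx5_modify_nic_vport_vlans(dev, vlans, 2);
            if (err)
                    return err;

            return mlx5_query_nic_vport_vlans(dev, 0, vlans, &size);
    }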
From 073bb189a41d7bbad509b576a690611c46c4858f Mon Sep 17 00:00:00 2001 From: Saeed Mahameed Date: Tue, 1 Dec 2015 18:03:18 +0200 Subject: net/mlx5: Introducing E-Switch and l2 table E-Switch is the software entity that represents and manages ConnectX-4 inter-HCA ethernet L2 switching. E-Switch has its own virtual ports; each vport/vNIC/VF can be connected to the device through a vport of an e-switch. Each e-switch is managed by one vNIC identified by HCA_CAP.vport_group_manager (usually the PF, vport[0]), and its main responsibility is to forward each packet to the right vport. The e-switch needs to manage its own L2 table and FDB tables. The L2 table is a flow table managed by FW; it is needed for multi-host (multi-PF) configurations, for inter-HCA switching between PFs. The FDB table is a flow table totally managed by the e-switch driver; its main responsibility is to switch packets between the internal vports and the uplink vport that belong to the same e-switch. This patch introduces only e-switch L2 table management; FDB management will come later, when ethernet SR-IOV/VFs are enabled. Preparation for ethernet SR-IOV and L2 table management. Signed-off-by: Saeed Mahameed Signed-off-by: Or Gerlitz Signed-off-by: David S. Miller --- drivers/net/ethernet/mellanox/mlx5/core/Makefile | 2 +- drivers/net/ethernet/mellanox/mlx5/core/eq.c | 13 + drivers/net/ethernet/mellanox/mlx5/core/eswitch.c | 500 ++++++++++++++++++++++ drivers/net/ethernet/mellanox/mlx5/core/eswitch.h | 122 ++++++ drivers/net/ethernet/mellanox/mlx5/core/main.c | 18 + include/linux/mlx5/device.h | 8 + include/linux/mlx5/driver.h | 4 + 7 files changed, 666 insertions(+), 1 deletion(-) create mode 100644 drivers/net/ethernet/mellanox/mlx5/core/eswitch.c create mode 100644 drivers/net/ethernet/mellanox/mlx5/core/eswitch.h (limited to 'include/linux') diff --git a/drivers/net/ethernet/mellanox/mlx5/core/Makefile b/drivers/net/ethernet/mellanox/mlx5/core/Makefile index 4d5103911527..a0755919ccaf 100644 --- a/drivers/net/ethernet/mellanox/mlx5/core/Makefile +++ b/drivers/net/ethernet/mellanox/mlx5/core/Makefile @@ -3,6 +3,6 @@ obj-$(CONFIG_MLX5_CORE) += mlx5_core.o mlx5_core-y := main.o cmd.o debugfs.o fw.o eq.o uar.o pagealloc.o \ health.o mcg.o cq.o srq.o alloc.o qp.o port.o mr.o pd.o \ mad.o transobj.o vport.o sriov.o -mlx5_core-$(CONFIG_MLX5_CORE_EN) += wq.o flow_table.o \ +mlx5_core-$(CONFIG_MLX5_CORE_EN) += wq.o flow_table.o eswitch.o \ en_main.o en_flow_table.o en_ethtool.o en_tx.o en_rx.o \ en_txrx.o diff --git a/drivers/net/ethernet/mellanox/mlx5/core/eq.c b/drivers/net/ethernet/mellanox/mlx5/core/eq.c index 713ead583347..23c244a7e5d7 100644 --- a/drivers/net/ethernet/mellanox/mlx5/core/eq.c +++ b/drivers/net/ethernet/mellanox/mlx5/core/eq.c @@ -35,6 +35,9 @@ #include #include #include "mlx5_core.h" +#ifdef CONFIG_MLX5_CORE_EN +#include "eswitch.h" +#endif enum { MLX5_EQE_SIZE = sizeof(struct mlx5_eqe), @@ -287,6 +290,11 @@ static int mlx5_eq_int(struct mlx5_core_dev *dev, struct mlx5_eq *eq) break; #endif +#ifdef CONFIG_MLX5_CORE_EN + case MLX5_EVENT_TYPE_NIC_VPORT_CHANGE: + mlx5_eswitch_vport_event(dev->priv.eswitch, eqe); + break; +#endif default: mlx5_core_warn(dev, "Unhandled event 0x%x on EQ 0x%x\n", eqe->type, eq->eqn); @@ -459,6 +467,11 @@ int mlx5_start_eqs(struct mlx5_core_dev *dev) if (MLX5_CAP_GEN(dev, pg)) async_event_mask |= (1ull << MLX5_EVENT_TYPE_PAGE_FAULT); + if (MLX5_CAP_GEN(dev, port_type) == MLX5_CAP_PORT_TYPE_ETH && + MLX5_CAP_GEN(dev, vport_group_manager) && + mlx5_core_is_pf(dev)) + async_event_mask |= (1ull << MLX5_EVENT_TYPE_NIC_VPORT_CHANGE); + err = mlx5_create_map_eq(dev, &table->cmd_eq, MLX5_EQ_VEC_CMD, MLX5_NUM_CMD_EQE, 1ull << MLX5_EVENT_TYPE_CMD, "mlx5_cmd_eq", &dev->priv.uuari.uars[0]); diff --git a/drivers/net/ethernet/mellanox/mlx5/core/eswitch.c b/drivers/net/ethernet/mellanox/mlx5/core/eswitch.c new file mode 100644 index 000000000000..1f2f804bde3e --- /dev/null +++ b/drivers/net/ethernet/mellanox/mlx5/core/eswitch.c @@ -0,0 +1,500 @@ +/* + * Copyright (c) 2015, Mellanox Technologies. All rights reserved. + * + * This software is available to you under a choice of one of two + * licenses. You may choose to be licensed under the terms of the GNU + * General Public License (GPL) Version 2, available from the file + * COPYING in the main directory of this source tree, or the + * OpenIB.org BSD license below: + * + * Redistribution and use in source and binary forms, with or + * without modification, are permitted provided that the following + * conditions are met: + * + * - Redistributions of source code must retain the above + * copyright notice, this list of conditions and the following + * disclaimer.
+ * + * - Redistributions in binary form must reproduce the above + * copyright notice, this list of conditions and the following + * disclaimer in the documentation and/or other materials + * provided with the distribution. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, + * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF + * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND + * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS + * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN + * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN + * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. + */ + +#include +#include +#include +#include +#include "mlx5_core.h" +#include "eswitch.h" + +#define MLX5_DEBUG_ESWITCH_MASK BIT(3) + +#define esw_info(dev, format, ...) \ + pr_info("(%s): E-Switch: " format, (dev)->priv.name, ##__VA_ARGS__) + +#define esw_warn(dev, format, ...) \ + pr_warn("(%s): E-Switch: " format, (dev)->priv.name, ##__VA_ARGS__) + +#define esw_debug(dev, format, ...) \ + mlx5_core_dbg_mask(dev, MLX5_DEBUG_ESWITCH_MASK, format, ##__VA_ARGS__) + +enum { + MLX5_ACTION_NONE = 0, + MLX5_ACTION_ADD = 1, + MLX5_ACTION_DEL = 2, +}; + +/* HW UC L2 table hash node */ +struct mlx5_uc_l2addr { + struct l2addr_node node; + u8 action; + u32 table_index; + u32 vport; +}; + +/* Vport UC L2 table hash node */ +struct mlx5_vport_addr { + struct l2addr_node node; + u8 action; +}; + +enum { + UC_ADDR_CHANGE = BIT(0), + MC_ADDR_CHANGE = BIT(1), +}; + +static int arm_vport_context_events_cmd(struct mlx5_core_dev *dev, int vport, + u32 events_mask) +{ + int in[MLX5_ST_SZ_DW(modify_nic_vport_context_in)]; + int out[MLX5_ST_SZ_DW(modify_nic_vport_context_out)]; + void *nic_vport_ctx; + int err; + + memset(out, 0, sizeof(out)); + memset(in, 0, sizeof(in)); + + MLX5_SET(modify_nic_vport_context_in, in, + opcode, MLX5_CMD_OP_MODIFY_NIC_VPORT_CONTEXT); + MLX5_SET(modify_nic_vport_context_in, in, field_select.change_event, 1); + MLX5_SET(modify_nic_vport_context_in, in, vport_number, vport); + if (vport) + MLX5_SET(modify_nic_vport_context_in, in, other_vport, 1); + nic_vport_ctx = MLX5_ADDR_OF(modify_nic_vport_context_in, + in, nic_vport_context); + + MLX5_SET(nic_vport_context, nic_vport_ctx, arm_change_event, 1); + + if (events_mask & UC_ADDR_CHANGE) + MLX5_SET(nic_vport_context, nic_vport_ctx, + event_on_uc_address_change, 1); + if (events_mask & MC_ADDR_CHANGE) + MLX5_SET(nic_vport_context, nic_vport_ctx, + event_on_mc_address_change, 1); + + err = mlx5_cmd_exec(dev, in, sizeof(in), out, sizeof(out)); + if (err) + goto ex; + err = mlx5_cmd_status_to_err_v2(out); + if (err) + goto ex; + return 0; +ex: + return err; +} + +/* HW L2 Table (MPFS) management */ +static int set_l2_table_entry_cmd(struct mlx5_core_dev *dev, u32 index, + u8 *mac, u8 vlan_valid, u16 vlan) +{ + u32 in[MLX5_ST_SZ_DW(set_l2_table_entry_in)]; + u32 out[MLX5_ST_SZ_DW(set_l2_table_entry_out)]; + u8 *in_mac_addr; + + memset(in, 0, sizeof(in)); + memset(out, 0, sizeof(out)); + + MLX5_SET(set_l2_table_entry_in, in, opcode, + MLX5_CMD_OP_SET_L2_TABLE_ENTRY); + MLX5_SET(set_l2_table_entry_in, in, table_index, index); + MLX5_SET(set_l2_table_entry_in, in, vlan_valid, vlan_valid); + MLX5_SET(set_l2_table_entry_in, in, vlan, vlan); + + in_mac_addr = MLX5_ADDR_OF(set_l2_table_entry_in, in, mac_address); + ether_addr_copy(&in_mac_addr[2], mac); + + return mlx5_cmd_exec_check_status(dev, in, sizeof(in), + out, sizeof(out)); 
+} + +static int del_l2_table_entry_cmd(struct mlx5_core_dev *dev, u32 index) +{ + u32 in[MLX5_ST_SZ_DW(delete_l2_table_entry_in)]; + u32 out[MLX5_ST_SZ_DW(delete_l2_table_entry_out)]; + + memset(in, 0, sizeof(in)); + memset(out, 0, sizeof(out)); + + MLX5_SET(delete_l2_table_entry_in, in, opcode, + MLX5_CMD_OP_DELETE_L2_TABLE_ENTRY); + MLX5_SET(delete_l2_table_entry_in, in, table_index, index); + return mlx5_cmd_exec_check_status(dev, in, sizeof(in), + out, sizeof(out)); +} + +static int alloc_l2_table_index(struct mlx5_l2_table *l2_table, u32 *ix) +{ + int err = 0; + + *ix = find_first_zero_bit(l2_table->bitmap, l2_table->size); + if (*ix >= l2_table->size) + err = -ENOSPC; + else + __set_bit(*ix, l2_table->bitmap); + + return err; +} + +static void free_l2_table_index(struct mlx5_l2_table *l2_table, u32 ix) +{ + __clear_bit(ix, l2_table->bitmap); +} + +static int set_l2_table_entry(struct mlx5_core_dev *dev, u8 *mac, + u8 vlan_valid, u16 vlan, + u32 *index) +{ + struct mlx5_l2_table *l2_table = &dev->priv.eswitch->l2_table; + int err; + + err = alloc_l2_table_index(l2_table, index); + if (err) + return err; + + err = set_l2_table_entry_cmd(dev, *index, mac, vlan_valid, vlan); + if (err) + free_l2_table_index(l2_table, *index); + + return err; +} + +static void del_l2_table_entry(struct mlx5_core_dev *dev, u32 index) +{ + struct mlx5_l2_table *l2_table = &dev->priv.eswitch->l2_table; + + del_l2_table_entry_cmd(dev, index); + free_l2_table_index(l2_table, index); +} + +/* SW E-Switch L2 Table management */ +static int l2_table_addr_add(struct mlx5_eswitch *esw, + u8 mac[ETH_ALEN], u32 vport) +{ + struct hlist_head *hash; + struct mlx5_uc_l2addr *addr; + int err; + + hash = esw->l2_table.l2_hash; + addr = l2addr_hash_find(hash, mac, struct mlx5_uc_l2addr); + if (addr) { + esw_warn(esw->dev, + "Failed to set L2 mac(%pM) for vport(%d), mac is already in use by vport(%d)\n", + mac, vport, addr->vport); + return -EEXIST; + } + + addr = l2addr_hash_add(hash, mac, struct mlx5_uc_l2addr, + GFP_KERNEL); + if (!addr) + return -ENOMEM; + + addr->vport = vport; + addr->action = MLX5_ACTION_NONE; + err = set_l2_table_entry(esw->dev, mac, 0, 0, &addr->table_index); + if (err) + l2addr_hash_del(addr); + else + esw_debug(esw->dev, "\tADDED L2 MAC: vport[%d] %pM index:%d\n", + vport, addr->node.addr, addr->table_index); + return err; +} + +static int l2_table_addr_del(struct mlx5_eswitch *esw, + u8 mac[ETH_ALEN], u32 vport) +{ + struct hlist_head *hash; + struct mlx5_uc_l2addr *addr; + + hash = esw->l2_table.l2_hash; + addr = l2addr_hash_find(hash, mac, struct mlx5_uc_l2addr); + if (!addr || addr->vport != vport) { + esw_warn(esw->dev, "MAC(%pM) doesn't belong to vport (%d)\n", + mac, vport); + return -EINVAL; + } + + esw_debug(esw->dev, "\tDELETE L2 MAC: vport[%d] %pM index:%d\n", + vport, addr->node.addr, addr->table_index); + del_l2_table_entry(esw->dev, addr->table_index); + l2addr_hash_del(addr); + return 0; +} + +/* E-Switch vport uc list management */ + +/* Apply vport uc list to HW l2 table */ +static void esw_apply_vport_uc_list(struct mlx5_eswitch *esw, + u32 vport_num) +{ + struct mlx5_vport *vport = &esw->vports[vport_num]; + struct mlx5_vport_addr *addr; + struct l2addr_node *node; + struct hlist_head *hash; + struct hlist_node *tmp; + int hi; + + hash = vport->uc_list; + for_each_l2hash_node(node, tmp, hash, hi) { + addr = container_of(node, struct mlx5_vport_addr, node); + switch (addr->action) { + case MLX5_ACTION_ADD: + l2_table_addr_add(esw, addr->node.addr, vport_num); + addr->action = 
MLX5_ACTION_NONE; + break; + case MLX5_ACTION_DEL: + l2_table_addr_del(esw, addr->node.addr, vport_num); + l2addr_hash_del(addr); + break; + } + } +} + +/* Sync vport uc list from vport context */ +static void esw_update_vport_uc_list(struct mlx5_eswitch *esw, + u32 vport_num) +{ + struct mlx5_vport *vport = &esw->vports[vport_num]; + struct mlx5_vport_addr *addr; + struct l2addr_node *node; + u8 (*mac_list)[ETH_ALEN]; + struct hlist_head *hash; + struct hlist_node *tmp; + int size; + int err; + int hi; + int i; + + size = MLX5_MAX_UC_PER_VPORT(esw->dev); + + mac_list = kcalloc(size, ETH_ALEN, GFP_KERNEL); + if (!mac_list) + return; + + hash = vport->uc_list; + + for_each_l2hash_node(node, tmp, hash, hi) { + addr = container_of(node, struct mlx5_vport_addr, node); + addr->action = MLX5_ACTION_DEL; + } + + err = mlx5_query_nic_vport_mac_list(esw->dev, vport_num, + MLX5_NVPRT_LIST_TYPE_UC, + mac_list, &size); + if (err) + return; + esw_debug(esw->dev, "vport[%d] context update UC list size (%d)\n", + vport_num, size); + + for (i = 0; i < size; i++) { + if (!is_valid_ether_addr(mac_list[i])) + continue; + + addr = l2addr_hash_find(hash, mac_list[i], + struct mlx5_vport_addr); + if (addr) { + addr->action = MLX5_ACTION_NONE; + continue; + } + + addr = l2addr_hash_add(hash, mac_list[i], + struct mlx5_vport_addr, + GFP_KERNEL); + if (!addr) { + esw_warn(esw->dev, + "Failed to add MAC(%pM) to vport[%d] DB\n", + mac_list[i], vport_num); + continue; + } + addr->action = MLX5_ACTION_ADD; + } + kfree(mac_list); +} + +static void esw_vport_change_handler(struct work_struct *work) +{ + struct mlx5_vport *vport = + container_of(work, struct mlx5_vport, vport_change_handler); + struct mlx5_core_dev *dev = vport->dev; + u8 mac[ETH_ALEN]; + + mlx5_query_nic_vport_mac_address(dev, vport->vport, mac); + + if (!is_valid_ether_addr(mac)) + goto out; + + esw_update_vport_uc_list(dev->priv.eswitch, vport->vport); + esw_apply_vport_uc_list(dev->priv.eswitch, vport->vport); + +out: + if (vport->enabled) + arm_vport_context_events_cmd(dev, vport->vport, + UC_ADDR_CHANGE); +} + +static void esw_enable_vport(struct mlx5_eswitch *esw, int vport_num) +{ + struct mlx5_vport *vport = &esw->vports[vport_num]; + unsigned long flags; + + spin_lock_irqsave(&vport->lock, flags); + vport->enabled = true; + spin_unlock_irqrestore(&vport->lock, flags); + + arm_vport_context_events_cmd(esw->dev, vport_num, UC_ADDR_CHANGE); +} + +static void esw_disable_vport(struct mlx5_eswitch *esw, int vport_num) +{ + struct mlx5_vport *vport = &esw->vports[vport_num]; + unsigned long flags; + + if (!vport->enabled) + return; + + /* Mark this vport as disabled to discard new events */ + spin_lock_irqsave(&vport->lock, flags); + vport->enabled = false; + spin_unlock_irqrestore(&vport->lock, flags); + + /* Wait for current already scheduled events to complete */ + flush_workqueue(esw->work_queue); + + /* Disable events from this vport */ + arm_vport_context_events_cmd(esw->dev, vport->vport, 0); +} + +/* Public E-Switch API */ +int mlx5_eswitch_init(struct mlx5_core_dev *dev) +{ + int l2_table_size = 1 << MLX5_CAP_GEN(dev, log_max_l2_table); + int total_vports = 1 + pci_sriov_get_totalvfs(dev->pdev); + struct mlx5_eswitch *esw; + int vport_num; + int err; + + if (!MLX5_CAP_GEN(dev, vport_group_manager) || + MLX5_CAP_GEN(dev, port_type) != MLX5_CAP_PORT_TYPE_ETH) + return 0; + + esw_info(dev, + "Total vports %d, l2 table size(%d), per vport: max uc(%d) max mc(%d)\n", + total_vports, l2_table_size, + MLX5_MAX_UC_PER_VPORT(dev), + 
MLX5_MAX_MC_PER_VPORT(dev)); + + esw = kzalloc(sizeof(*esw), GFP_KERNEL); + if (!esw) + return -ENOMEM; + + esw->dev = dev; + + esw->l2_table.bitmap = kcalloc(BITS_TO_LONGS(l2_table_size), + sizeof(uintptr_t), GFP_KERNEL); + if (!esw->l2_table.bitmap) { + err = -ENOMEM; + goto abort; + } + esw->l2_table.size = l2_table_size; + + esw->work_queue = create_singlethread_workqueue("mlx5_esw_wq"); + if (!esw->work_queue) { + err = -ENOMEM; + goto abort; + } + + esw->vports = kcalloc(total_vports, sizeof(struct mlx5_vport), + GFP_KERNEL); + if (!esw->vports) { + err = -ENOMEM; + goto abort; + } + + esw->total_vports = total_vports; + for (vport_num = 0; vport_num < total_vports; vport_num++) { + struct mlx5_vport *vport = &esw->vports[vport_num]; + + vport->vport = vport_num; + vport->dev = dev; + INIT_WORK(&vport->vport_change_handler, + esw_vport_change_handler); + spin_lock_init(&vport->lock); + } + + dev->priv.eswitch = esw; + + esw_enable_vport(esw, 0); + /* VF Vports will be enabled when SRIOV is enabled */ + return 0; +abort: + if (esw->work_queue) + destroy_workqueue(esw->work_queue); + kfree(esw->l2_table.bitmap); + kfree(esw->vports); + kfree(esw); + return err; +} + +void mlx5_eswitch_cleanup(struct mlx5_eswitch *esw) +{ + if (!esw || !MLX5_CAP_GEN(esw->dev, vport_group_manager) || + MLX5_CAP_GEN(esw->dev, port_type) != MLX5_CAP_PORT_TYPE_ETH) + return; + + esw_info(esw->dev, "cleanup\n"); + esw_disable_vport(esw, 0); + + esw->dev->priv.eswitch = NULL; + destroy_workqueue(esw->work_queue); + kfree(esw->l2_table.bitmap); + kfree(esw->vports); + kfree(esw); +} + +void mlx5_eswitch_vport_event(struct mlx5_eswitch *esw, struct mlx5_eqe *eqe) +{ + struct mlx5_eqe_vport_change *vc_eqe = &eqe->data.vport_change; + u16 vport_num = be16_to_cpu(vc_eqe->vport_num); + struct mlx5_vport *vport; + + if (!esw) { + pr_warn("MLX5 E-Switch: vport %d got an event while eswitch is not initialized\n", + vport_num); + return; + } + + vport = &esw->vports[vport_num]; + spin_lock(&vport->lock); + if (vport->enabled) + queue_work(esw->work_queue, &vport->vport_change_handler); + spin_unlock(&vport->lock); +} diff --git a/drivers/net/ethernet/mellanox/mlx5/core/eswitch.h b/drivers/net/ethernet/mellanox/mlx5/core/eswitch.h new file mode 100644 index 000000000000..0c41f2657824 --- /dev/null +++ b/drivers/net/ethernet/mellanox/mlx5/core/eswitch.h @@ -0,0 +1,122 @@ +/* + * Copyright (c) 2015, Mellanox Technologies, Ltd. All rights reserved. + * + * This software is available to you under a choice of one of two + * licenses. You may choose to be licensed under the terms of the GNU + * General Public License (GPL) Version 2, available from the file + * COPYING in the main directory of this source tree, or the + * OpenIB.org BSD license below: + * + * Redistribution and use in source and binary forms, with or + * without modification, are permitted provided that the following + * conditions are met: + * + * - Redistributions of source code must retain the above + * copyright notice, this list of conditions and the following + * disclaimer. + * + * - Redistributions in binary form must reproduce the above + * copyright notice, this list of conditions and the following + * disclaimer in the documentation and/or other materials + * provided with the distribution. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, + * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF + * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND + * NONINFRINGEMENT. 
IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS + * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN + * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN + * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. + */ + +#ifndef __MLX5_ESWITCH_H__ +#define __MLX5_ESWITCH_H__ + +#include + +#define MLX5_MAX_UC_PER_VPORT(dev) \ + (1 << MLX5_CAP_GEN(dev, log_max_current_uc_list)) + +#define MLX5_MAX_MC_PER_VPORT(dev) \ + (1 << MLX5_CAP_GEN(dev, log_max_current_mc_list)) + +#define MLX5_L2_ADDR_HASH_SIZE (BIT(BITS_PER_BYTE)) +#define MLX5_L2_ADDR_HASH(addr) (addr[5]) + +/* L2 -mac address based- hash helpers */ +struct l2addr_node { + struct hlist_node hlist; + u8 addr[ETH_ALEN]; +}; + +#define for_each_l2hash_node(hn, tmp, hash, i) \ + for (i = 0; i < MLX5_L2_ADDR_HASH_SIZE; i++) \ + hlist_for_each_entry_safe(hn, tmp, &hash[i], hlist) + +#define l2addr_hash_find(hash, mac, type) ({ \ + int ix = MLX5_L2_ADDR_HASH(mac); \ + bool found = false; \ + type *ptr = NULL; \ + \ + hlist_for_each_entry(ptr, &hash[ix], node.hlist) \ + if (ether_addr_equal(ptr->node.addr, mac)) {\ + found = true; \ + break; \ + } \ + if (!found) \ + ptr = NULL; \ + ptr; \ +}) + +#define l2addr_hash_add(hash, mac, type, gfp) ({ \ + int ix = MLX5_L2_ADDR_HASH(mac); \ + type *ptr = NULL; \ + \ + ptr = kzalloc(sizeof(type), gfp); \ + if (ptr) { \ + ether_addr_copy(ptr->node.addr, mac); \ + hlist_add_head(&ptr->node.hlist, &hash[ix]);\ + } \ + ptr; \ +}) + +#define l2addr_hash_del(ptr) ({ \ + hlist_del(&ptr->node.hlist); \ + kfree(ptr); \ +}) + +struct mlx5_vport { + struct mlx5_core_dev *dev; + int vport; + struct hlist_head uc_list[MLX5_L2_ADDR_HASH_SIZE]; + struct work_struct vport_change_handler; + + /* This spinlock protects access to vport data, between + * "esw_vport_disable" and ongoing interrupt "mlx5_eswitch_vport_event" + * once vport marked as disabled new interrupts are discarded. 
+ */ + spinlock_t lock; /* vport events sync */ + bool enabled; +}; + +struct mlx5_l2_table { + struct hlist_head l2_hash[MLX5_L2_ADDR_HASH_SIZE]; + u32 size; + unsigned long *bitmap; +}; + +struct mlx5_eswitch { + struct mlx5_core_dev *dev; + struct mlx5_l2_table l2_table; + struct workqueue_struct *work_queue; + struct mlx5_vport *vports; + int total_vports; +}; + +/* E-Switch API */ +int mlx5_eswitch_init(struct mlx5_core_dev *dev); +void mlx5_eswitch_cleanup(struct mlx5_eswitch *esw); +void mlx5_eswitch_vport_event(struct mlx5_eswitch *esw, struct mlx5_eqe *eqe); + +#endif /* __MLX5_ESWITCH_H__ */ diff --git a/drivers/net/ethernet/mellanox/mlx5/core/main.c b/drivers/net/ethernet/mellanox/mlx5/core/main.c index 66e2b37cfbbf..c6de3240f76f 100644 --- a/drivers/net/ethernet/mellanox/mlx5/core/main.c +++ b/drivers/net/ethernet/mellanox/mlx5/core/main.c @@ -49,6 +49,9 @@ #include #include #include "mlx5_core.h" +#ifdef CONFIG_MLX5_CORE_EN +#include "eswitch.h" +#endif MODULE_AUTHOR("Eli Cohen "); MODULE_DESCRIPTION("Mellanox Connect-IB, ConnectX-4 core driver"); @@ -1052,6 +1055,14 @@ static int mlx5_load_one(struct mlx5_core_dev *dev, struct mlx5_priv *priv) mlx5_init_srq_table(dev); mlx5_init_mr_table(dev); +#ifdef CONFIG_MLX5_CORE_EN + err = mlx5_eswitch_init(dev); + if (err) { + dev_err(&pdev->dev, "eswitch init failed %d\n", err); + goto err_reg_dev; + } +#endif + err = mlx5_sriov_init(dev); if (err) { dev_err(&pdev->dev, "sriov init failed %d\n", err); @@ -1078,6 +1089,9 @@ err_sriov: if (mlx5_sriov_cleanup(dev)) dev_err(&dev->pdev->dev, "sriov cleanup failed\n"); +#ifdef CONFIG_MLX5_CORE_EN + mlx5_eswitch_cleanup(dev->priv.eswitch); +#endif err_reg_dev: mlx5_cleanup_mr_table(dev); mlx5_cleanup_srq_table(dev); @@ -1147,6 +1161,10 @@ static int mlx5_unload_one(struct mlx5_core_dev *dev, struct mlx5_priv *priv) goto out; } mlx5_unregister_device(dev); +#ifdef CONFIG_MLX5_CORE_EN + mlx5_eswitch_cleanup(dev->priv.eswitch); +#endif + mlx5_cleanup_mr_table(dev); mlx5_cleanup_srq_table(dev); mlx5_cleanup_qp_table(dev); diff --git a/include/linux/mlx5/device.h b/include/linux/mlx5/device.h index 0d2f0435a9f0..90a4cb6dc4cb 100644 --- a/include/linux/mlx5/device.h +++ b/include/linux/mlx5/device.h @@ -251,6 +251,7 @@ enum mlx5_event { MLX5_EVENT_TYPE_PAGE_REQUEST = 0xb, MLX5_EVENT_TYPE_PAGE_FAULT = 0xc, + MLX5_EVENT_TYPE_NIC_VPORT_CHANGE = 0xd, }; enum { @@ -520,6 +521,12 @@ struct mlx5_eqe_page_fault { __be32 flags_qpn; } __packed; +struct mlx5_eqe_vport_change { + u8 rsvd0[2]; + __be16 vport_num; + __be32 rsvd1[6]; +} __packed; + union ev_data { __be32 raw[7]; struct mlx5_eqe_cmd cmd; @@ -532,6 +539,7 @@ union ev_data { struct mlx5_eqe_stall_vl stall_vl; struct mlx5_eqe_page_req req_pages; struct mlx5_eqe_page_fault page_fault; + struct mlx5_eqe_vport_change vport_change; } __packed; struct mlx5_eqe { diff --git a/include/linux/mlx5/driver.h b/include/linux/mlx5/driver.h index efebb87163c8..ac098b6b97bf 100644 --- a/include/linux/mlx5/driver.h +++ b/include/linux/mlx5/driver.h @@ -441,6 +441,8 @@ struct mlx5_irq_info { char name[MLX5_MAX_IRQ_NAME]; }; +struct mlx5_eswitch; + struct mlx5_priv { char name[MLX5_MAX_NAME_LEN]; struct mlx5_eq_table eq_table; @@ -496,6 +498,8 @@ struct mlx5_priv { struct list_head dev_list; struct list_head ctx_list; spinlock_t ctx_lock; + + struct mlx5_eswitch *eswitch; struct mlx5_core_sriov sriov; unsigned long pci_dev_data; }; -- cgit v1.2.3-71-gd317
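Condensed for orientation (a sketch with assumed helper names; the real hooks live in mlx5_load_one()/mlx5_unload_one() and mlx5_eq_int() as shown in the hunks above): core code drives the new E-Switch through three entry points.

    #ifdef CONFIG_MLX5_CORE_EN
    /* Hypothetical condensation of the three E-Switch entry points. */
    static int example_esw_load(struct mlx5_core_dev *dev)
    {
            return mlx5_eswitch_init(dev); /* sets dev->priv.eswitch */
    }

    static void example_esw_eqe(struct mlx5_core_dev *dev, struct mlx5_eqe *eqe)
    {
            if (eqe->type == MLX5_EVENT_TYPE_NIC_VPORT_CHANGE)
                    mlx5_eswitch_vport_event(dev->priv.eswitch, eqe);
    }

    static void example_esw_unload(struct mlx5_core_dev *dev)
    {
            mlx5_eswitch_cleanup(dev->priv.eswitch);
    }
    #endif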
From 495716b191f607b2cb2175f7499966daef79f663 Mon Sep 17 00:00:00 2001 From: Saeed Mahameed Date: Tue, 1 Dec 2015 18:03:19 +0200 Subject: net/mlx5: E-Switch, Introduce FDB hardware capabilities Define the hardware structures and capabilities needed for E-Switch FDB flow tables, and read them on driver load. Signed-off-by: Saeed Mahameed Signed-off-by: Or Gerlitz Signed-off-by: David S. Miller --- drivers/net/ethernet/mellanox/mlx5/core/fw.c | 13 +++++++++++++ include/linux/mlx5/device.h | 15 +++++++++++++++ include/linux/mlx5/mlx5_ifc.h | 13 +++++++++++++ 3 files changed, 41 insertions(+) (limited to 'include/linux') diff --git a/drivers/net/ethernet/mellanox/mlx5/core/fw.c b/drivers/net/ethernet/mellanox/mlx5/core/fw.c index 9335e5ae18cc..bf6e3dfcef51 100644 --- a/drivers/net/ethernet/mellanox/mlx5/core/fw.c +++ b/drivers/net/ethernet/mellanox/mlx5/core/fw.c @@ -160,6 +160,19 @@ int mlx5_query_hca_caps(struct mlx5_core_dev *dev) if (err) return err; } + + if (MLX5_CAP_GEN(dev, vport_group_manager) && + MLX5_CAP_GEN(dev, eswitch_flow_table)) { + err = mlx5_core_get_caps(dev, MLX5_CAP_ESWITCH_FLOW_TABLE, + HCA_CAP_OPMOD_GET_CUR); + if (err) + return err; + err = mlx5_core_get_caps(dev, MLX5_CAP_ESWITCH_FLOW_TABLE, + HCA_CAP_OPMOD_GET_MAX); + if (err) + return err; + } + return 0; } diff --git a/include/linux/mlx5/device.h b/include/linux/mlx5/device.h index 90a4cb6dc4cb..bce9caed1eed 100644 --- a/include/linux/mlx5/device.h +++ b/include/linux/mlx5/device.h @@ -1138,6 +1138,7 @@ enum mlx5_cap_type { MLX5_CAP_IPOIB_OFFLOADS, MLX5_CAP_EOIB_OFFLOADS, MLX5_CAP_FLOW_TABLE, + MLX5_CAP_ESWITCH_FLOW_TABLE, /* NUM OF CAP Types */ MLX5_CAP_NUM }; @@ -1175,6 +1176,20 @@ enum mlx5_cap_type { #define MLX5_CAP_FLOWTABLE_MAX(mdev, cap) \ MLX5_GET(flow_table_nic_cap, mdev->hca_caps_max[MLX5_CAP_FLOW_TABLE], cap) +#define MLX5_CAP_ESW_FLOWTABLE(mdev, cap) \ + MLX5_GET(flow_table_eswitch_cap, \ + mdev->hca_caps_cur[MLX5_CAP_ESWITCH_FLOW_TABLE], cap) + +#define MLX5_CAP_ESW_FLOWTABLE_MAX(mdev, cap) \ + MLX5_GET(flow_table_eswitch_cap, \ + mdev->hca_caps_max[MLX5_CAP_ESWITCH_FLOW_TABLE], cap) + +#define MLX5_CAP_ESW_FLOWTABLE_FDB(mdev, cap) \ + MLX5_CAP_ESW_FLOWTABLE(mdev, flow_table_properties_nic_esw_fdb.cap) + +#define MLX5_CAP_ESW_FLOWTABLE_FDB_MAX(mdev, cap) \ + MLX5_CAP_ESW_FLOWTABLE_MAX(mdev, flow_table_properties_nic_esw_fdb.cap) + #define MLX5_CAP_ODP(mdev, cap)\ MLX5_GET(odp_cap, mdev->hca_caps_cur[MLX5_CAP_ODP], cap) diff --git a/include/linux/mlx5/mlx5_ifc.h b/include/linux/mlx5/mlx5_ifc.h index 39487d0c305d..ae7c08adba4a 100644 --- a/include/linux/mlx5/mlx5_ifc.h +++ b/include/linux/mlx5/mlx5_ifc.h @@ -447,6 +447,18 @@ struct mlx5_ifc_flow_table_nic_cap_bits { u8 reserved_3[0x7200]; }; +struct mlx5_ifc_flow_table_eswitch_cap_bits { + u8 reserved_0[0x200]; + + struct mlx5_ifc_flow_table_prop_layout_bits flow_table_properties_nic_esw_fdb; + + struct mlx5_ifc_flow_table_prop_layout_bits flow_table_properties_esw_acl_ingress; + + struct mlx5_ifc_flow_table_prop_layout_bits flow_table_properties_esw_acl_egress; + + u8 reserved_1[0x7800]; +}; + struct mlx5_ifc_per_protocol_networking_offload_caps_bits { u8 csum_cap[0x1]; u8 vlan_cap[0x1]; @@ -1846,6 +1858,7 @@ union mlx5_ifc_hca_cap_union_bits { struct mlx5_ifc_roce_cap_bits roce_cap; struct mlx5_ifc_per_protocol_networking_offload_caps_bits per_protocol_networking_offload_caps; struct mlx5_ifc_flow_table_nic_cap_bits flow_table_nic_cap; + struct mlx5_ifc_flow_table_eswitch_cap_bits flow_table_eswitch_cap; u8 reserved_0[0x8000]; }; -- cgit v1.2.3-71-gd317
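A usage sketch for the new capability macros (illustrative only; the helper name is an assumption). The FDB patch that follows sizes its flow table from log_max_ft_size this way:

    /* Hypothetical helper: FDB log size if the E-Switch FDB caps are present. */
    static int example_fdb_log_size(struct mlx5_core_dev *mdev)
    {
            if (!MLX5_CAP_GEN(mdev, vport_group_manager) ||
                !MLX5_CAP_GEN(mdev, eswitch_flow_table))
                    return -EOPNOTSUPP;

            return MLX5_CAP_ESW_FLOWTABLE_FDB(mdev, log_max_ft_size);
    }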
From 81848731ff4070a3e4136efa6a99d507177a53fe Mon Sep 17 00:00:00 2001 From: Saeed Mahameed Date: Tue, 1 Dec 2015 18:03:20 +0200 Subject: net/mlx5: E-Switch, Add SR-IOV (FDB) support Enabling E-Switch SR-IOV for nvfs+1 vports. Create E-Switch FDB for L2 UC/MC MAC steering between VFs/PF and the external vport (Uplink). FDB contains forwarding rules such as: UC MAC0 -> vport0(PF). UC MAC1 -> vport1. UC MAC2 -> vport2. MC MACX -> vport0, vport2, Uplink. MC MACY -> vport1, Uplink. For unmatched traffic FDB has the following default rules: Unmatched traffic (src vport != Uplink) -> Uplink. Unmatched traffic (src vport == Uplink) -> vport0(PF). FDB rules population: each NIC vport (VF) will notify the E-Switch manager of its UC/MC vport context changes via the modify vport context command, which will be translated to an event handled by the E-Switch manager (PF), which updates the FDB table accordingly. Signed-off-by: Saeed Mahameed Signed-off-by: Or Gerlitz Signed-off-by: David S. Miller --- drivers/net/ethernet/mellanox/mlx5/core/eswitch.c | 682 ++++++++++++++++++--- drivers/net/ethernet/mellanox/mlx5/core/eswitch.h | 25 + .../net/ethernet/mellanox/mlx5/core/mlx5_core.h | 1 + drivers/net/ethernet/mellanox/mlx5/core/sriov.c | 14 +- include/linux/mlx5/device.h | 6 + include/linux/mlx5/flow_table.h | 9 + include/linux/mlx5/mlx5_ifc.h | 7 +- 7 files changed, 661 insertions(+), 83 deletions(-) (limited to 'include/linux') diff --git a/drivers/net/ethernet/mellanox/mlx5/core/eswitch.c b/drivers/net/ethernet/mellanox/mlx5/core/eswitch.c index 1f2f804bde3e..6fa6f013d2c5 100644 --- a/drivers/net/ethernet/mellanox/mlx5/core/eswitch.c +++ b/drivers/net/ethernet/mellanox/mlx5/core/eswitch.c @@ -34,9 +34,12 @@ #include #include #include +#include #include "mlx5_core.h" #include "eswitch.h" +#define UPLINK_VPORT 0xFFFF + #define MLX5_DEBUG_ESWITCH_MASK BIT(3) #define esw_info(dev, format, ...)
\ @@ -54,18 +57,26 @@ enum { MLX5_ACTION_DEL = 2, }; -/* HW UC L2 table hash node */ -struct mlx5_uc_l2addr { +/* E-Switch UC L2 table hash node */ +struct esw_uc_addr { struct l2addr_node node; - u8 action; u32 table_index; u32 vport; }; -/* Vport UC L2 table hash node */ -struct mlx5_vport_addr { - struct l2addr_node node; - u8 action; +/* E-Switch MC FDB table hash node */ +struct esw_mc_addr { /* SRIOV only */ + struct l2addr_node node; + struct mlx5_flow_rule *uplink_rule; /* Forward to uplink rule */ + u32 refcnt; +}; + +/* Vport UC/MC hash node */ +struct vport_addr { + struct l2addr_node node; + u8 action; + u32 vport; + struct mlx5_flow_rule *flow_rule; /* SRIOV only */ }; enum { @@ -73,7 +84,11 @@ enum { MC_ADDR_CHANGE = BIT(1), }; -static int arm_vport_context_events_cmd(struct mlx5_core_dev *dev, int vport, +/* Vport context events */ +#define SRIOV_VPORT_EVENTS (UC_ADDR_CHANGE | \ + MC_ADDR_CHANGE) + +static int arm_vport_context_events_cmd(struct mlx5_core_dev *dev, u16 vport, u32 events_mask) { int in[MLX5_ST_SZ_DW(modify_nic_vport_context_in)]; @@ -196,97 +211,492 @@ static void del_l2_table_entry(struct mlx5_core_dev *dev, u32 index) free_l2_table_index(l2_table, index); } -/* SW E-Switch L2 Table management */ -static int l2_table_addr_add(struct mlx5_eswitch *esw, - u8 mac[ETH_ALEN], u32 vport) +/* E-Switch FDB flow steering */ +struct dest_node { + struct list_head list; + struct mlx5_flow_destination dest; +}; + +static int _mlx5_flow_rule_apply(struct mlx5_flow_rule *fr) { - struct hlist_head *hash; - struct mlx5_uc_l2addr *addr; + bool was_valid = fr->valid; + struct dest_node *dest_n; + u32 dest_list_size = 0; + void *in_match_value; + u32 *flow_context; + u32 flow_index; int err; + int i; + + if (list_empty(&fr->dest_list)) { + if (fr->valid) + mlx5_del_flow_table_entry(fr->ft, fr->fi); + fr->valid = false; + return 0; + } + + list_for_each_entry(dest_n, &fr->dest_list, list) + dest_list_size++; - hash = esw->l2_table.l2_hash; - addr = l2addr_hash_find(hash, mac, struct mlx5_uc_l2addr); - if (addr) { + flow_context = mlx5_vzalloc(MLX5_ST_SZ_BYTES(flow_context) + + MLX5_ST_SZ_BYTES(dest_format_struct) * + dest_list_size); + if (!flow_context) + return -ENOMEM; + + MLX5_SET(flow_context, flow_context, flow_tag, fr->flow_tag); + MLX5_SET(flow_context, flow_context, action, fr->action); + MLX5_SET(flow_context, flow_context, destination_list_size, + dest_list_size); + + i = 0; + list_for_each_entry(dest_n, &fr->dest_list, list) { + void *dest_addr = MLX5_ADDR_OF(flow_context, flow_context, + destination[i++]); + + MLX5_SET(dest_format_struct, dest_addr, destination_type, + dest_n->dest.type); + MLX5_SET(dest_format_struct, dest_addr, destination_id, + dest_n->dest.vport_num); + } + + in_match_value = MLX5_ADDR_OF(flow_context, flow_context, match_value); + memcpy(in_match_value, fr->match_value, MLX5_ST_SZ_BYTES(fte_match_param)); + + err = mlx5_add_flow_table_entry(fr->ft, fr->match_criteria_enable, + fr->match_criteria, flow_context, + &flow_index); + if (!err) { + if (was_valid) + mlx5_del_flow_table_entry(fr->ft, fr->fi); + fr->fi = flow_index; + fr->valid = true; + } + kfree(flow_context); + return err; +} + +static int mlx5_flow_rule_add_dest(struct mlx5_flow_rule *fr, + struct mlx5_flow_destination *new_dest) +{ + struct dest_node *dest_n; + int err; + + dest_n = kzalloc(sizeof(*dest_n), GFP_KERNEL); + if (!dest_n) + return -ENOMEM; + + memcpy(&dest_n->dest, new_dest, sizeof(dest_n->dest)); + mutex_lock(&fr->mutex); + list_add(&dest_n->list, &fr->dest_list); + 
err = _mlx5_flow_rule_apply(fr); + if (err) { + list_del(&dest_n->list); + kfree(dest_n); + } + mutex_unlock(&fr->mutex); + return err; +} + +static int mlx5_flow_rule_del_dest(struct mlx5_flow_rule *fr, + struct mlx5_flow_destination *dest) +{ + struct dest_node *dest_n; + struct dest_node *n; + int err; + + mutex_lock(&fr->mutex); + list_for_each_entry_safe(dest_n, n, &fr->dest_list, list) { + if (dest->vport_num == dest_n->dest.vport_num) + goto found; + } + mutex_unlock(&fr->mutex); + return -ENOENT; + +found: + list_del(&dest_n->list); + err = _mlx5_flow_rule_apply(fr); + mutex_unlock(&fr->mutex); + kfree(dest_n); + + return err; +} + +static struct mlx5_flow_rule *find_fr(struct mlx5_eswitch *esw, + u8 match_criteria_enable, + u32 *match_value) +{ + struct hlist_head *hash = esw->mc_table; + struct esw_mc_addr *esw_mc; + u8 *dmac_v; + + dmac_v = MLX5_ADDR_OF(fte_match_param, match_value, + outer_headers.dmac_47_16); + + /* UNICAST FULL MATCH */ + if (!is_multicast_ether_addr(dmac_v)) + return NULL; + + /* MULTICAST FULL MATCH */ + esw_mc = l2addr_hash_find(hash, dmac_v, struct esw_mc_addr); + + return esw_mc ? esw_mc->uplink_rule : NULL; +} + +static struct mlx5_flow_rule *alloc_fr(void *ft, + u8 match_criteria_enable, + u32 *match_criteria, + u32 *match_value, + u32 action, + u32 flow_tag) +{ + struct mlx5_flow_rule *fr = kzalloc(sizeof(*fr), GFP_KERNEL); + + if (!fr) + return NULL; + + fr->match_criteria = kzalloc(MLX5_ST_SZ_BYTES(fte_match_param), GFP_KERNEL); + fr->match_value = kzalloc(MLX5_ST_SZ_BYTES(fte_match_param), GFP_KERNEL); + if (!fr->match_criteria || !fr->match_value) { + kfree(fr->match_criteria); + kfree(fr->match_value); + kfree(fr); + return NULL; + } + + memcpy(fr->match_criteria, match_criteria, MLX5_ST_SZ_BYTES(fte_match_param)); + memcpy(fr->match_value, match_value, MLX5_ST_SZ_BYTES(fte_match_param)); + fr->match_criteria_enable = match_criteria_enable; + fr->flow_tag = flow_tag; + fr->action = action; + + mutex_init(&fr->mutex); + INIT_LIST_HEAD(&fr->dest_list); + atomic_set(&fr->refcount, 0); + fr->ft = ft; + return fr; +} + +static void deref_fr(struct mlx5_flow_rule *fr) +{ + if (!atomic_dec_and_test(&fr->refcount)) + return; + + kfree(fr->match_criteria); + kfree(fr->match_value); + kfree(fr); +} + +static struct mlx5_flow_rule * +mlx5_add_flow_rule(struct mlx5_eswitch *esw, + u8 match_criteria_enable, + u32 *match_criteria, + u32 *match_value, + u32 action, + u32 flow_tag, + struct mlx5_flow_destination *dest) +{ + struct mlx5_flow_rule *fr; + int err; + + fr = find_fr(esw, match_criteria_enable, match_value); + fr = fr ? 
fr : alloc_fr(esw->fdb_table.fdb, match_criteria_enable, match_criteria, + match_value, action, flow_tag); + if (!fr) + return NULL; + + atomic_inc(&fr->refcount); + + err = mlx5_flow_rule_add_dest(fr, dest); + if (err) { + deref_fr(fr); + return NULL; + } + + return fr; +} + +static void mlx5_del_flow_rule(struct mlx5_flow_rule *fr, u32 vport) +{ + struct mlx5_flow_destination dest; + + dest.vport_num = vport; + mlx5_flow_rule_del_dest(fr, &dest); + deref_fr(fr); +} + +/* E-Switch FDB */ +static struct mlx5_flow_rule * +esw_fdb_set_vport_rule(struct mlx5_eswitch *esw, u8 mac[ETH_ALEN], u32 vport) +{ + int match_header = MLX5_MATCH_OUTER_HEADERS; + struct mlx5_flow_destination dest; + struct mlx5_flow_rule *flow_rule = NULL; + u32 *match_v; + u32 *match_c; + u8 *dmac_v; + u8 *dmac_c; + + match_v = kzalloc(MLX5_ST_SZ_BYTES(fte_match_param), GFP_KERNEL); + match_c = kzalloc(MLX5_ST_SZ_BYTES(fte_match_param), GFP_KERNEL); + if (!match_v || !match_c) { + pr_warn("FDB: Failed to alloc match parameters\n"); + goto out; + } + dmac_v = MLX5_ADDR_OF(fte_match_param, match_v, + outer_headers.dmac_47_16); + dmac_c = MLX5_ADDR_OF(fte_match_param, match_c, + outer_headers.dmac_47_16); + + ether_addr_copy(dmac_v, mac); + /* Match criteria mask */ + memset(dmac_c, 0xff, 6); + + dest.type = MLX5_FLOW_DESTINATION_TYPE_VPORT; + dest.vport_num = vport; + + esw_debug(esw->dev, + "\tFDB add rule dmac_v(%pM) dmac_c(%pM) -> vport(%d)\n", + dmac_v, dmac_c, vport); + flow_rule = + mlx5_add_flow_rule(esw, + match_header, + match_c, + match_v, + MLX5_FLOW_CONTEXT_ACTION_FWD_DEST, + 0, &dest); + if (IS_ERR_OR_NULL(flow_rule)) { + pr_warn( + "FDB: Failed to add flow rule: dmac_v(%pM) dmac_c(%pM) -> vport(%d), err(%ld)\n", + dmac_v, dmac_c, vport, PTR_ERR(flow_rule)); + flow_rule = NULL; + } +out: + kfree(match_v); + kfree(match_c); + return flow_rule; +} + +static int esw_create_fdb_table(struct mlx5_eswitch *esw, int nvports) +{ + struct mlx5_core_dev *dev = esw->dev; + struct mlx5_flow_table_group g; + struct mlx5_flow_table *fdb; + u8 *dmac; + + esw_debug(dev, "Create FDB log_max_size(%d)\n", + MLX5_CAP_ESW_FLOWTABLE_FDB(dev, log_max_ft_size)); + + memset(&g, 0, sizeof(g)); + /* UC MC Full match rules*/ + g.log_sz = MLX5_CAP_ESW_FLOWTABLE_FDB(dev, log_max_ft_size); + g.match_criteria_enable = MLX5_MATCH_OUTER_HEADERS; + dmac = MLX5_ADDR_OF(fte_match_param, g.match_criteria, + outer_headers.dmac_47_16); + /* Match criteria mask */ + memset(dmac, 0xff, 6); + + fdb = mlx5_create_flow_table(dev, 0, + MLX5_FLOW_TABLE_TYPE_ESWITCH, + 1, &g); + if (fdb) + esw_debug(dev, "ESW: FDB Table created fdb->id %d\n", mlx5_get_flow_table_id(fdb)); + else + esw_warn(dev, "ESW: Failed to create FDB Table\n"); + + esw->fdb_table.fdb = fdb; + return fdb ? 
0 : -ENOMEM; +} + +static void esw_destroy_fdb_table(struct mlx5_eswitch *esw) +{ + if (!esw->fdb_table.fdb) + return; + + esw_debug(esw->dev, "Destroy FDB Table fdb(%d)\n", + mlx5_get_flow_table_id(esw->fdb_table.fdb)); + mlx5_destroy_flow_table(esw->fdb_table.fdb); + esw->fdb_table.fdb = NULL; +} + +/* E-Switch vport UC/MC lists management */ +typedef int (*vport_addr_action)(struct mlx5_eswitch *esw, + struct vport_addr *vaddr); + +static int esw_add_uc_addr(struct mlx5_eswitch *esw, struct vport_addr *vaddr) +{ + struct hlist_head *hash = esw->l2_table.l2_hash; + struct esw_uc_addr *esw_uc; + u8 *mac = vaddr->node.addr; + u32 vport = vaddr->vport; + int err; + + esw_uc = l2addr_hash_find(hash, mac, struct esw_uc_addr); + if (esw_uc) { esw_warn(esw->dev, "Failed to set L2 mac(%pM) for vport(%d), mac is already in use by vport(%d)\n", - mac, vport, addr->vport); + mac, vport, esw_uc->vport); return -EEXIST; } - addr = l2addr_hash_add(hash, mac, struct mlx5_uc_l2addr, - GFP_KERNEL); - if (!addr) + esw_uc = l2addr_hash_add(hash, mac, struct esw_uc_addr, GFP_KERNEL); + if (!esw_uc) return -ENOMEM; + esw_uc->vport = vport; - addr->vport = vport; - addr->action = MLX5_ACTION_NONE; - err = set_l2_table_entry(esw->dev, mac, 0, 0, &addr->table_index); + err = set_l2_table_entry(esw->dev, mac, 0, 0, &esw_uc->table_index); if (err) - l2addr_hash_del(addr); - else - esw_debug(esw->dev, "\tADDED L2 MAC: vport[%d] %pM index:%d\n", - vport, addr->node.addr, addr->table_index); + goto abort; + + if (esw->fdb_table.fdb) /* SRIOV is enabled: Forward UC MAC to vport */ + vaddr->flow_rule = esw_fdb_set_vport_rule(esw, mac, vport); + + esw_debug(esw->dev, "\tADDED UC MAC: vport[%d] %pM index:%d fr(%p)\n", + vport, mac, esw_uc->table_index, vaddr->flow_rule); + return err; +abort: + l2addr_hash_del(esw_uc); return err; } -static int l2_table_addr_del(struct mlx5_eswitch *esw, - u8 mac[ETH_ALEN], u32 vport) +static int esw_del_uc_addr(struct mlx5_eswitch *esw, struct vport_addr *vaddr) { - struct hlist_head *hash; - struct mlx5_uc_l2addr *addr; + struct hlist_head *hash = esw->l2_table.l2_hash; + struct esw_uc_addr *esw_uc; + u8 *mac = vaddr->node.addr; + u32 vport = vaddr->vport; + + esw_uc = l2addr_hash_find(hash, mac, struct esw_uc_addr); + if (!esw_uc || esw_uc->vport != vport) { + esw_debug(esw->dev, + "MAC(%pM) doesn't belong to vport (%d)\n", + mac, vport); + return -EINVAL; + } + esw_debug(esw->dev, "\tDELETE UC MAC: vport[%d] %pM index:%d fr(%p)\n", + vport, mac, esw_uc->table_index, vaddr->flow_rule); + + del_l2_table_entry(esw->dev, esw_uc->table_index); + + if (vaddr->flow_rule) + mlx5_del_flow_rule(vaddr->flow_rule, vport); + vaddr->flow_rule = NULL; + + l2addr_hash_del(esw_uc); + return 0; +} + +static int esw_add_mc_addr(struct mlx5_eswitch *esw, struct vport_addr *vaddr) +{ + struct hlist_head *hash = esw->mc_table; + struct esw_mc_addr *esw_mc; + u8 *mac = vaddr->node.addr; + u32 vport = vaddr->vport; + + if (!esw->fdb_table.fdb) + return 0; + + esw_mc = l2addr_hash_find(hash, mac, struct esw_mc_addr); + if (esw_mc) + goto add; + + esw_mc = l2addr_hash_add(hash, mac, struct esw_mc_addr, GFP_KERNEL); + if (!esw_mc) + return -ENOMEM; + + esw_mc->uplink_rule = /* Forward MC MAC to Uplink */ + esw_fdb_set_vport_rule(esw, mac, UPLINK_VPORT); +add: + esw_mc->refcnt++; + /* Forward MC MAC to vport */ + vaddr->flow_rule = esw_fdb_set_vport_rule(esw, mac, vport); + esw_debug(esw->dev, + "\tADDED MC MAC: vport[%d] %pM fr(%p) refcnt(%d) uplinkfr(%p)\n", + vport, mac, vaddr->flow_rule, + esw_mc->refcnt, 
esw_mc->uplink_rule); + return 0; +} + +static int esw_del_mc_addr(struct mlx5_eswitch *esw, struct vport_addr *vaddr) +{ + struct hlist_head *hash = esw->mc_table; + struct esw_mc_addr *esw_mc; + u8 *mac = vaddr->node.addr; + u32 vport = vaddr->vport; - hash = esw->l2_table.l2_hash; - addr = l2addr_hash_find(hash, mac, struct mlx5_uc_l2addr); - if (!addr || addr->vport != vport) { - esw_warn(esw->dev, "MAC(%pM) doesn't belong to vport (%d)\n", + if (!esw->fdb_table.fdb) + return 0; + + esw_mc = l2addr_hash_find(hash, mac, struct esw_mc_addr); + if (!esw_mc) { + esw_warn(esw->dev, + "Failed to find eswitch MC addr for MAC(%pM) vport(%d)", mac, vport); return -EINVAL; } + esw_debug(esw->dev, + "\tDELETE MC MAC: vport[%d] %pM fr(%p) refcnt(%d) uplinkfr(%p)\n", + vport, mac, vaddr->flow_rule, esw_mc->refcnt, + esw_mc->uplink_rule); - esw_debug(esw->dev, "\tDELETE L2 MAC: vport[%d] %pM index:%d\n", - vport, addr->node.addr, addr->table_index); - del_l2_table_entry(esw->dev, addr->table_index); - l2addr_hash_del(addr); + if (vaddr->flow_rule) + mlx5_del_flow_rule(vaddr->flow_rule, vport); + vaddr->flow_rule = NULL; + + if (--esw_mc->refcnt) + return 0; + + if (esw_mc->uplink_rule) + mlx5_del_flow_rule(esw_mc->uplink_rule, UPLINK_VPORT); + + l2addr_hash_del(esw_mc); return 0; } -/* E-Switch vport uc list management */ - -/* Apply vport uc list to HW l2 table */ -static void esw_apply_vport_uc_list(struct mlx5_eswitch *esw, - u32 vport_num) +/* Apply vport UC/MC list to HW l2 table and FDB table */ +static void esw_apply_vport_addr_list(struct mlx5_eswitch *esw, + u32 vport_num, int list_type) { struct mlx5_vport *vport = &esw->vports[vport_num]; - struct mlx5_vport_addr *addr; + bool is_uc = list_type == MLX5_NVPRT_LIST_TYPE_UC; + vport_addr_action vport_addr_add; + vport_addr_action vport_addr_del; + struct vport_addr *addr; struct l2addr_node *node; struct hlist_head *hash; struct hlist_node *tmp; int hi; - hash = vport->uc_list; + vport_addr_add = is_uc ? esw_add_uc_addr : + esw_add_mc_addr; + vport_addr_del = is_uc ? esw_del_uc_addr : + esw_del_mc_addr; + + hash = is_uc ? vport->uc_list : vport->mc_list; for_each_l2hash_node(node, tmp, hash, hi) { - addr = container_of(node, struct mlx5_vport_addr, node); + addr = container_of(node, struct vport_addr, node); switch (addr->action) { case MLX5_ACTION_ADD: - l2_table_addr_add(esw, addr->node.addr, vport_num); + vport_addr_add(esw, addr); addr->action = MLX5_ACTION_NONE; break; case MLX5_ACTION_DEL: - l2_table_addr_del(esw, addr->node.addr, vport_num); + vport_addr_del(esw, addr); l2addr_hash_del(addr); break; } } } -/* Sync vport uc list from vport context */ -static void esw_update_vport_uc_list(struct mlx5_eswitch *esw, - u32 vport_num) +/* Sync vport UC/MC list from vport context */ +static void esw_update_vport_addr_list(struct mlx5_eswitch *esw, + u32 vport_num, int list_type) { struct mlx5_vport *vport = &esw->vports[vport_num]; - struct mlx5_vport_addr *addr; - struct l2addr_node *node; + bool is_uc = list_type == MLX5_NVPRT_LIST_TYPE_UC; u8 (*mac_list)[ETH_ALEN]; + struct l2addr_node *node; + struct vport_addr *addr; struct hlist_head *hash; struct hlist_node *tmp; int size; @@ -294,40 +704,41 @@ static void esw_update_vport_uc_list(struct mlx5_eswitch *esw, int hi; int i; - size = MLX5_MAX_UC_PER_VPORT(esw->dev); + size = is_uc ? MLX5_MAX_UC_PER_VPORT(esw->dev) : + MLX5_MAX_MC_PER_VPORT(esw->dev); mac_list = kcalloc(size, ETH_ALEN, GFP_KERNEL); if (!mac_list) return; - hash = vport->uc_list; + hash = is_uc ? 
vport->uc_list : vport->mc_list; for_each_l2hash_node(node, tmp, hash, hi) { - addr = container_of(node, struct mlx5_vport_addr, node); + addr = container_of(node, struct vport_addr, node); addr->action = MLX5_ACTION_DEL; } - err = mlx5_query_nic_vport_mac_list(esw->dev, vport_num, - MLX5_NVPRT_LIST_TYPE_UC, + err = mlx5_query_nic_vport_mac_list(esw->dev, vport_num, list_type, mac_list, &size); if (err) return; - esw_debug(esw->dev, "vport[%d] context update UC list size (%d)\n", - vport_num, size); + esw_debug(esw->dev, "vport[%d] context update %s list size (%d)\n", + vport_num, is_uc ? "UC" : "MC", size); for (i = 0; i < size; i++) { - if (!is_valid_ether_addr(mac_list[i])) + if (is_uc && !is_valid_ether_addr(mac_list[i])) + continue; + + if (!is_uc && !is_multicast_ether_addr(mac_list[i])) continue; - addr = l2addr_hash_find(hash, mac_list[i], - struct mlx5_vport_addr); + addr = l2addr_hash_find(hash, mac_list[i], struct vport_addr); if (addr) { addr->action = MLX5_ACTION_NONE; continue; } - addr = l2addr_hash_add(hash, mac_list[i], - struct mlx5_vport_addr, + addr = l2addr_hash_add(hash, mac_list[i], struct vport_addr, GFP_KERNEL); if (!addr) { esw_warn(esw->dev, @@ -335,6 +746,7 @@ static void esw_update_vport_uc_list(struct mlx5_eswitch *esw, mac_list[i], vport_num); continue; } + addr->vport = vport_num; addr->action = MLX5_ACTION_ADD; } kfree(mac_list); @@ -345,32 +757,80 @@ static void esw_vport_change_handler(struct work_struct *work) struct mlx5_vport *vport = container_of(work, struct mlx5_vport, vport_change_handler); struct mlx5_core_dev *dev = vport->dev; + struct mlx5_eswitch *esw = dev->priv.eswitch; u8 mac[ETH_ALEN]; mlx5_query_nic_vport_mac_address(dev, vport->vport, mac); + esw_debug(dev, "vport[%d] Context Changed: perm mac: %pM\n", + vport->vport, mac); + + if (vport->enabled_events & UC_ADDR_CHANGE) { + esw_update_vport_addr_list(esw, vport->vport, + MLX5_NVPRT_LIST_TYPE_UC); + esw_apply_vport_addr_list(esw, vport->vport, + MLX5_NVPRT_LIST_TYPE_UC); + } - if (!is_valid_ether_addr(mac)) - goto out; - - esw_update_vport_uc_list(dev->priv.eswitch, vport->vport); - esw_apply_vport_uc_list(dev->priv.eswitch, vport->vport); + if (vport->enabled_events & MC_ADDR_CHANGE) { + esw_update_vport_addr_list(esw, vport->vport, + MLX5_NVPRT_LIST_TYPE_MC); + esw_apply_vport_addr_list(esw, vport->vport, + MLX5_NVPRT_LIST_TYPE_MC); + } -out: + esw_debug(esw->dev, "vport[%d] Context Changed: Done\n", vport->vport); if (vport->enabled) arm_vport_context_events_cmd(dev, vport->vport, - UC_ADDR_CHANGE); + vport->enabled_events); } -static void esw_enable_vport(struct mlx5_eswitch *esw, int vport_num) +static void esw_enable_vport(struct mlx5_eswitch *esw, int vport_num, + int enable_events) { struct mlx5_vport *vport = &esw->vports[vport_num]; unsigned long flags; + WARN_ON(vport->enabled); + + esw_debug(esw->dev, "Enabling VPORT(%d)\n", vport_num); + mlx5_modify_vport_admin_state(esw->dev, + MLX5_QUERY_VPORT_STATE_IN_OP_MOD_ESW_VPORT, + vport_num, + MLX5_ESW_VPORT_ADMIN_STATE_AUTO); + + /* Sync with current vport context */ + vport->enabled_events = enable_events; + esw_vport_change_handler(&vport->vport_change_handler); + spin_lock_irqsave(&vport->lock, flags); vport->enabled = true; spin_unlock_irqrestore(&vport->lock, flags); - arm_vport_context_events_cmd(esw->dev, vport_num, UC_ADDR_CHANGE); + arm_vport_context_events_cmd(esw->dev, vport_num, enable_events); + + esw->enabled_vports++; + esw_debug(esw->dev, "Enabled VPORT(%d)\n", vport_num); +} + +static void esw_cleanup_vport(struct 
mlx5_eswitch *esw, u16 vport_num) +{ + struct mlx5_vport *vport = &esw->vports[vport_num]; + struct l2addr_node *node; + struct vport_addr *addr; + struct hlist_node *tmp; + int hi; + + for_each_l2hash_node(node, tmp, vport->uc_list, hi) { + addr = container_of(node, struct vport_addr, node); + addr->action = MLX5_ACTION_DEL; + } + esw_apply_vport_addr_list(esw, vport_num, MLX5_NVPRT_LIST_TYPE_UC); + + for_each_l2hash_node(node, tmp, vport->mc_list, hi) { + addr = container_of(node, struct vport_addr, node); + addr->action = MLX5_ACTION_DEL; + } + esw_apply_vport_addr_list(esw, vport_num, MLX5_NVPRT_LIST_TYPE_MC); } static void esw_disable_vport(struct mlx5_eswitch *esw, int vport_num) @@ -381,19 +841,82 @@ static void esw_disable_vport(struct mlx5_eswitch *esw, int vport_num) if (!vport->enabled) return; + esw_debug(esw->dev, "Disabling vport(%d)\n", vport_num); /* Mark this vport as disabled to discard new events */ spin_lock_irqsave(&vport->lock, flags); vport->enabled = false; + vport->enabled_events = 0; spin_unlock_irqrestore(&vport->lock, flags); + mlx5_modify_vport_admin_state(esw->dev, + MLX5_QUERY_VPORT_STATE_IN_OP_MOD_ESW_VPORT, + vport_num, + MLX5_ESW_VPORT_ADMIN_STATE_DOWN); /* Wait for current already scheduled events to complete */ flush_workqueue(esw->work_queue); - /* Disable events from this vport */ arm_vport_context_events_cmd(esw->dev, vport->vport, 0); + /* We don't assume VFs will cleanup after themselves */ + esw_cleanup_vport(esw, vport_num); + esw->enabled_vports--; } /* Public E-Switch API */ +int mlx5_eswitch_enable_sriov(struct mlx5_eswitch *esw, int nvfs) +{ + int err; + int i; + + if (!esw || !MLX5_CAP_GEN(esw->dev, vport_group_manager) || + MLX5_CAP_GEN(esw->dev, port_type) != MLX5_CAP_PORT_TYPE_ETH) + return 0; + + if (!MLX5_CAP_GEN(esw->dev, eswitch_flow_table) || + !MLX5_CAP_ESW_FLOWTABLE_FDB(esw->dev, ft_support)) { + esw_warn(esw->dev, "E-Switch FDB is not supported, aborting ...\n"); + return -ENOTSUPP; + } + + esw_info(esw->dev, "E-Switch enable SRIOV: nvfs(%d)\n", nvfs); + + esw_disable_vport(esw, 0); + + err = esw_create_fdb_table(esw, nvfs + 1); + if (err) + goto abort; + + for (i = 0; i <= nvfs; i++) + esw_enable_vport(esw, i, SRIOV_VPORT_EVENTS); + + esw_info(esw->dev, "SRIOV enabled: active vports(%d)\n", + esw->enabled_vports); + return 0; + +abort: + esw_enable_vport(esw, 0, UC_ADDR_CHANGE); + return err; +} + +void mlx5_eswitch_disable_sriov(struct mlx5_eswitch *esw) +{ + int i; + + if (!esw || !MLX5_CAP_GEN(esw->dev, vport_group_manager) || + MLX5_CAP_GEN(esw->dev, port_type) != MLX5_CAP_PORT_TYPE_ETH) + return; + + esw_info(esw->dev, "disable SRIOV: active vports(%d)\n", + esw->enabled_vports); + + for (i = 0; i < esw->total_vports; i++) + esw_disable_vport(esw, i); + + esw_destroy_fdb_table(esw); + + /* VPORT 0 (PF) must be enabled back with non-sriov configuration */ + esw_enable_vport(esw, 0, UC_ADDR_CHANGE); +} + int mlx5_eswitch_init(struct mlx5_core_dev *dev) { int l2_table_size = 1 << MLX5_CAP_GEN(dev, log_max_l2_table); @@ -439,7 +962,6 @@ int mlx5_eswitch_init(struct mlx5_core_dev *dev) goto abort; } - esw->total_vports = total_vports; for (vport_num = 0; vport_num < total_vports; vport_num++) { struct mlx5_vport *vport = &esw->vports[vport_num]; @@ -450,9 +972,11 @@ int mlx5_eswitch_init(struct mlx5_core_dev *dev) spin_lock_init(&vport->lock); } - dev->priv.eswitch = esw; + esw->total_vports = total_vports; + esw->enabled_vports = 0; - esw_enable_vport(esw, 0); + dev->priv.eswitch = esw; + esw_enable_vport(esw, 0, UC_ADDR_CHANGE); 
/* VF Vports will be enabled when SRIOV is enabled */ return 0; abort: diff --git a/drivers/net/ethernet/mellanox/mlx5/core/eswitch.h b/drivers/net/ethernet/mellanox/mlx5/core/eswitch.h index 0c41f2657824..f222e336f34f 100644 --- a/drivers/net/ethernet/mellanox/mlx5/core/eswitch.h +++ b/drivers/net/ethernet/mellanox/mlx5/core/eswitch.h @@ -86,10 +86,25 @@ struct l2addr_node { kfree(ptr); \ }) +struct mlx5_flow_rule { + void *ft; + u32 fi; + u8 match_criteria_enable; + u32 *match_criteria; + u32 *match_value; + u32 action; + u32 flow_tag; + bool valid; + atomic_t refcount; + struct mutex mutex; /* protect flow rule updates */ + struct list_head dest_list; +}; + struct mlx5_vport { struct mlx5_core_dev *dev; int vport; struct hlist_head uc_list[MLX5_L2_ADDR_HASH_SIZE]; + struct hlist_head mc_list[MLX5_L2_ADDR_HASH_SIZE]; struct work_struct vport_change_handler; /* This spinlock protects access to vport data, between @@ -98,6 +113,7 @@ struct mlx5_vport { */ spinlock_t lock; /* vport events sync */ bool enabled; + u16 enabled_events; }; struct mlx5_l2_table { @@ -106,17 +122,26 @@ struct mlx5_l2_table { unsigned long *bitmap; }; +struct mlx5_eswitch_fdb { + void *fdb; +}; + struct mlx5_eswitch { struct mlx5_core_dev *dev; struct mlx5_l2_table l2_table; + struct mlx5_eswitch_fdb fdb_table; + struct hlist_head mc_table[MLX5_L2_ADDR_HASH_SIZE]; struct workqueue_struct *work_queue; struct mlx5_vport *vports; int total_vports; + int enabled_vports; }; /* E-Switch API */ int mlx5_eswitch_init(struct mlx5_core_dev *dev); void mlx5_eswitch_cleanup(struct mlx5_eswitch *esw); void mlx5_eswitch_vport_event(struct mlx5_eswitch *esw, struct mlx5_eqe *eqe); +int mlx5_eswitch_enable_sriov(struct mlx5_eswitch *esw, int nvfs); +void mlx5_eswitch_disable_sriov(struct mlx5_eswitch *esw); #endif /* __MLX5_ESWITCH_H__ */ diff --git a/drivers/net/ethernet/mellanox/mlx5/core/mlx5_core.h b/drivers/net/ethernet/mellanox/mlx5/core/mlx5_core.h index 1649d5cf9e29..bee7da822dfe 100644 --- a/drivers/net/ethernet/mellanox/mlx5/core/mlx5_core.h +++ b/drivers/net/ethernet/mellanox/mlx5/core/mlx5_core.h @@ -36,6 +36,7 @@ #include #include #include +#include #define DRIVER_NAME "mlx5_core" #define DRIVER_VERSION "3.0-1" diff --git a/drivers/net/ethernet/mellanox/mlx5/core/sriov.c b/drivers/net/ethernet/mellanox/mlx5/core/sriov.c index 19a43240e359..7b24386794f9 100644 --- a/drivers/net/ethernet/mellanox/mlx5/core/sriov.c +++ b/drivers/net/ethernet/mellanox/mlx5/core/sriov.c @@ -33,6 +33,9 @@ #include #include #include "mlx5_core.h" +#ifdef CONFIG_MLX5_CORE_EN +#include "eswitch.h" +#endif static void enable_vfs(struct mlx5_core_dev *dev, int num_vfs) { @@ -144,13 +147,15 @@ int mlx5_core_sriov_configure(struct pci_dev *pdev, int num_vfs) mlx5_core_cleanup_vfs(dev); if (!num_vfs) { +#ifdef CONFIG_MLX5_CORE_EN + mlx5_eswitch_disable_sriov(dev->priv.eswitch); +#endif kfree(sriov->vfs_ctx); sriov->vfs_ctx = NULL; if (!pci_vfs_assigned(pdev)) pci_disable_sriov(pdev); else pr_info("unloading PF driver while leaving orphan VFs\n"); - return 0; } @@ -161,6 +166,9 @@ int mlx5_core_sriov_configure(struct pci_dev *pdev, int num_vfs) } mlx5_core_init_vfs(dev, num_vfs); +#ifdef CONFIG_MLX5_CORE_EN + mlx5_eswitch_enable_sriov(dev->priv.eswitch, num_vfs); +#endif return num_vfs; } @@ -199,6 +207,10 @@ int mlx5_sriov_init(struct mlx5_core_dev *dev) sriov->enabled_vfs = cur_vfs; mlx5_core_init_vfs(dev, cur_vfs); +#ifdef CONFIG_MLX5_CORE_EN + if (cur_vfs) + mlx5_eswitch_enable_sriov(dev->priv.eswitch, cur_vfs); +#endif enable_vfs(dev, cur_vfs); 
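The sriov.c hunk above wires the new E-Switch API into the SRIOV lifecycle: mlx5_eswitch_enable_sriov() runs after the VF contexts are initialized, and mlx5_eswitch_disable_sriov() runs when the VF count drops to zero. A minimal sketch of that flow, assuming a hypothetical my_sriov_reconfigure() helper and eliding the CONFIG_MLX5_CORE_EN guards and the PCI-level calls:

static int my_sriov_reconfigure(struct mlx5_core_dev *dev, int num_vfs)
{
	struct mlx5_eswitch *esw = dev->priv.eswitch;

	if (!num_vfs) {
		/* Teardown: disables every vport, destroys the FDB and
		 * re-enables vport 0 (the PF) with UC_ADDR_CHANGE only.
		 */
		mlx5_eswitch_disable_sriov(esw);
		return 0;
	}

	/* Bring-up: vport 0 (the PF) plus num_vfs VF vports are enabled
	 * with SRIOV_VPORT_EVENTS on top of a freshly created FDB.
	 */
	return mlx5_eswitch_enable_sriov(esw, num_vfs);
}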
diff --git a/include/linux/mlx5/device.h b/include/linux/mlx5/device.h index bce9caed1eed..88eb4490a8b3 100644 --- a/include/linux/mlx5/device.h +++ b/include/linux/mlx5/device.h @@ -1074,6 +1074,12 @@ enum { VPORT_STATE_UP = 0x1, }; +enum { + MLX5_ESW_VPORT_ADMIN_STATE_DOWN = 0x0, + MLX5_ESW_VPORT_ADMIN_STATE_UP = 0x1, + MLX5_ESW_VPORT_ADMIN_STATE_AUTO = 0x2, +}; + enum { MLX5_L3_PROT_TYPE_IPV4 = 0, MLX5_L3_PROT_TYPE_IPV6 = 1, diff --git a/include/linux/mlx5/flow_table.h b/include/linux/mlx5/flow_table.h index 5f922c6d4fc2..0f2a15cf3317 100644 --- a/include/linux/mlx5/flow_table.h +++ b/include/linux/mlx5/flow_table.h @@ -41,6 +41,15 @@ struct mlx5_flow_table_group { u32 match_criteria[MLX5_ST_SZ_DW(fte_match_param)]; }; +struct mlx5_flow_destination { + enum mlx5_flow_destination_type type; + union { + u32 tir_num; + void *ft; + u32 vport_num; + }; +}; + void *mlx5_create_flow_table(struct mlx5_core_dev *dev, u8 level, u8 table_type, u16 num_groups, struct mlx5_flow_table_group *group); diff --git a/include/linux/mlx5/mlx5_ifc.h b/include/linux/mlx5/mlx5_ifc.h index ae7c08adba4a..a81b008738fd 100644 --- a/include/linux/mlx5/mlx5_ifc.h +++ b/include/linux/mlx5/mlx5_ifc.h @@ -827,9 +827,10 @@ struct mlx5_ifc_cmd_hca_cap_bits { u8 reserved_69[0x220]; }; -enum { - MLX5_DEST_FORMAT_STRUCT_DESTINATION_TYPE_FLOW_TABLE_ = 0x1, - MLX5_DEST_FORMAT_STRUCT_DESTINATION_TYPE_TIR = 0x2, +enum mlx5_flow_destination_type { + MLX5_FLOW_DESTINATION_TYPE_VPORT = 0x0, + MLX5_FLOW_DESTINATION_TYPE_FLOW_TABLE = 0x1, + MLX5_FLOW_DESTINATION_TYPE_TIR = 0x2, }; struct mlx5_ifc_dest_format_struct_bits { -- cgit v1.2.3-71-gd317 From d6666753c6e85834f1669c7b831cc2b7fc9e4390 Mon Sep 17 00:00:00 2001 From: Saeed Mahameed Date: Tue, 1 Dec 2015 18:03:22 +0200 Subject: net/mlx5: E-Switch, Introduce HCA cap and E-Switch vport context Unlike the NIC vport context, the E-Switch vport context is managed by the E-Switch manager (vport_group_manager) and not by the NIC (VF) driver. The E-Switch manager can access (read/modify) the E-Switch context of any of its vports. Currently the E-Switch vport context includes only client and server vlan insertion and stripping data (for later support of VST mode). Signed-off-by: Saeed Mahameed Signed-off-by: Or Gerlitz Signed-off-by: David S.
Miller --- drivers/net/ethernet/mellanox/mlx5/core/fw.c | 11 ++++ include/linux/mlx5/device.h | 9 +++ include/linux/mlx5/mlx5_ifc.h | 90 ++++++++++++++++++++++++++++ 3 files changed, 110 insertions(+) (limited to 'include/linux') diff --git a/drivers/net/ethernet/mellanox/mlx5/core/fw.c b/drivers/net/ethernet/mellanox/mlx5/core/fw.c index bf6e3dfcef51..1c9f9a54a873 100644 --- a/drivers/net/ethernet/mellanox/mlx5/core/fw.c +++ b/drivers/net/ethernet/mellanox/mlx5/core/fw.c @@ -173,6 +173,17 @@ int mlx5_query_hca_caps(struct mlx5_core_dev *dev) return err; } + if (MLX5_CAP_GEN(dev, vport_group_manager)) { + err = mlx5_core_get_caps(dev, MLX5_CAP_ESWITCH, + HCA_CAP_OPMOD_GET_CUR); + if (err) + return err; + err = mlx5_core_get_caps(dev, MLX5_CAP_ESWITCH, + HCA_CAP_OPMOD_GET_MAX); + if (err) + return err; + } + return 0; } diff --git a/include/linux/mlx5/device.h b/include/linux/mlx5/device.h index 88eb4490a8b3..7d3a85faefb7 100644 --- a/include/linux/mlx5/device.h +++ b/include/linux/mlx5/device.h @@ -1145,6 +1145,7 @@ enum mlx5_cap_type { MLX5_CAP_EOIB_OFFLOADS, MLX5_CAP_FLOW_TABLE, MLX5_CAP_ESWITCH_FLOW_TABLE, + MLX5_CAP_ESWITCH, /* NUM OF CAP Types */ MLX5_CAP_NUM }; @@ -1196,6 +1197,14 @@ enum mlx5_cap_type { #define MLX5_CAP_ESW_FLOWTABLE_FDB_MAX(mdev, cap) \ MLX5_CAP_ESW_FLOWTABLE_MAX(mdev, flow_table_properties_nic_esw_fdb.cap) +#define MLX5_CAP_ESW(mdev, cap) \ + MLX5_GET(e_switch_cap, \ + mdev->hca_caps_cur[MLX5_CAP_ESWITCH], cap) + +#define MLX5_CAP_ESW_MAX(mdev, cap) \ + MLX5_GET(e_switch_cap, \ + mdev->hca_caps_max[MLX5_CAP_ESWITCH], cap) + #define MLX5_CAP_ODP(mdev, cap)\ MLX5_GET(odp_cap, mdev->hca_caps_cur[MLX5_CAP_ODP], cap) diff --git a/include/linux/mlx5/mlx5_ifc.h b/include/linux/mlx5/mlx5_ifc.h index a81b008738fd..f5d94495758a 100644 --- a/include/linux/mlx5/mlx5_ifc.h +++ b/include/linux/mlx5/mlx5_ifc.h @@ -459,6 +459,17 @@ struct mlx5_ifc_flow_table_eswitch_cap_bits { u8 reserved_1[0x7800]; }; +struct mlx5_ifc_e_switch_cap_bits { + u8 vport_svlan_strip[0x1]; + u8 vport_cvlan_strip[0x1]; + u8 vport_svlan_insert[0x1]; + u8 vport_cvlan_insert_if_not_exist[0x1]; + u8 vport_cvlan_insert_overwrite[0x1]; + u8 reserved_0[0x1b]; + + u8 reserved_1[0x7e0]; +}; + struct mlx5_ifc_per_protocol_networking_offload_caps_bits { u8 csum_cap[0x1]; u8 vlan_cap[0x1]; @@ -1860,6 +1871,7 @@ union mlx5_ifc_hca_cap_union_bits { struct mlx5_ifc_per_protocol_networking_offload_caps_bits per_protocol_networking_offload_caps; struct mlx5_ifc_flow_table_nic_cap_bits flow_table_nic_cap; struct mlx5_ifc_flow_table_eswitch_cap_bits flow_table_eswitch_cap; + struct mlx5_ifc_e_switch_cap_bits e_switch_cap; u8 reserved_0[0x8000]; }; @@ -2305,6 +2317,26 @@ struct mlx5_ifc_hca_vport_context_bits { u8 reserved_6[0xca0]; }; +struct mlx5_ifc_esw_vport_context_bits { + u8 reserved_0[0x3]; + u8 vport_svlan_strip[0x1]; + u8 vport_cvlan_strip[0x1]; + u8 vport_svlan_insert[0x1]; + u8 vport_cvlan_insert[0x2]; + u8 reserved_1[0x18]; + + u8 reserved_2[0x20]; + + u8 svlan_cfi[0x1]; + u8 svlan_pcp[0x3]; + u8 svlan_id[0xc]; + u8 cvlan_cfi[0x1]; + u8 cvlan_pcp[0x3]; + u8 cvlan_id[0xc]; + + u8 reserved_3[0x7a0]; +}; + enum { MLX5_EQC_STATUS_OK = 0x0, MLX5_EQC_STATUS_EQ_WRITE_FAILURE = 0xa, @@ -3743,6 +3775,64 @@ struct mlx5_ifc_query_flow_group_in_bits { u8 reserved_5[0x120]; }; +struct mlx5_ifc_query_esw_vport_context_out_bits { + u8 status[0x8]; + u8 reserved_0[0x18]; + + u8 syndrome[0x20]; + + u8 reserved_1[0x40]; + + struct mlx5_ifc_esw_vport_context_bits esw_vport_context; +}; + +struct 
mlx5_ifc_query_esw_vport_context_in_bits { + u8 opcode[0x10]; + u8 reserved_0[0x10]; + + u8 reserved_1[0x10]; + u8 op_mod[0x10]; + + u8 other_vport[0x1]; + u8 reserved_2[0xf]; + u8 vport_number[0x10]; + + u8 reserved_3[0x20]; +}; + +struct mlx5_ifc_modify_esw_vport_context_out_bits { + u8 status[0x8]; + u8 reserved_0[0x18]; + + u8 syndrome[0x20]; + + u8 reserved_1[0x40]; +}; + +struct mlx5_ifc_esw_vport_context_fields_select_bits { + u8 reserved[0x1c]; + u8 vport_cvlan_insert[0x1]; + u8 vport_svlan_insert[0x1]; + u8 vport_cvlan_strip[0x1]; + u8 vport_svlan_strip[0x1]; +}; + +struct mlx5_ifc_modify_esw_vport_context_in_bits { + u8 opcode[0x10]; + u8 reserved_0[0x10]; + + u8 reserved_1[0x10]; + u8 op_mod[0x10]; + + u8 other_vport[0x1]; + u8 reserved_2[0xf]; + u8 vport_number[0x10]; + + struct mlx5_ifc_esw_vport_context_fields_select_bits field_select; + + struct mlx5_ifc_esw_vport_context_bits esw_vport_context; +}; + struct mlx5_ifc_query_eq_out_bits { u8 status[0x8]; u8 reserved_0[0x18]; -- cgit v1.2.3-71-gd317 From 2d1e0254ef8310e4f0756130a7ffc007ad1d58df Mon Sep 17 00:00:00 2001 From: Jakub Kicinski Date: Tue, 1 Dec 2015 14:55:21 +0000 Subject: pci_ids: add Netronome Systems vendor Add PCI vendor id for Netronome Systems. Signed-off-by: Jakub Kicinski Signed-off-by: Rolf Neugebauer Signed-off-by: David S. Miller --- include/linux/pci_ids.h | 2 ++ 1 file changed, 2 insertions(+) (limited to 'include/linux') diff --git a/include/linux/pci_ids.h b/include/linux/pci_ids.h index d9ba49cedc5d..1acbefc4bbda 100644 --- a/include/linux/pci_ids.h +++ b/include/linux/pci_ids.h @@ -2495,6 +2495,8 @@ #define PCI_DEVICE_ID_KORENIX_JETCARDF2 0x1700 #define PCI_DEVICE_ID_KORENIX_JETCARDF3 0x17ff +#define PCI_VENDOR_ID_NETRONOME 0x19ee + #define PCI_VENDOR_ID_QMI 0x1a32 #define PCI_VENDOR_ID_AZWAVE 0x1a3b -- cgit v1.2.3-71-gd317 From 80a19e338d458abb5a700df3fd00795c51361f06 Mon Sep 17 00:00:00 2001 From: Asias He Date: Wed, 2 Dec 2015 14:44:00 +0800 Subject: VSOCK: Introduce virtio-vsock-common.ko This module contains the common code and header files shared by the virtio-vsock and virtio-vhost kernel modules that follow in this series. Signed-off-by: Asias He Signed-off-by: Stefan Hajnoczi Signed-off-by: David S. Miller --- include/linux/virtio_vsock.h | 209 +++ include/uapi/linux/virtio_ids.h | 1 + include/uapi/linux/virtio_vsock.h | 89 +++ net/vmw_vsock/virtio_transport_common.c | 1272 +++++++++++++++++++++++++++++++ 4 files changed, 1571 insertions(+) create mode 100644 include/linux/virtio_vsock.h create mode 100644 include/uapi/linux/virtio_vsock.h create mode 100644 net/vmw_vsock/virtio_transport_common.c (limited to 'include/linux') diff --git a/include/linux/virtio_vsock.h b/include/linux/virtio_vsock.h new file mode 100644 index 000000000000..a5f3ecc038f7 --- /dev/null +++ b/include/linux/virtio_vsock.h @@ -0,0 +1,209 @@ +/* + * This header, excluding the #ifdef __KERNEL__ part, is BSD licensed so + * anyone can use the definitions to implement compatible drivers/servers: + * + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * 3.
Neither the name of IBM nor the names of its contributors + * may be used to endorse or promote products derived from this software + * without specific prior written permission. + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS ``AS IS'' + * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. IN NO EVENT SHALL IBM OR CONTRIBUTORS BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL + * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS + * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) + * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT + * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY + * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF + * SUCH DAMAGE. + * + * Copyright (C) Red Hat, Inc., 2013-2015 + * Copyright (C) Asias He , 2013 + * Copyright (C) Stefan Hajnoczi , 2015 + */ + +#ifndef _LINUX_VIRTIO_VSOCK_H +#define _LINUX_VIRTIO_VSOCK_H + +#include +#include +#include + +#define VIRTIO_VSOCK_DEFAULT_MIN_BUF_SIZE 128 +#define VIRTIO_VSOCK_DEFAULT_BUF_SIZE (1024 * 256) +#define VIRTIO_VSOCK_DEFAULT_MAX_BUF_SIZE (1024 * 256) +#define VIRTIO_VSOCK_DEFAULT_RX_BUF_SIZE (1024 * 4) +#define VIRTIO_VSOCK_MAX_BUF_SIZE 0xFFFFFFFFUL +#define VIRTIO_VSOCK_MAX_PKT_BUF_SIZE (1024 * 64) +#define VIRTIO_VSOCK_MAX_TX_BUF_SIZE (1024 * 1024 * 16) +#define VIRTIO_VSOCK_MAX_DGRAM_SIZE (1024 * 64) + +struct vsock_transport_recv_notify_data; +struct vsock_transport_send_notify_data; +struct sockaddr_vm; +struct vsock_sock; + +enum { + VSOCK_VQ_CTRL = 0, + VSOCK_VQ_RX = 1, /* for host to guest data */ + VSOCK_VQ_TX = 2, /* for guest to host data */ + VSOCK_VQ_MAX = 3, +}; + +/* virtio transport socket state */ +struct virtio_transport { + struct virtio_transport_pkt_ops *ops; + struct vsock_sock *vsk; + + u32 buf_size; + u32 buf_size_min; + u32 buf_size_max; + + struct mutex tx_lock; + struct mutex rx_lock; + + struct list_head rx_queue; + u32 rx_bytes; + + /* Protected by trans->tx_lock */ + u32 tx_cnt; + u32 buf_alloc; + u32 peer_fwd_cnt; + u32 peer_buf_alloc; + /* Protected by trans->rx_lock */ + u32 fwd_cnt; + + /* Protected by sk_lock */ + u16 dgram_id; + struct list_head incomplete_dgrams; /* dgram fragments */ +}; + +struct virtio_vsock_pkt { + struct virtio_vsock_hdr hdr; + struct virtio_transport *trans; + struct work_struct work; + struct list_head list; + void *buf; + u32 len; + u32 off; +}; + +struct virtio_vsock_pkt_info { + u32 remote_cid, remote_port; + struct msghdr *msg; + u32 pkt_len; + u16 type; + u16 op; + u32 flags; + u16 dgram_id; + u16 dgram_len; +}; + +struct virtio_transport_pkt_ops { + int (*send_pkt)(struct vsock_sock *vsk, + struct virtio_vsock_pkt_info *info); +}; + +void virtio_vsock_dumppkt(const char *func, + const struct virtio_vsock_pkt *pkt); + +struct sock * +virtio_transport_get_pending(struct sock *listener, + struct virtio_vsock_pkt *pkt); +struct virtio_vsock_pkt * +virtio_transport_alloc_pkt(struct vsock_sock *vsk, + struct virtio_vsock_pkt_info *info, + size_t len, + u32 src_cid, + u32 src_port, + u32 dst_cid, + u32 dst_port); +ssize_t +virtio_transport_stream_dequeue(struct vsock_sock *vsk, + struct msghdr *msg, + size_t len, + int type); +int +virtio_transport_dgram_dequeue(struct vsock_sock *vsk, + struct msghdr *msg, + size_t len, int flags); + +s64 
virtio_transport_stream_has_data(struct vsock_sock *vsk); +s64 virtio_transport_stream_has_space(struct vsock_sock *vsk); + +int virtio_transport_do_socket_init(struct vsock_sock *vsk, + struct vsock_sock *psk); +u64 virtio_transport_get_buffer_size(struct vsock_sock *vsk); +u64 virtio_transport_get_min_buffer_size(struct vsock_sock *vsk); +u64 virtio_transport_get_max_buffer_size(struct vsock_sock *vsk); +void virtio_transport_set_buffer_size(struct vsock_sock *vsk, u64 val); +void virtio_transport_set_min_buffer_size(struct vsock_sock *vsk, u64 val); +void virtio_transport_set_max_buffer_size(struct vsock_sock *vs, u64 val); +int +virtio_transport_notify_poll_in(struct vsock_sock *vsk, + size_t target, + bool *data_ready_now); +int +virtio_transport_notify_poll_out(struct vsock_sock *vsk, + size_t target, + bool *space_available_now); + +int virtio_transport_notify_recv_init(struct vsock_sock *vsk, + size_t target, struct vsock_transport_recv_notify_data *data); +int virtio_transport_notify_recv_pre_block(struct vsock_sock *vsk, + size_t target, struct vsock_transport_recv_notify_data *data); +int virtio_transport_notify_recv_pre_dequeue(struct vsock_sock *vsk, + size_t target, struct vsock_transport_recv_notify_data *data); +int virtio_transport_notify_recv_post_dequeue(struct vsock_sock *vsk, + size_t target, ssize_t copied, bool data_read, + struct vsock_transport_recv_notify_data *data); +int virtio_transport_notify_send_init(struct vsock_sock *vsk, + struct vsock_transport_send_notify_data *data); +int virtio_transport_notify_send_pre_block(struct vsock_sock *vsk, + struct vsock_transport_send_notify_data *data); +int virtio_transport_notify_send_pre_enqueue(struct vsock_sock *vsk, + struct vsock_transport_send_notify_data *data); +int virtio_transport_notify_send_post_enqueue(struct vsock_sock *vsk, + ssize_t written, struct vsock_transport_send_notify_data *data); + +u64 virtio_transport_stream_rcvhiwat(struct vsock_sock *vsk); +bool virtio_transport_stream_is_active(struct vsock_sock *vsk); +bool virtio_transport_stream_allow(u32 cid, u32 port); +int virtio_transport_dgram_bind(struct vsock_sock *vsk, + struct sockaddr_vm *addr); +bool virtio_transport_dgram_allow(u32 cid, u32 port); + +int virtio_transport_connect(struct vsock_sock *vsk); + +int virtio_transport_shutdown(struct vsock_sock *vsk, int mode); + +void virtio_transport_release(struct vsock_sock *vsk); + +ssize_t +virtio_transport_stream_enqueue(struct vsock_sock *vsk, + struct msghdr *msg, + size_t len); +int +virtio_transport_dgram_enqueue(struct vsock_sock *vsk, + struct sockaddr_vm *remote_addr, + struct msghdr *msg, + size_t len); + +void virtio_transport_destruct(struct vsock_sock *vsk); + +void virtio_transport_recv_pkt(struct virtio_vsock_pkt *pkt); +void virtio_transport_free_pkt(struct virtio_vsock_pkt *pkt); +void virtio_transport_inc_tx_pkt(struct virtio_vsock_pkt *pkt); +void virtio_transport_dec_tx_pkt(struct virtio_vsock_pkt *pkt); +u32 virtio_transport_get_credit(struct virtio_transport *trans, u32 wanted); +void virtio_transport_put_credit(struct virtio_transport *trans, u32 credit); +#endif /* _LINUX_VIRTIO_VSOCK_H */ diff --git a/include/uapi/linux/virtio_ids.h b/include/uapi/linux/virtio_ids.h index 77925f587b15..16dcf5d06cd7 100644 --- a/include/uapi/linux/virtio_ids.h +++ b/include/uapi/linux/virtio_ids.h @@ -39,6 +39,7 @@ #define VIRTIO_ID_9P 9 /* 9p virtio console */ #define VIRTIO_ID_RPROC_SERIAL 11 /* virtio remoteproc serial link */ #define VIRTIO_ID_CAIF 12 /* Virtio caif */ +#define 
VIRTIO_ID_VSOCK 13 /* virtio vsock transport */ #define VIRTIO_ID_GPU 16 /* virtio GPU */ #define VIRTIO_ID_INPUT 18 /* virtio input */ diff --git a/include/uapi/linux/virtio_vsock.h b/include/uapi/linux/virtio_vsock.h new file mode 100644 index 000000000000..8cf9b5682628 --- /dev/null +++ b/include/uapi/linux/virtio_vsock.h @@ -0,0 +1,89 @@ +/* + * This header, excluding the #ifdef __KERNEL__ part, is BSD licensed so + * anyone can use the definitions to implement compatible drivers/servers: + * + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * 3. Neither the name of IBM nor the names of its contributors + * may be used to endorse or promote products derived from this software + * without specific prior written permission. + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS ``AS IS'' + * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. IN NO EVENT SHALL IBM OR CONTRIBUTORS BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL + * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS + * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) + * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT + * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY + * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF + * SUCH DAMAGE. 
+ * + * Copyright (C) Red Hat, Inc., 2013-2015 + * Copyright (C) Asias He , 2013 + * Copyright (C) Stefan Hajnoczi , 2015 + */ + +#ifndef _UAPI_LINUX_VIRTIO_VSOCK_H +#define _UAPI_LINUX_VIRTIO_VSOCK_H + +#include +#include +#include + +struct virtio_vsock_config { + __le32 guest_cid; + __le32 max_virtqueue_pairs; +}; + +struct virtio_vsock_hdr { + __le32 src_cid; + __le32 src_port; + __le32 dst_cid; + __le32 dst_port; + __le32 len; + __le16 type; /* enum virtio_vsock_type */ + __le16 op; /* enum virtio_vsock_op */ + __le32 flags; + __le32 buf_alloc; + __le32 fwd_cnt; +}; + +enum virtio_vsock_type { + VIRTIO_VSOCK_TYPE_STREAM = 1, + VIRTIO_VSOCK_TYPE_DGRAM = 2, +}; + +enum virtio_vsock_op { + VIRTIO_VSOCK_OP_INVALID = 0, + + /* Connect operations */ + VIRTIO_VSOCK_OP_REQUEST = 1, + VIRTIO_VSOCK_OP_RESPONSE = 2, + VIRTIO_VSOCK_OP_ACK = 3, + VIRTIO_VSOCK_OP_RST = 4, + VIRTIO_VSOCK_OP_SHUTDOWN = 5, + + /* To send payload */ + VIRTIO_VSOCK_OP_RW = 6, + + /* Tell the peer our credit info */ + VIRTIO_VSOCK_OP_CREDIT_UPDATE = 7, + /* Request the peer to send the credit info to us */ + VIRTIO_VSOCK_OP_CREDIT_REQUEST = 8, +}; + +/* VIRTIO_VSOCK_OP_SHUTDOWN flags values */ +enum virtio_vsock_shutdown { + VIRTIO_VSOCK_SHUTDOWN_RCV = 1, + VIRTIO_VSOCK_SHUTDOWN_SEND = 2, +}; + +#endif /* _UAPI_LINUX_VIRTIO_VSOCK_H */ diff --git a/net/vmw_vsock/virtio_transport_common.c b/net/vmw_vsock/virtio_transport_common.c new file mode 100644 index 000000000000..28f790da6f15 --- /dev/null +++ b/net/vmw_vsock/virtio_transport_common.c @@ -0,0 +1,1272 @@ +/* + * common code for virtio vsock + * + * Copyright (C) 2013-2015 Red Hat, Inc. + * Author: Asias He + * Stefan Hajnoczi + * + * This work is licensed under the terms of the GNU GPL, version 2. + */ +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#include +#include + +#define COOKIEBITS 24 +#define COOKIEMASK (((u32)1 << COOKIEBITS) - 1) +#define VSOCK_TIMEOUT_INIT 4 + +#define SHA_MESSAGE_WORDS 16 +#define SHA_VSOCK_WORDS 5 + +static u32 vsockcookie_secret[2][SHA_MESSAGE_WORDS - SHA_VSOCK_WORDS + + SHA_DIGEST_WORDS]; + +static DEFINE_PER_CPU(__u32[SHA_MESSAGE_WORDS + SHA_DIGEST_WORDS + + SHA_WORKSPACE_WORDS], vsock_cookie_scratch); + +static u32 cookie_hash(u32 saddr, u32 daddr, u16 sport, u16 dport, + u32 count, int c) +{ + __u32 *tmp = this_cpu_ptr(vsock_cookie_scratch); + + memcpy(tmp + SHA_VSOCK_WORDS, vsockcookie_secret[c], + sizeof(vsockcookie_secret[c])); + tmp[0] = saddr; + tmp[1] = daddr; + tmp[2] = sport; + tmp[3] = dport; + tmp[4] = count; + sha_transform(tmp + SHA_MESSAGE_WORDS, (__u8 *)tmp, + tmp + SHA_MESSAGE_WORDS + SHA_DIGEST_WORDS); + + return tmp[17]; +} + +static u32 +virtio_vsock_secure_cookie(u32 saddr, u32 daddr, u32 sport, u32 dport, + u32 count) +{ + u32 h1, h2; + + h1 = cookie_hash(saddr, daddr, sport, dport, 0, 0); + h2 = cookie_hash(saddr, daddr, sport, dport, count, 1); + + return h1 + (count << COOKIEBITS) + (h2 & COOKIEMASK); +} + +static u32 +virtio_vsock_check_cookie(u32 saddr, u32 daddr, u32 sport, u32 dport, + u32 count, u32 cookie, u32 maxdiff) +{ + u32 diff; + u32 ret; + + cookie -= cookie_hash(saddr, daddr, sport, dport, 0, 0); + + diff = (count - (cookie >> COOKIEBITS)) & ((u32)-1 >> COOKIEBITS); + pr_debug("%s: diff=%x\n", __func__, diff); + if (diff >= maxdiff) + return (u32)-1; + + ret = (cookie - + cookie_hash(saddr, daddr, sport, dport, count - diff, 1)) + & COOKIEMASK; + pr_debug("%s: ret=%x\n", __func__, ret); + + return ret; +} + +void virtio_vsock_dumppkt(const char
*func, const struct virtio_vsock_pkt *pkt) +{ + pr_debug("%s: pkt=%p, op=%d, len=%d, %d:%d---%d:%d, len=%d\n", + func, pkt, + le16_to_cpu(pkt->hdr.op), + le32_to_cpu(pkt->hdr.len), + le32_to_cpu(pkt->hdr.src_cid), + le32_to_cpu(pkt->hdr.src_port), + le32_to_cpu(pkt->hdr.dst_cid), + le32_to_cpu(pkt->hdr.dst_port), + pkt->len); +} +EXPORT_SYMBOL_GPL(virtio_vsock_dumppkt); + +struct virtio_vsock_pkt * +virtio_transport_alloc_pkt(struct vsock_sock *vsk, + struct virtio_vsock_pkt_info *info, + size_t len, + u32 src_cid, + u32 src_port, + u32 dst_cid, + u32 dst_port) +{ + struct virtio_transport *trans = vsk->trans; + struct virtio_vsock_pkt *pkt; + int err; + + BUG_ON(!trans); + + pkt = kzalloc(sizeof(*pkt), GFP_KERNEL); + if (!pkt) + return NULL; + + pkt->hdr.type = cpu_to_le16(info->type); + pkt->hdr.op = cpu_to_le16(info->op); + pkt->hdr.src_cid = cpu_to_le32(src_cid); + pkt->hdr.src_port = cpu_to_le32(src_port); + pkt->hdr.dst_cid = cpu_to_le32(dst_cid); + pkt->hdr.dst_port = cpu_to_le32(dst_port); + pkt->hdr.flags = cpu_to_le32(info->flags); + pkt->len = len; + pkt->trans = trans; + if (info->type == VIRTIO_VSOCK_TYPE_DGRAM) + pkt->hdr.len = cpu_to_le32(len + (info->dgram_len << 16)); + else if (info->type == VIRTIO_VSOCK_TYPE_STREAM) + pkt->hdr.len = cpu_to_le32(len); + + if (info->msg && len > 0) { + pkt->buf = kmalloc(len, GFP_KERNEL); + if (!pkt->buf) + goto out_pkt; + err = memcpy_from_msg(pkt->buf, info->msg, len); + if (err) + goto out; + } + + return pkt; + +out: + kfree(pkt->buf); +out_pkt: + kfree(pkt); + return NULL; +} +EXPORT_SYMBOL_GPL(virtio_transport_alloc_pkt); + +struct sock * +virtio_transport_get_pending(struct sock *listener, + struct virtio_vsock_pkt *pkt) +{ + struct vsock_sock *vlistener; + struct vsock_sock *vpending; + struct sockaddr_vm src; + struct sockaddr_vm dst; + struct sock *pending; + + vsock_addr_init(&src, le32_to_cpu(pkt->hdr.src_cid), le32_to_cpu(pkt->hdr.src_port)); + vsock_addr_init(&dst, le32_to_cpu(pkt->hdr.dst_cid), le32_to_cpu(pkt->hdr.dst_port)); + + vlistener = vsock_sk(listener); + list_for_each_entry(vpending, &vlistener->pending_links, + pending_links) { + if (vsock_addr_equals_addr(&src, &vpending->remote_addr) && + vsock_addr_equals_addr(&dst, &vpending->local_addr)) { + pending = sk_vsock(vpending); + sock_hold(pending); + return pending; + } + } + + return NULL; +} +EXPORT_SYMBOL_GPL(virtio_transport_get_pending); + +static void virtio_transport_inc_rx_pkt(struct virtio_vsock_pkt *pkt) +{ + pkt->trans->rx_bytes += pkt->len; +} + +static void virtio_transport_dec_rx_pkt(struct virtio_vsock_pkt *pkt) +{ + pkt->trans->rx_bytes -= pkt->len; + pkt->trans->fwd_cnt += pkt->len; +} + +void virtio_transport_inc_tx_pkt(struct virtio_vsock_pkt *pkt) +{ + mutex_lock(&pkt->trans->tx_lock); + pkt->hdr.fwd_cnt = cpu_to_le32(pkt->trans->fwd_cnt); + pkt->hdr.buf_alloc = cpu_to_le32(pkt->trans->buf_alloc); + mutex_unlock(&pkt->trans->tx_lock); +} +EXPORT_SYMBOL_GPL(virtio_transport_inc_tx_pkt); + +void virtio_transport_dec_tx_pkt(struct virtio_vsock_pkt *pkt) +{ +} +EXPORT_SYMBOL_GPL(virtio_transport_dec_tx_pkt); + +u32 virtio_transport_get_credit(struct virtio_transport *trans, u32 credit) +{ + u32 ret; + + mutex_lock(&trans->tx_lock); + ret = trans->peer_buf_alloc - (trans->tx_cnt - trans->peer_fwd_cnt); + if (ret > credit) + ret = credit; + trans->tx_cnt += ret; + mutex_unlock(&trans->tx_lock); + + pr_debug("%s: ret=%d, buf_alloc=%d, peer_buf_alloc=%d," + "tx_cnt=%d, fwd_cnt=%d, peer_fwd_cnt=%d\n", __func__, + ret, trans->buf_alloc, 
trans->peer_buf_alloc, + trans->tx_cnt, trans->fwd_cnt, trans->peer_fwd_cnt); + + return ret; +} +EXPORT_SYMBOL_GPL(virtio_transport_get_credit); + +void virtio_transport_put_credit(struct virtio_transport *trans, u32 credit) +{ + mutex_lock(&trans->tx_lock); + trans->tx_cnt -= credit; + mutex_unlock(&trans->tx_lock); +} +EXPORT_SYMBOL_GPL(virtio_transport_put_credit); + +static int virtio_transport_send_credit_update(struct vsock_sock *vsk, int type, struct virtio_vsock_hdr *hdr) +{ + struct virtio_transport *trans = vsk->trans; + struct virtio_vsock_pkt_info info = { + .op = VIRTIO_VSOCK_OP_CREDIT_UPDATE, + .type = type, + }; + + if (hdr && type == VIRTIO_VSOCK_TYPE_DGRAM) { + info.remote_cid = le32_to_cpu(hdr->src_cid); + info.remote_port = le32_to_cpu(hdr->src_port); + } + + pr_debug("%s: sk=%p send_credit_update\n", __func__, vsk); + return trans->ops->send_pkt(vsk, &info); +} + +static int virtio_transport_send_credit_request(struct vsock_sock *vsk, int type) +{ + struct virtio_transport *trans = vsk->trans; + struct virtio_vsock_pkt_info info = { + .op = VIRTIO_VSOCK_OP_CREDIT_REQUEST, + .type = type, + }; + + pr_debug("%s: sk=%p send_credit_request\n", __func__, vsk); + return trans->ops->send_pkt(vsk, &info); +} + +static ssize_t +virtio_transport_stream_do_dequeue(struct vsock_sock *vsk, + struct msghdr *msg, + size_t len) +{ + struct virtio_transport *trans = vsk->trans; + struct virtio_vsock_pkt *pkt; + size_t bytes, total = 0; + int err = -EFAULT; + + mutex_lock(&trans->rx_lock); + while (total < len && trans->rx_bytes > 0 && + !list_empty(&trans->rx_queue)) { + pkt = list_first_entry(&trans->rx_queue, + struct virtio_vsock_pkt, list); + + bytes = len - total; + if (bytes > pkt->len - pkt->off) + bytes = pkt->len - pkt->off; + + err = memcpy_to_msg(msg, pkt->buf + pkt->off, bytes); + if (err) + goto out; + total += bytes; + pkt->off += bytes; + if (pkt->off == pkt->len) { + virtio_transport_dec_rx_pkt(pkt); + list_del(&pkt->list); + virtio_transport_free_pkt(pkt); + } + } + mutex_unlock(&trans->rx_lock); + + /* Send a credit pkt to peer */ + virtio_transport_send_credit_update(vsk, VIRTIO_VSOCK_TYPE_STREAM, + NULL); + + return total; + +out: + mutex_unlock(&trans->rx_lock); + if (total) + err = total; + return err; +} + +ssize_t +virtio_transport_stream_dequeue(struct vsock_sock *vsk, + struct msghdr *msg, + size_t len, int flags) +{ + if (flags & MSG_PEEK) + return -EOPNOTSUPP; + + return virtio_transport_stream_do_dequeue(vsk, msg, len); +} +EXPORT_SYMBOL_GPL(virtio_transport_stream_dequeue); + +struct dgram_skb { + struct list_head list; + struct sk_buff *skb; + u16 id; +}; + +static struct dgram_skb *dgram_id_to_skb(struct virtio_transport *trans, + u16 id) +{ + struct dgram_skb *dgram_skb; + + list_for_each_entry(dgram_skb, &trans->incomplete_dgrams, list) { + if (dgram_skb->id == id) + return dgram_skb; + } + + return NULL; +} + +static void +virtio_transport_recv_dgram(struct sock *sk, + struct virtio_vsock_pkt *pkt) +{ + struct sk_buff *skb = NULL; + struct vsock_sock *vsk; + struct virtio_transport *trans; + size_t size; + u16 dgram_id, pkt_off, dgram_len, pkt_len; + u32 flags, len; + struct dgram_skb *dgram_skb; + + vsk = vsock_sk(sk); + trans = vsk->trans; + + /* len: dgram_len | pkt_len */ + len = le32_to_cpu(pkt->hdr.len); + dgram_len = len >> 16; + pkt_len = len & 0xFFFF; + + /* flags: dgram_id | pkt_off */ + flags = le32_to_cpu(pkt->hdr.flags); + dgram_id = flags >> 16; + pkt_off = flags & 0xFFFF; + + pr_debug("%s: dgram_len=%d, pkt_len=%d, id=%d, off=%d\n", 
__func__, + dgram_len, pkt_len, dgram_id, pkt_off); + + dgram_skb = dgram_id_to_skb(trans, dgram_id); + if (dgram_skb) { + /* This pkt is for an existing dgram */ + skb = dgram_skb->skb; + pr_debug("%s:found skb\n", __func__); + } + + /* Packet payload must be within datagram bounds */ + if (pkt_len > VIRTIO_VSOCK_DEFAULT_RX_BUF_SIZE) + goto drop; + if (pkt_len > dgram_len) + goto drop; + if (pkt_off > dgram_len) + goto drop; + if (dgram_len - pkt_off < pkt_len) + goto drop; + + if (!skb) { + /* This pkt is for a new dgram */ + pr_debug("%s:create skb\n", __func__); + + size = sizeof(pkt->hdr) + dgram_len; + /* Attach the packet to the socket's receive queue as an sk_buff. */ + dgram_skb = kzalloc(sizeof(struct dgram_skb), GFP_ATOMIC); + if (!dgram_skb) + goto drop; + + skb = alloc_skb(size, GFP_ATOMIC); + if (!skb) { + kfree(dgram_skb); + dgram_skb = NULL; + goto drop; + } + dgram_skb->id = dgram_id; + dgram_skb->skb = skb; + list_add_tail(&dgram_skb->list, &trans->incomplete_dgrams); + + /* sk_receive_skb() will do a sock_put(), so hold here. */ + sock_hold(sk); + skb_put(skb, size); + memcpy(skb->data, &pkt->hdr, sizeof(pkt->hdr)); + } + + memcpy(skb->data + sizeof(pkt->hdr) + pkt_off, pkt->buf, pkt_len); + + pr_debug("%s:C, off=%d, pkt_len=%d, dgram_len=%d\n", __func__, + pkt_off, pkt_len, dgram_len); + + /* We are done with this dgram */ + if (pkt_off + pkt_len == dgram_len) { + pr_debug("%s:dgram_id=%d is done\n", __func__, dgram_id); + list_del(&dgram_skb->list); + kfree(dgram_skb); + sk_receive_skb(sk, skb, 0); + } + virtio_transport_free_pkt(pkt); + return; + +drop: + if (dgram_skb) { + list_del(&dgram_skb->list); + kfree(dgram_skb); + kfree_skb(skb); + sock_put(sk); + } + virtio_transport_free_pkt(pkt); +} + +int +virtio_transport_dgram_dequeue(struct vsock_sock *vsk, + struct msghdr *msg, + size_t len, int flags) +{ + struct virtio_vsock_hdr *hdr; + struct sk_buff *skb; + int noblock; + int err; + int dgram_len; + + noblock = flags & MSG_DONTWAIT; + + if (flags & MSG_OOB || flags & MSG_ERRQUEUE) + return -EOPNOTSUPP; + + /* Retrieve the head sk_buff from the socket's receive queue. */ + err = 0; + skb = skb_recv_datagram(&vsk->sk, flags, noblock, &err); + if (err) + return err; + if (!skb) + return -EAGAIN; + + hdr = (struct virtio_vsock_hdr *)skb->data; + if (!hdr) + goto out; + + dgram_len = le32_to_cpu(hdr->len) >> 16; + /* Place the datagram payload in the user's iovec. */ + err = skb_copy_datagram_msg(skb, sizeof(*hdr), msg, dgram_len); + if (err) + goto out; + + if (msg->msg_name) { + /* Provide the address of the sender.
*/ + DECLARE_SOCKADDR(struct sockaddr_vm *, vm_addr, msg->msg_name); + vsock_addr_init(vm_addr, le32_to_cpu(hdr->src_cid), le32_to_cpu(hdr->src_port)); + msg->msg_namelen = sizeof(*vm_addr); + } + err = dgram_len; + + /* Send a credit pkt to peer */ + virtio_transport_send_credit_update(vsk, VIRTIO_VSOCK_TYPE_DGRAM, hdr); + + pr_debug("%s:done, received=%d\n", __func__, dgram_len); +out: + skb_free_datagram(&vsk->sk, skb); + return err; +} +EXPORT_SYMBOL_GPL(virtio_transport_dgram_dequeue); + +s64 virtio_transport_stream_has_data(struct vsock_sock *vsk) +{ + struct virtio_transport *trans = vsk->trans; + s64 bytes; + + mutex_lock(&trans->rx_lock); + bytes = trans->rx_bytes; + mutex_unlock(&trans->rx_lock); + + return bytes; +} +EXPORT_SYMBOL_GPL(virtio_transport_stream_has_data); + +static s64 virtio_transport_has_space(struct vsock_sock *vsk) +{ + struct virtio_transport *trans = vsk->trans; + s64 bytes; + + bytes = trans->peer_buf_alloc - (trans->tx_cnt - trans->peer_fwd_cnt); + if (bytes < 0) + bytes = 0; + + return bytes; +} + +s64 virtio_transport_stream_has_space(struct vsock_sock *vsk) +{ + struct virtio_transport *trans = vsk->trans; + s64 bytes; + + mutex_lock(&trans->tx_lock); + bytes = virtio_transport_has_space(vsk); + mutex_unlock(&trans->tx_lock); + + pr_debug("%s: bytes=%lld\n", __func__, bytes); + + return bytes; +} +EXPORT_SYMBOL_GPL(virtio_transport_stream_has_space); + +int virtio_transport_do_socket_init(struct vsock_sock *vsk, + struct vsock_sock *psk) +{ + struct virtio_transport *trans; + + trans = kzalloc(sizeof(*trans), GFP_KERNEL); + if (!trans) + return -ENOMEM; + + vsk->trans = trans; + trans->vsk = vsk; + if (psk) { + struct virtio_transport *ptrans = psk->trans; + trans->buf_size = ptrans->buf_size; + trans->buf_size_min = ptrans->buf_size_min; + trans->buf_size_max = ptrans->buf_size_max; + trans->peer_buf_alloc = ptrans->peer_buf_alloc; + } else { + trans->buf_size = VIRTIO_VSOCK_DEFAULT_BUF_SIZE; + trans->buf_size_min = VIRTIO_VSOCK_DEFAULT_MIN_BUF_SIZE; + trans->buf_size_max = VIRTIO_VSOCK_DEFAULT_MAX_BUF_SIZE; + } + + trans->buf_alloc = trans->buf_size; + + pr_debug("%s: trans->buf_alloc=%d\n", __func__, trans->buf_alloc); + + mutex_init(&trans->rx_lock); + mutex_init(&trans->tx_lock); + INIT_LIST_HEAD(&trans->rx_queue); + INIT_LIST_HEAD(&trans->incomplete_dgrams); + + return 0; +} +EXPORT_SYMBOL_GPL(virtio_transport_do_socket_init); + +u64 virtio_transport_get_buffer_size(struct vsock_sock *vsk) +{ + struct virtio_transport *trans = vsk->trans; + + return trans->buf_size; +} +EXPORT_SYMBOL_GPL(virtio_transport_get_buffer_size); + +u64 virtio_transport_get_min_buffer_size(struct vsock_sock *vsk) +{ + struct virtio_transport *trans = vsk->trans; + + return trans->buf_size_min; +} +EXPORT_SYMBOL_GPL(virtio_transport_get_min_buffer_size); + +u64 virtio_transport_get_max_buffer_size(struct vsock_sock *vsk) +{ + struct virtio_transport *trans = vsk->trans; + + return trans->buf_size_max; +} +EXPORT_SYMBOL_GPL(virtio_transport_get_max_buffer_size); + +void virtio_transport_set_buffer_size(struct vsock_sock *vsk, u64 val) +{ + struct virtio_transport *trans = vsk->trans; + + if (val > VIRTIO_VSOCK_MAX_BUF_SIZE) + val = VIRTIO_VSOCK_MAX_BUF_SIZE; + if (val < trans->buf_size_min) + trans->buf_size_min = val; + if (val > trans->buf_size_max) + trans->buf_size_max = val; + trans->buf_size = val; + trans->buf_alloc = val; +} +EXPORT_SYMBOL_GPL(virtio_transport_set_buffer_size); + +void virtio_transport_set_min_buffer_size(struct vsock_sock *vsk, u64 val) +{ + struct
virtio_transport *trans = vsk->trans; + + if (val > VIRTIO_VSOCK_MAX_BUF_SIZE) + val = VIRTIO_VSOCK_MAX_BUF_SIZE; + if (val > trans->buf_size) + trans->buf_size = val; + trans->buf_size_min = val; +} +EXPORT_SYMBOL_GPL(virtio_transport_set_min_buffer_size); + +void virtio_transport_set_max_buffer_size(struct vsock_sock *vsk, u64 val) +{ + struct virtio_transport *trans = vsk->trans; + + if (val > VIRTIO_VSOCK_MAX_BUF_SIZE) + val = VIRTIO_VSOCK_MAX_BUF_SIZE; + if (val < trans->buf_size) + trans->buf_size = val; + trans->buf_size_max = val; +} +EXPORT_SYMBOL_GPL(virtio_transport_set_max_buffer_size); + +int +virtio_transport_notify_poll_in(struct vsock_sock *vsk, + size_t target, + bool *data_ready_now) +{ + if (vsock_stream_has_data(vsk)) + *data_ready_now = true; + else + *data_ready_now = false; + + return 0; +} +EXPORT_SYMBOL_GPL(virtio_transport_notify_poll_in); + +int +virtio_transport_notify_poll_out(struct vsock_sock *vsk, + size_t target, + bool *space_avail_now) +{ + s64 free_space; + + free_space = vsock_stream_has_space(vsk); + if (free_space > 0) + *space_avail_now = true; + else if (free_space == 0) + *space_avail_now = false; + + return 0; +} +EXPORT_SYMBOL_GPL(virtio_transport_notify_poll_out); + +int virtio_transport_notify_recv_init(struct vsock_sock *vsk, + size_t target, struct vsock_transport_recv_notify_data *data) +{ + return 0; +} +EXPORT_SYMBOL_GPL(virtio_transport_notify_recv_init); + +int virtio_transport_notify_recv_pre_block(struct vsock_sock *vsk, + size_t target, struct vsock_transport_recv_notify_data *data) +{ + return 0; +} +EXPORT_SYMBOL_GPL(virtio_transport_notify_recv_pre_block); + +int virtio_transport_notify_recv_pre_dequeue(struct vsock_sock *vsk, + size_t target, struct vsock_transport_recv_notify_data *data) +{ + return 0; +} +EXPORT_SYMBOL_GPL(virtio_transport_notify_recv_pre_dequeue); + +int virtio_transport_notify_recv_post_dequeue(struct vsock_sock *vsk, + size_t target, ssize_t copied, bool data_read, + struct vsock_transport_recv_notify_data *data) +{ + return 0; +} +EXPORT_SYMBOL_GPL(virtio_transport_notify_recv_post_dequeue); + +int virtio_transport_notify_send_init(struct vsock_sock *vsk, + struct vsock_transport_send_notify_data *data) +{ + return 0; +} +EXPORT_SYMBOL_GPL(virtio_transport_notify_send_init); + +int virtio_transport_notify_send_pre_block(struct vsock_sock *vsk, + struct vsock_transport_send_notify_data *data) +{ + return 0; +} +EXPORT_SYMBOL_GPL(virtio_transport_notify_send_pre_block); + +int virtio_transport_notify_send_pre_enqueue(struct vsock_sock *vsk, + struct vsock_transport_send_notify_data *data) +{ + return 0; +} +EXPORT_SYMBOL_GPL(virtio_transport_notify_send_pre_enqueue); + +int virtio_transport_notify_send_post_enqueue(struct vsock_sock *vsk, + ssize_t written, struct vsock_transport_send_notify_data *data) +{ + return 0; +} +EXPORT_SYMBOL_GPL(virtio_transport_notify_send_post_enqueue); + +u64 virtio_transport_stream_rcvhiwat(struct vsock_sock *vsk) +{ + struct virtio_transport *trans = vsk->trans; + + return trans->buf_size; +} +EXPORT_SYMBOL_GPL(virtio_transport_stream_rcvhiwat); + +bool virtio_transport_stream_is_active(struct vsock_sock *vsk) +{ + return true; +} +EXPORT_SYMBOL_GPL(virtio_transport_stream_is_active); + +bool virtio_transport_stream_allow(u32 cid, u32 port) +{ + return true; +} +EXPORT_SYMBOL_GPL(virtio_transport_stream_allow); + +int virtio_transport_dgram_bind(struct vsock_sock *vsk, + struct sockaddr_vm *addr) +{ + return vsock_bind_dgram_generic(vsk, addr); +} 
+EXPORT_SYMBOL_GPL(virtio_transport_dgram_bind); + +bool virtio_transport_dgram_allow(u32 cid, u32 port) +{ + return true; +} +EXPORT_SYMBOL_GPL(virtio_transport_dgram_allow); + +int virtio_transport_connect(struct vsock_sock *vsk) +{ + struct virtio_transport *trans = vsk->trans; + struct virtio_vsock_pkt_info info = { + .op = VIRTIO_VSOCK_OP_REQUEST, + .type = VIRTIO_VSOCK_TYPE_STREAM, + }; + + pr_debug("%s: vsk=%p send_request\n", __func__, vsk); + return trans->ops->send_pkt(vsk, &info); +} +EXPORT_SYMBOL_GPL(virtio_transport_connect); + +int virtio_transport_shutdown(struct vsock_sock *vsk, int mode) +{ + struct virtio_transport *trans = vsk->trans; + struct virtio_vsock_pkt_info info = { + .op = VIRTIO_VSOCK_OP_SHUTDOWN, + .type = VIRTIO_VSOCK_TYPE_STREAM, + .flags = (mode & RCV_SHUTDOWN ? + VIRTIO_VSOCK_SHUTDOWN_RCV : 0) | + (mode & SEND_SHUTDOWN ? + VIRTIO_VSOCK_SHUTDOWN_SEND : 0), + }; + + pr_debug("%s: vsk=%p: send_shutdown\n", __func__, vsk); + return trans->ops->send_pkt(vsk, &info); +} +EXPORT_SYMBOL_GPL(virtio_transport_shutdown); + +void virtio_transport_release(struct vsock_sock *vsk) +{ + struct virtio_transport *trans = vsk->trans; + struct sock *sk = &vsk->sk; + struct dgram_skb *dgram_skb; + struct dgram_skb *dgram_skb_tmp; + + pr_debug("%s: vsk=%p\n", __func__, vsk); + + /* Tell other side to terminate connection */ + if (sk->sk_type == SOCK_STREAM && sk->sk_state == SS_CONNECTED) { + virtio_transport_shutdown(vsk, SHUTDOWN_MASK); + } + + /* Free incomplete dgrams */ + lock_sock(sk); + list_for_each_entry_safe(dgram_skb, dgram_skb_tmp, + &trans->incomplete_dgrams, list) { + list_del(&dgram_skb->list); + kfree_skb(dgram_skb->skb); + kfree(dgram_skb); + sock_put(sk); /* held in virtio_transport_recv_dgram() */ + } + release_sock(sk); +} +EXPORT_SYMBOL_GPL(virtio_transport_release); + +int +virtio_transport_dgram_enqueue(struct vsock_sock *vsk, + struct sockaddr_vm *remote_addr, + struct msghdr *msg, + size_t dgram_len) +{ + struct virtio_transport *trans = vsk->trans; + struct virtio_vsock_pkt_info info = { + .op = VIRTIO_VSOCK_OP_RW, + .type = VIRTIO_VSOCK_TYPE_DGRAM, + .msg = msg, + }; + size_t total_written = 0, pkt_off = 0; + ssize_t written; + u16 dgram_id; + + /* The max size of a single dgram we support is 64KB */ + if (dgram_len > VIRTIO_VSOCK_MAX_DGRAM_SIZE) + return -EMSGSIZE; + + info.dgram_len = dgram_len; + vsk->remote_addr = *remote_addr; + + dgram_id = trans->dgram_id++; + + /* TODO: To optimize, if we have enough credit to send the pkt already, + * do not ask the peer to send credit to us */ + virtio_transport_send_credit_request(vsk, VIRTIO_VSOCK_TYPE_DGRAM); + + while (total_written < dgram_len) { + info.pkt_len = dgram_len - total_written; + info.flags = dgram_id << 16 | pkt_off; + written = trans->ops->send_pkt(vsk, &info); + if (written < 0) + return -ENOMEM; + if (written == 0) { + /* TODO: if written = 0, we need a sleep & wakeup + * instead of sleep */ + pr_debug("%s: SHOULD WAIT written==0", __func__); + msleep(10); + } + total_written += written; + pkt_off += written; + pr_debug("%s:id=%d, dgram_len=%zu, off=%zu, total_written=%zu, written=%zd\n", + __func__, dgram_id, dgram_len, pkt_off, total_written, written); + } + + return dgram_len; +} +EXPORT_SYMBOL_GPL(virtio_transport_dgram_enqueue); + +ssize_t +virtio_transport_stream_enqueue(struct vsock_sock *vsk, + struct msghdr *msg, + size_t len) +{ + struct virtio_transport *trans = vsk->trans; + struct virtio_vsock_pkt_info info = { + .op = VIRTIO_VSOCK_OP_RW, + .type = VIRTIO_VSOCK_TYPE_STREAM, + .msg =
msg, + .pkt_len = len, + }; + + return trans->ops->send_pkt(vsk, &info); +} +EXPORT_SYMBOL_GPL(virtio_transport_stream_enqueue); + +void virtio_transport_destruct(struct vsock_sock *vsk) +{ + struct virtio_transport *trans = vsk->trans; + + pr_debug("%s: vsk=%p\n", __func__, vsk); + kfree(trans); +} +EXPORT_SYMBOL_GPL(virtio_transport_destruct); + +static int virtio_transport_send_ack(struct vsock_sock *vsk, u32 cookie) +{ + struct virtio_transport *trans = vsk->trans; + struct virtio_vsock_pkt_info info = { + .op = VIRTIO_VSOCK_OP_ACK, + .type = VIRTIO_VSOCK_TYPE_STREAM, + .flags = cpu_to_le32(cookie), + }; + + pr_debug("%s: sk=%p send_offer\n", __func__, vsk); + return trans->ops->send_pkt(vsk, &info); +} + +static int virtio_transport_send_reset(struct vsock_sock *vsk, + struct virtio_vsock_pkt *pkt) +{ + struct virtio_transport *trans = vsk->trans; + struct virtio_vsock_pkt_info info = { + .op = VIRTIO_VSOCK_OP_RST, + .type = VIRTIO_VSOCK_TYPE_STREAM, + }; + + pr_debug("%s\n", __func__); + + /* Send RST only if the original pkt is not a RST pkt */ + if (le16_to_cpu(pkt->hdr.op) == VIRTIO_VSOCK_OP_RST) + return 0; + + return trans->ops->send_pkt(vsk, &info); +} + +static int +virtio_transport_recv_connecting(struct sock *sk, + struct virtio_vsock_pkt *pkt) +{ + struct vsock_sock *vsk = vsock_sk(sk); + int err; + int skerr; + u32 cookie; + + pr_debug("%s: vsk=%p\n", __func__, vsk); + switch (le16_to_cpu(pkt->hdr.op)) { + case VIRTIO_VSOCK_OP_RESPONSE: + cookie = le32_to_cpu(pkt->hdr.flags); + pr_debug("%s: got RESPONSE and send ACK, cookie=%x\n", __func__, cookie); + err = virtio_transport_send_ack(vsk, cookie); + if (err < 0) { + skerr = -err; + goto destroy; + } + sk->sk_state = SS_CONNECTED; + sk->sk_socket->state = SS_CONNECTED; + vsock_insert_connected(vsk); + sk->sk_state_change(sk); + break; + case VIRTIO_VSOCK_OP_INVALID: + pr_debug("%s: got invalid\n", __func__); + break; + case VIRTIO_VSOCK_OP_RST: + pr_debug("%s: got rst\n", __func__); + skerr = ECONNRESET; + err = 0; + goto destroy; + default: + pr_debug("%s: got def\n", __func__); + skerr = EPROTO; + err = -EINVAL; + goto destroy; + } + return 0; + +destroy: + virtio_transport_send_reset(vsk, pkt); + sk->sk_state = SS_UNCONNECTED; + sk->sk_err = skerr; + sk->sk_error_report(sk); + return err; +} + +static int +virtio_transport_recv_connected(struct sock *sk, + struct virtio_vsock_pkt *pkt) +{ + struct vsock_sock *vsk = vsock_sk(sk); + struct virtio_transport *trans = vsk->trans; + int err = 0; + + switch (le16_to_cpu(pkt->hdr.op)) { + case VIRTIO_VSOCK_OP_RW: + pkt->len = le32_to_cpu(pkt->hdr.len); + pkt->off = 0; + pkt->trans = trans; + + mutex_lock(&trans->rx_lock); + virtio_transport_inc_rx_pkt(pkt); + list_add_tail(&pkt->list, &trans->rx_queue); + mutex_unlock(&trans->rx_lock); + + sk->sk_data_ready(sk); + return err; + case VIRTIO_VSOCK_OP_CREDIT_UPDATE: + sk->sk_write_space(sk); + break; + case VIRTIO_VSOCK_OP_SHUTDOWN: + pr_debug("%s: got shutdown\n", __func__); + if (le32_to_cpu(pkt->hdr.flags) & VIRTIO_VSOCK_SHUTDOWN_RCV) + vsk->peer_shutdown |= RCV_SHUTDOWN; + if (le32_to_cpu(pkt->hdr.flags) & VIRTIO_VSOCK_SHUTDOWN_SEND) + vsk->peer_shutdown |= SEND_SHUTDOWN; + if (le32_to_cpu(pkt->hdr.flags)) + sk->sk_state_change(sk); + break; + case VIRTIO_VSOCK_OP_RST: + pr_debug("%s: got rst\n", __func__); + sock_set_flag(sk, SOCK_DONE); + vsk->peer_shutdown = SHUTDOWN_MASK; + if (vsock_stream_has_data(vsk) <= 0) + sk->sk_state = SS_DISCONNECTING; + sk->sk_state_change(sk); + break; + default: + err = -EINVAL; + break; + } + + 
virtio_transport_free_pkt(pkt); + return err; +} + +static int +virtio_transport_send_response(struct vsock_sock *vsk, + struct virtio_vsock_pkt *pkt) +{ + struct virtio_transport *trans = vsk->trans; + struct virtio_vsock_pkt_info info = { + .op = VIRTIO_VSOCK_OP_RESPONSE, + .type = VIRTIO_VSOCK_TYPE_STREAM, + .remote_cid = le32_to_cpu(pkt->hdr.src_cid), + .remote_port = le32_to_cpu(pkt->hdr.src_port), + }; + u32 cookie; + + cookie = virtio_vsock_secure_cookie(le32_to_cpu(pkt->hdr.src_cid), + le32_to_cpu(pkt->hdr.dst_cid), + le32_to_cpu(pkt->hdr.src_port), + le32_to_cpu(pkt->hdr.dst_port), + jiffies / (HZ * 60)); + info.flags = cpu_to_le32(cookie); + + pr_debug("%s: send_response, cookie=%x\n", __func__, le32_to_cpu(cookie)); + + return trans->ops->send_pkt(vsk, &info); +} + +/* Handle server socket */ +static int +virtio_transport_recv_listen(struct sock *sk, struct virtio_vsock_pkt *pkt) +{ + struct vsock_sock *vsk = vsock_sk(sk); + struct vsock_sock *vpending; + struct sock *pending; + int err; + u32 cookie; + + switch (le16_to_cpu(pkt->hdr.op)) { + case VIRTIO_VSOCK_OP_REQUEST: + err = virtio_transport_send_response(vsk, pkt); + if (err < 0) { + // FIXME vsk should be vpending + virtio_transport_send_reset(vsk, pkt); + return err; + } + break; + case VIRTIO_VSOCK_OP_ACK: + cookie = le32_to_cpu(pkt->hdr.flags); + err = virtio_vsock_check_cookie(le32_to_cpu(pkt->hdr.src_cid), + le32_to_cpu(pkt->hdr.dst_cid), + le32_to_cpu(pkt->hdr.src_port), + le32_to_cpu(pkt->hdr.dst_port), + jiffies / (HZ * 60), + le32_to_cpu(pkt->hdr.flags), + VSOCK_TIMEOUT_INIT); + pr_debug("%s: cookie=%x, err=%d\n", __func__, cookie, err); + if (err) + return err; + + /* So no pending socket are responsible for this pkt, create one */ + pr_debug("%s: create pending\n", __func__); + pending = __vsock_create(sock_net(sk), NULL, sk, GFP_KERNEL, + sk->sk_type, 0); + if (!pending) { + virtio_transport_send_reset(vsk, pkt); + return -ENOMEM; + } + sk->sk_ack_backlog++; + pending->sk_state = SS_CONNECTING; + + vpending = vsock_sk(pending); + vsock_addr_init(&vpending->local_addr, le32_to_cpu(pkt->hdr.dst_cid), + le32_to_cpu(pkt->hdr.dst_port)); + vsock_addr_init(&vpending->remote_addr, le32_to_cpu(pkt->hdr.src_cid), + le32_to_cpu(pkt->hdr.src_port)); + vsock_add_pending(sk, pending); + + pr_debug("%s: get pending\n", __func__); + pending = virtio_transport_get_pending(sk, pkt); + vpending = vsock_sk(pending); + lock_sock(pending); + switch (pending->sk_state) { + case SS_CONNECTING: + if (le16_to_cpu(pkt->hdr.op) != VIRTIO_VSOCK_OP_ACK) { + pr_debug("%s: op=%d != OP_ACK\n", __func__, + le16_to_cpu(pkt->hdr.op)); + virtio_transport_send_reset(vpending, pkt); + pending->sk_err = EPROTO; + pending->sk_state = SS_UNCONNECTED; + sock_put(pending); + } else { + pending->sk_state = SS_CONNECTED; + vsock_insert_connected(vpending); + + vsock_remove_pending(sk, pending); + vsock_enqueue_accept(sk, pending); + + sk->sk_data_ready(sk); + } + err = 0; + break; + default: + pr_debug("%s: sk->sk_ack_backlog=%d\n", __func__, + sk->sk_ack_backlog); + virtio_transport_send_reset(vpending, pkt); + err = -EINVAL; + break; + } + if (err < 0) + vsock_remove_pending(sk, pending); + release_sock(pending); + + /* Release refcnt obtained in virtio_transport_get_pending */ + sock_put(pending); + break; + default: + break; + } + + return 0; +} + +static void virtio_transport_space_update(struct sock *sk, + struct virtio_vsock_pkt *pkt) +{ + struct vsock_sock *vsk = vsock_sk(sk); + struct virtio_transport *trans = vsk->trans; + bool space_available; 
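+ /* peer_buf_alloc and peer_fwd_cnt feed the tx credit calculation, which is why the snapshot below is taken under tx_lock. */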
+ + /* buf_alloc and fwd_cnt is always included in the hdr */ + mutex_lock(&trans->tx_lock); + trans->peer_buf_alloc = le32_to_cpu(pkt->hdr.buf_alloc); + trans->peer_fwd_cnt = le32_to_cpu(pkt->hdr.fwd_cnt); + space_available = virtio_transport_has_space(vsk); + mutex_unlock(&trans->tx_lock); + + if (space_available) + sk->sk_write_space(sk); +} + +/* We are under the virtio-vsock's vsock->rx_lock or + * vhost-vsock's vq->mutex lock */ +void virtio_transport_recv_pkt(struct virtio_vsock_pkt *pkt) +{ + struct virtio_transport *trans; + struct sockaddr_vm src, dst; + struct vsock_sock *vsk; + struct sock *sk; + + vsock_addr_init(&src, le32_to_cpu(pkt->hdr.src_cid), le32_to_cpu(pkt->hdr.src_port)); + vsock_addr_init(&dst, le32_to_cpu(pkt->hdr.dst_cid), le32_to_cpu(pkt->hdr.dst_port)); + + virtio_vsock_dumppkt(__func__, pkt); + + if (le16_to_cpu(pkt->hdr.type) == VIRTIO_VSOCK_TYPE_DGRAM) { + sk = vsock_find_unbound_socket(&dst); + if (!sk) + goto free_pkt; + + vsk = vsock_sk(sk); + trans = vsk->trans; + BUG_ON(!trans); + + virtio_transport_space_update(sk, pkt); + + lock_sock(sk); + switch (le16_to_cpu(pkt->hdr.op)) { + case VIRTIO_VSOCK_OP_CREDIT_UPDATE: + virtio_transport_free_pkt(pkt); + break; + case VIRTIO_VSOCK_OP_CREDIT_REQUEST: + virtio_transport_send_credit_update(vsk, VIRTIO_VSOCK_TYPE_DGRAM, + &pkt->hdr); + virtio_transport_free_pkt(pkt); + break; + case VIRTIO_VSOCK_OP_RW: + virtio_transport_recv_dgram(sk, pkt); + break; + default: + virtio_transport_free_pkt(pkt); + break; + } + release_sock(sk); + + /* Release refcnt obtained when we fetched this socket out of + * the unbound list. + */ + sock_put(sk); + return; + } else if (le16_to_cpu(pkt->hdr.type) == VIRTIO_VSOCK_TYPE_STREAM) { + /* The socket must be in connected or bound table + * otherwise send reset back + */ + sk = vsock_find_connected_socket(&src, &dst); + if (!sk) { + sk = vsock_find_bound_socket(&dst); + if (!sk) { + pr_debug("%s: can not find bound_socket\n", __func__); + virtio_vsock_dumppkt(__func__, pkt); + /* Ignore this pkt instead of sending reset back */ + /* TODO send a RST unless this packet is a RST (to avoid infinite loops) */ + goto free_pkt; + } + } + + vsk = vsock_sk(sk); + trans = vsk->trans; + BUG_ON(!trans); + + virtio_transport_space_update(sk, pkt); + + lock_sock(sk); + switch (sk->sk_state) { + case VSOCK_SS_LISTEN: + virtio_transport_recv_listen(sk, pkt); + virtio_transport_free_pkt(pkt); + break; + case SS_CONNECTING: + virtio_transport_recv_connecting(sk, pkt); + virtio_transport_free_pkt(pkt); + break; + case SS_CONNECTED: + virtio_transport_recv_connected(sk, pkt); + break; + default: + virtio_transport_free_pkt(pkt); + break; + } + release_sock(sk); + + /* Release refcnt obtained when we fetched this socket out of the + * bound or connected list. 
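+ * (vsock_find_connected_socket() and vsock_find_bound_socket() take that reference on success.)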
+ */ + sock_put(sk); + } + return; + +free_pkt: + virtio_transport_free_pkt(pkt); +} +EXPORT_SYMBOL_GPL(virtio_transport_recv_pkt); + +void virtio_transport_free_pkt(struct virtio_vsock_pkt *pkt) +{ + kfree(pkt->buf); + kfree(pkt); +} +EXPORT_SYMBOL_GPL(virtio_transport_free_pkt); + +static int __init virtio_vsock_common_init(void) +{ + get_random_bytes(vsockcookie_secret, sizeof(vsockcookie_secret)); + return 0; +} + +static void __exit virtio_vsock_common_exit(void) +{ +} + +module_init(virtio_vsock_common_init); +module_exit(virtio_vsock_common_exit); +MODULE_LICENSE("GPL v2"); +MODULE_AUTHOR("Asias He"); +MODULE_DESCRIPTION("common code for virtio vsock"); -- cgit v1.2.3-71-gd317 From 2f8364a291e8adde25c93f97a76abbcaf4b1ed3f Mon Sep 17 00:00:00 2001 From: Andrew Lunn Date: Thu, 3 Dec 2015 21:12:31 +0100 Subject: WAN: HDLC: Call notifiers before and after changing device type An HDLC device can change type when the protocol driver is changed. Calling the notifiers lets potential users of the interface know about the planned change, and even block it. After the change has occurred, a second notification is sent so that users can evaluate the new device type, etc. Signed-off-by: Andrew Lunn Signed-off-by: David S. Miller --- drivers/net/wan/hdlc.c | 19 +++++++++++++++++-- drivers/net/wan/hdlc_cisco.c | 1 + drivers/net/wan/hdlc_fr.c | 1 + drivers/net/wan/hdlc_ppp.c | 1 + drivers/net/wan/hdlc_raw.c | 1 + drivers/net/wan/hdlc_raw_eth.c | 1 + drivers/net/wan/hdlc_x25.c | 1 + include/linux/hdlc.h | 2 +- 8 files changed, 24 insertions(+), 3 deletions(-) (limited to 'include/linux') diff --git a/drivers/net/wan/hdlc.c b/drivers/net/wan/hdlc.c index 2a6595b4ae15..9bd4aa8083ce 100644 --- a/drivers/net/wan/hdlc.c +++ b/drivers/net/wan/hdlc.c @@ -276,7 +276,11 @@ void unregister_hdlc_device(struct net_device *dev) int attach_hdlc_protocol(struct net_device *dev, struct hdlc_proto *proto, size_t size) { - detach_hdlc_protocol(dev); + int err; + + err = detach_hdlc_protocol(dev); + if (err) + return err; if (!try_module_get(proto->module)) return -ENOSYS; @@ -289,15 +293,24 @@ int attach_hdlc_protocol(struct net_device *dev, struct hdlc_proto *proto, } } dev_to_hdlc(dev)->proto = proto; + return 0; } -void detach_hdlc_protocol(struct net_device *dev) +int detach_hdlc_protocol(struct net_device *dev) { hdlc_device *hdlc = dev_to_hdlc(dev); + int err; if (hdlc->proto) { + err = call_netdevice_notifiers(NETDEV_PRE_TYPE_CHANGE, dev); + err = notifier_to_errno(err); + if (err) { + netdev_err(dev, "Refused to change device type\n"); + return err; + } + if (hdlc->proto->detach) hdlc->proto->detach(dev); module_put(hdlc->proto->module); @@ -306,6 +319,8 @@ void detach_hdlc_protocol(struct net_device *dev) kfree(hdlc->state); hdlc->state = NULL; hdlc_setup_dev(dev); + + return 0; } diff --git a/drivers/net/wan/hdlc_cisco.c b/drivers/net/wan/hdlc_cisco.c index 3f20808b5ff8..a408abc25512 100644 --- a/drivers/net/wan/hdlc_cisco.c +++ b/drivers/net/wan/hdlc_cisco.c @@ -378,6 +378,7 @@ static int cisco_ioctl(struct net_device *dev, struct ifreq *ifr) spin_lock_init(&state(hdlc)->lock); dev->header_ops = &cisco_header_ops; dev->type = ARPHRD_CISCO; + call_netdevice_notifiers(NETDEV_POST_TYPE_CHANGE, dev); netif_dormant_on(dev); return 0; } diff --git a/drivers/net/wan/hdlc_fr.c b/drivers/net/wan/hdlc_fr.c index 89541cc90e87..b6e0cfb095d3 100644 --- a/drivers/net/wan/hdlc_fr.c +++ b/drivers/net/wan/hdlc_fr.c @@ -1240,6 +1240,7 @@ static int fr_ioctl(struct net_device *dev, struct ifreq *ifr) }
memcpy(&state(hdlc)->settings, &new_settings, size); dev->type = ARPHRD_FRAD; + call_netdevice_notifiers(NETDEV_POST_TYPE_CHANGE, dev); return 0; case IF_PROTO_FR_ADD_PVC: diff --git a/drivers/net/wan/hdlc_ppp.c b/drivers/net/wan/hdlc_ppp.c index 0d7645581f91..47fdb87d3567 100644 --- a/drivers/net/wan/hdlc_ppp.c +++ b/drivers/net/wan/hdlc_ppp.c @@ -687,6 +687,7 @@ static int ppp_ioctl(struct net_device *dev, struct ifreq *ifr) dev->hard_header_len = sizeof(struct hdlc_header); dev->header_ops = &ppp_header_ops; dev->type = ARPHRD_PPP; + call_netdevice_notifiers(NETDEV_POST_TYPE_CHANGE, dev); netif_dormant_on(dev); return 0; } diff --git a/drivers/net/wan/hdlc_raw.c b/drivers/net/wan/hdlc_raw.c index 5dc153e8a29d..4feb45001aac 100644 --- a/drivers/net/wan/hdlc_raw.c +++ b/drivers/net/wan/hdlc_raw.c @@ -84,6 +84,7 @@ static int raw_ioctl(struct net_device *dev, struct ifreq *ifr) return result; memcpy(hdlc->state, &new_settings, size); dev->type = ARPHRD_RAWHDLC; + call_netdevice_notifiers(NETDEV_POST_TYPE_CHANGE, dev); netif_dormant_off(dev); return 0; } diff --git a/drivers/net/wan/hdlc_raw_eth.c b/drivers/net/wan/hdlc_raw_eth.c index 3ab72b3082de..2f11836078ab 100644 --- a/drivers/net/wan/hdlc_raw_eth.c +++ b/drivers/net/wan/hdlc_raw_eth.c @@ -102,6 +102,7 @@ static int raw_eth_ioctl(struct net_device *dev, struct ifreq *ifr) ether_setup(dev); dev->tx_queue_len = old_qlen; eth_hw_addr_random(dev); + call_netdevice_notifiers(NETDEV_POST_TYPE_CHANGE, dev); netif_dormant_off(dev); return 0; } diff --git a/drivers/net/wan/hdlc_x25.c b/drivers/net/wan/hdlc_x25.c index a49aec5efd20..e867638067a6 100644 --- a/drivers/net/wan/hdlc_x25.c +++ b/drivers/net/wan/hdlc_x25.c @@ -213,6 +213,7 @@ static int x25_ioctl(struct net_device *dev, struct ifreq *ifr) if ((result = attach_hdlc_protocol(dev, &proto, 0))) return result; dev->type = ARPHRD_X25; + call_netdevice_notifiers(NETDEV_POST_TYPE_CHANGE, dev); netif_dormant_off(dev); return 0; } diff --git a/include/linux/hdlc.h b/include/linux/hdlc.h index 1acb1445e05f..e31bcd4c7859 100644 --- a/include/linux/hdlc.h +++ b/include/linux/hdlc.h @@ -101,7 +101,7 @@ netdev_tx_t hdlc_start_xmit(struct sk_buff *skb, struct net_device *dev); int attach_hdlc_protocol(struct net_device *dev, struct hdlc_proto *proto, size_t size); /* May be used by hardware driver to gain control over HDLC device */ -void detach_hdlc_protocol(struct net_device *dev); +int detach_hdlc_protocol(struct net_device *dev); static __inline__ __be16 hdlc_type_trans(struct sk_buff *skb, struct net_device *dev) -- cgit v1.2.3-71-gd317 From b618aaa91b5870e7bd139987ac4b7bf0851142d0 Mon Sep 17 00:00:00 2001 From: Jiri Pirko Date: Fri, 4 Dec 2015 15:01:31 +0100 Subject: net: constify netif_is_* helpers net_device param As suggested by Eric, these helpers should have const dev param. Suggested-by: Eric Dumazet Signed-off-by: Jiri Pirko Signed-off-by: David S. 
Miller --- include/linux/if_vlan.h | 2 +- include/linux/netdevice.h | 22 +++++++++++----------- net/core/dev.c | 2 +- 3 files changed, 13 insertions(+), 13 deletions(-) (limited to 'include/linux') diff --git a/include/linux/if_vlan.h b/include/linux/if_vlan.h index 67ce5bd3b56a..05f5879821b8 100644 --- a/include/linux/if_vlan.h +++ b/include/linux/if_vlan.h @@ -73,7 +73,7 @@ static inline struct vlan_ethhdr *vlan_eth_hdr(const struct sk_buff *skb) /* found in socket.c */ extern void vlan_ioctl_set(int (*hook)(struct net *, void __user *)); -static inline bool is_vlan_dev(struct net_device *dev) +static inline bool is_vlan_dev(const struct net_device *dev) { return dev->priv_flags & IFF_802_1Q_VLAN; } diff --git a/include/linux/netdevice.h b/include/linux/netdevice.h index 3efe017fe419..1bb21ff0fa64 100644 --- a/include/linux/netdevice.h +++ b/include/linux/netdevice.h @@ -3661,7 +3661,7 @@ extern u8 netdev_rss_key[NETDEV_RSS_KEY_LEN]; void netdev_rss_key_fill(void *buffer, size_t len); int dev_get_nest_level(struct net_device *dev, - bool (*type_check)(struct net_device *dev)); + bool (*type_check)(const struct net_device *dev)); int skb_checksum_help(struct sk_buff *skb); struct sk_buff *__skb_gso_segment(struct sk_buff *skb, netdev_features_t features, bool tx_path); @@ -3858,32 +3858,32 @@ static inline void skb_gso_error_unwind(struct sk_buff *skb, __be16 protocol, skb->mac_len = mac_len; } -static inline bool netif_is_macvlan(struct net_device *dev) +static inline bool netif_is_macvlan(const struct net_device *dev) { return dev->priv_flags & IFF_MACVLAN; } -static inline bool netif_is_macvlan_port(struct net_device *dev) +static inline bool netif_is_macvlan_port(const struct net_device *dev) { return dev->priv_flags & IFF_MACVLAN_PORT; } -static inline bool netif_is_ipvlan(struct net_device *dev) +static inline bool netif_is_ipvlan(const struct net_device *dev) { return dev->priv_flags & IFF_IPVLAN_SLAVE; } -static inline bool netif_is_ipvlan_port(struct net_device *dev) +static inline bool netif_is_ipvlan_port(const struct net_device *dev) { return dev->priv_flags & IFF_IPVLAN_MASTER; } -static inline bool netif_is_bond_master(struct net_device *dev) +static inline bool netif_is_bond_master(const struct net_device *dev) { return dev->flags & IFF_MASTER && dev->priv_flags & IFF_BONDING; } -static inline bool netif_is_bond_slave(struct net_device *dev) +static inline bool netif_is_bond_slave(const struct net_device *dev) { return dev->flags & IFF_SLAVE && dev->priv_flags & IFF_BONDING; } @@ -3918,22 +3918,22 @@ static inline bool netif_is_ovs_master(const struct net_device *dev) return dev->priv_flags & IFF_OPENVSWITCH; } -static inline bool netif_is_team_master(struct net_device *dev) +static inline bool netif_is_team_master(const struct net_device *dev) { return dev->priv_flags & IFF_TEAM; } -static inline bool netif_is_team_port(struct net_device *dev) +static inline bool netif_is_team_port(const struct net_device *dev) { return dev->priv_flags & IFF_TEAM_PORT; } -static inline bool netif_is_lag_master(struct net_device *dev) +static inline bool netif_is_lag_master(const struct net_device *dev) { return netif_is_bond_master(dev) || netif_is_team_master(dev); } -static inline bool netif_is_lag_port(struct net_device *dev) +static inline bool netif_is_lag_port(const struct net_device *dev) { return netif_is_bond_slave(dev) || netif_is_team_port(dev); } diff --git a/net/core/dev.c b/net/core/dev.c index d1706e88fbeb..e5c395473eba 100644 --- a/net/core/dev.c +++ b/net/core/dev.c @@ 
-5734,7 +5734,7 @@ EXPORT_SYMBOL(netdev_lower_dev_get_private); int dev_get_nest_level(struct net_device *dev, - bool (*type_check)(struct net_device *dev)) + bool (*type_check)(const struct net_device *dev)) { struct net_device *lower = NULL; struct list_head *iter; -- cgit v1.2.3-71-gd317 From 5f61385d2ebc2bd62bc389c7da0d8d2f263be1eb Mon Sep 17 00:00:00 2001 From: Moni Shoua Date: Sun, 6 Dec 2015 18:07:41 +0200 Subject: net/mlx4_core: Keep VLAN/MAC tables mirrored in multifunc HA mode Due to HW limitations, indexes into the MAC and VLAN tables are always taken from the table of the actual port. So, if a resource holds an index into a table, it may refer to different values during the lifetime of the resource, unless the tables are mirrored. Also, even when the driver is not in HA mode, indexes are allocated in a way that makes the eventual mirroring as likely as possible to succeed. This means that in multifunction mode the allocation of a free index in one port's table tries to make sure that the same index is also free in the other port's table. Signed-off-by: Moni Shoua Reviewed-by: Jack Morgenstein Signed-off-by: Or Gerlitz Signed-off-by: David S. Miller --- drivers/net/ethernet/mellanox/mlx4/mlx4.h | 6 + drivers/net/ethernet/mellanox/mlx4/port.c | 598 ++++++++++++++++++++++++++++-- include/linux/mlx4/driver.h | 5 + 3 files changed, 586 insertions(+), 23 deletions(-) (limited to 'include/linux') diff --git a/drivers/net/ethernet/mellanox/mlx4/mlx4.h b/drivers/net/ethernet/mellanox/mlx4/mlx4.h index 33c4c6f2c4bb..2404c22ad2b2 100644 --- a/drivers/net/ethernet/mellanox/mlx4/mlx4.h +++ b/drivers/net/ethernet/mellanox/mlx4/mlx4.h @@ -736,6 +736,7 @@ struct mlx4_catas_err { struct mlx4_mac_table { __be64 entries[MLX4_MAX_MAC_NUM]; int refs[MLX4_MAX_MAC_NUM]; + bool is_dup[MLX4_MAX_MAC_NUM]; struct mutex mutex; int total; int max; @@ -758,6 +759,7 @@ struct mlx4_roce_gid_table { struct mlx4_vlan_table { __be32 entries[MLX4_MAX_VLAN_NUM]; int refs[MLX4_MAX_VLAN_NUM]; + int is_dup[MLX4_MAX_VLAN_NUM]; struct mutex mutex; int total; int max; @@ -1225,6 +1227,10 @@ void mlx4_init_roce_gid_table(struct mlx4_dev *dev, struct mlx4_roce_gid_table *table); void __mlx4_unregister_vlan(struct mlx4_dev *dev, u8 port, u16 vlan); int __mlx4_register_vlan(struct mlx4_dev *dev, u8 port, u16 vlan, int *index); +int mlx4_bond_vlan_table(struct mlx4_dev *dev); +int mlx4_unbond_vlan_table(struct mlx4_dev *dev); +int mlx4_bond_mac_table(struct mlx4_dev *dev); +int mlx4_unbond_mac_table(struct mlx4_dev *dev); int mlx4_SET_PORT(struct mlx4_dev *dev, u8 port, int pkey_tbl_sz); /* resource tracker functions*/ diff --git a/drivers/net/ethernet/mellanox/mlx4/port.c b/drivers/net/ethernet/mellanox/mlx4/port.c index c2b21313dba7..f2550425c251 100644 --- a/drivers/net/ethernet/mellanox/mlx4/port.c +++ b/drivers/net/ethernet/mellanox/mlx4/port.c @@ -61,6 +61,7 @@ void mlx4_init_mac_table(struct mlx4_dev *dev, struct mlx4_mac_table *table) for (i = 0; i < MLX4_MAX_MAC_NUM; i++) { table->entries[i] = 0; table->refs[i] = 0; + table->is_dup[i] = false; } table->max = 1 << dev->caps.log_num_macs; table->total = 0; @@ -74,6 +75,7 @@ void mlx4_init_vlan_table(struct mlx4_dev *dev, struct mlx4_vlan_table *table) for (i = 0; i < MLX4_MAX_VLAN_NUM; i++) { table->entries[i] = 0; table->refs[i] = 0; + table->is_dup[i] = false; } table->max = (1 << dev->caps.log_num_vlans) - MLX4_VLAN_REGULAR; table->total = 0; @@ -159,21 +161,94 @@ int mlx4_find_cached_mac(struct mlx4_dev *dev, u8 port, u64 mac,
int *idx) } EXPORT_SYMBOL_GPL(mlx4_find_cached_mac); +static bool mlx4_need_mf_bond(struct mlx4_dev *dev) +{ + int i, num_eth_ports = 0; + + if (!mlx4_is_mfunc(dev)) + return false; + mlx4_foreach_port(i, dev, MLX4_PORT_TYPE_ETH) + ++num_eth_ports; + + return (num_eth_ports == 2) ? true : false; +} + int __mlx4_register_mac(struct mlx4_dev *dev, u8 port, u64 mac) { struct mlx4_port_info *info = &mlx4_priv(dev)->port[port]; struct mlx4_mac_table *table = &info->mac_table; int i, err = 0; int free = -1; + int free_for_dup = -1; + bool dup = mlx4_is_mf_bonded(dev); + u8 dup_port = (port == 1) ? 2 : 1; + struct mlx4_mac_table *dup_table = &mlx4_priv(dev)->port[dup_port].mac_table; + bool need_mf_bond = mlx4_need_mf_bond(dev); + bool can_mf_bond = true; + + mlx4_dbg(dev, "Registering MAC: 0x%llx for port %d %s duplicate\n", + (unsigned long long)mac, port, + dup ? "with" : "without"); + + if (need_mf_bond) { + if (port == 1) { + mutex_lock(&table->mutex); + mutex_lock(&dup_table->mutex); + } else { + mutex_lock(&dup_table->mutex); + mutex_lock(&table->mutex); + } + } else { + mutex_lock(&table->mutex); + } + + if (need_mf_bond) { + int index_at_port = -1; + int index_at_dup_port = -1; - mlx4_dbg(dev, "Registering MAC: 0x%llx for port %d\n", - (unsigned long long) mac, port); + for (i = 0; i < MLX4_MAX_MAC_NUM; i++) { + if (((MLX4_MAC_MASK & mac) == (MLX4_MAC_MASK & be64_to_cpu(table->entries[i])))) + index_at_port = i; + if (((MLX4_MAC_MASK & mac) == (MLX4_MAC_MASK & be64_to_cpu(dup_table->entries[i])))) + index_at_dup_port = i; + } + + /* check that same mac is not in the tables at different indices */ + if ((index_at_port != index_at_dup_port) && + (index_at_port >= 0) && + (index_at_dup_port >= 0)) + can_mf_bond = false; + + /* If the mac is already in the primary table, the slot must be + * available in the duplicate table as well. + */ + if (index_at_port >= 0 && index_at_dup_port < 0 && + dup_table->refs[index_at_port]) { + can_mf_bond = false; + } + /* If the mac is already in the duplicate table, check that the + * corresponding index is not occupied in the primary table, or + * the primary table already contains the mac at the same index. + * Otherwise, you cannot bond (primary contains a different mac + * at that index). 
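+ * When any of these checks fails, the MAC is still registered; registration simply proceeds without a mirrored entry in the other port's table.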
+ */ + if (index_at_dup_port >= 0) { + if (!table->refs[index_at_dup_port] || + ((MLX4_MAC_MASK & mac) == (MLX4_MAC_MASK & be64_to_cpu(table->entries[index_at_dup_port])))) + free_for_dup = index_at_dup_port; + else + can_mf_bond = false; + } + } - mutex_lock(&table->mutex); for (i = 0; i < MLX4_MAX_MAC_NUM; i++) { if (!table->refs[i]) { if (free < 0) free = i; + if (free_for_dup < 0 && need_mf_bond && can_mf_bond) { + if (!dup_table->refs[i]) + free_for_dup = i; + } continue; } @@ -182,10 +257,30 @@ int __mlx4_register_mac(struct mlx4_dev *dev, u8 port, u64 mac) /* MAC already registered, increment ref count */ err = i; ++table->refs[i]; + if (dup) { + u64 dup_mac = MLX4_MAC_MASK & be64_to_cpu(dup_table->entries[i]); + + if (dup_mac != mac || !dup_table->is_dup[i]) { + mlx4_warn(dev, "register mac: expect duplicate mac 0x%llx on port %d index %d\n", + mac, dup_port, i); + } + } goto out; } } + if (need_mf_bond && (free_for_dup < 0)) { + if (dup) { + mlx4_warn(dev, "Fail to allocate duplicate MAC table entry\n"); + mlx4_warn(dev, "High Availability for virtual functions may not work as expected\n"); + dup = false; + } + can_mf_bond = false; + } + + if (need_mf_bond && can_mf_bond) + free = free_for_dup; + mlx4_dbg(dev, "Free MAC index is %d\n", free); if (table->total == table->max) { @@ -205,10 +300,35 @@ int __mlx4_register_mac(struct mlx4_dev *dev, u8 port, u64 mac) goto out; } table->refs[free] = 1; - err = free; + table->is_dup[free] = false; ++table->total; + if (dup) { + dup_table->refs[free] = 0; + dup_table->is_dup[free] = true; + dup_table->entries[free] = cpu_to_be64(mac | MLX4_MAC_VALID); + + err = mlx4_set_port_mac_table(dev, dup_port, dup_table->entries); + if (unlikely(err)) { + mlx4_warn(dev, "Failed adding duplicate mac: 0x%llx\n", mac); + dup_table->is_dup[free] = false; + dup_table->entries[free] = 0; + goto out; + } + ++dup_table->total; + } + err = free; out: - mutex_unlock(&table->mutex); + if (need_mf_bond) { + if (port == 2) { + mutex_unlock(&table->mutex); + mutex_unlock(&dup_table->mutex); + } else { + mutex_unlock(&dup_table->mutex); + mutex_unlock(&table->mutex); + } + } else { + mutex_unlock(&table->mutex); + } return err; } EXPORT_SYMBOL_GPL(__mlx4_register_mac); @@ -255,6 +375,9 @@ void __mlx4_unregister_mac(struct mlx4_dev *dev, u8 port, u64 mac) struct mlx4_port_info *info; struct mlx4_mac_table *table; int index; + bool dup = mlx4_is_mf_bonded(dev); + u8 dup_port = (port == 1) ? 
2 : 1; + struct mlx4_mac_table *dup_table = &mlx4_priv(dev)->port[dup_port].mac_table; if (port < 1 || port > dev->caps.num_ports) { mlx4_warn(dev, "invalid port number (%d), aborting...\n", port); @@ -262,22 +385,59 @@ void __mlx4_unregister_mac(struct mlx4_dev *dev, u8 port, u64 mac) } info = &mlx4_priv(dev)->port[port]; table = &info->mac_table; - mutex_lock(&table->mutex); + + if (dup) { + if (port == 1) { + mutex_lock(&table->mutex); + mutex_lock(&dup_table->mutex); + } else { + mutex_lock(&dup_table->mutex); + mutex_lock(&table->mutex); + } + } else { + mutex_lock(&table->mutex); + } + index = find_index(dev, table, mac); if (validate_index(dev, table, index)) goto out; - if (--table->refs[index]) { + + if (--table->refs[index] || table->is_dup[index]) { mlx4_dbg(dev, "Have more references for index %d, no need to modify mac table\n", index); + if (!table->refs[index]) + dup_table->is_dup[index] = false; goto out; } table->entries[index] = 0; - mlx4_set_port_mac_table(dev, port, table->entries); + if (mlx4_set_port_mac_table(dev, port, table->entries)) + mlx4_warn(dev, "Fail to set mac in port %d during unregister\n", port); --table->total; + + if (dup) { + dup_table->is_dup[index] = false; + if (dup_table->refs[index]) + goto out; + dup_table->entries[index] = 0; + if (mlx4_set_port_mac_table(dev, dup_port, dup_table->entries)) + mlx4_warn(dev, "Fail to set mac in duplicate port %d during unregister\n", dup_port); + + --table->total; + } out: - mutex_unlock(&table->mutex); + if (dup) { + if (port == 2) { + mutex_unlock(&table->mutex); + mutex_unlock(&dup_table->mutex); + } else { + mutex_unlock(&dup_table->mutex); + mutex_unlock(&table->mutex); + } + } else { + mutex_unlock(&table->mutex); + } } EXPORT_SYMBOL_GPL(__mlx4_unregister_mac); @@ -311,9 +471,22 @@ int __mlx4_replace_mac(struct mlx4_dev *dev, u8 port, int qpn, u64 new_mac) struct mlx4_mac_table *table = &info->mac_table; int index = qpn - info->base_qpn; int err = 0; + bool dup = mlx4_is_mf_bonded(dev); + u8 dup_port = (port == 1) ? 
2 : 1; + struct mlx4_mac_table *dup_table = &mlx4_priv(dev)->port[dup_port].mac_table; /* CX1 doesn't support multi-functions */ - mutex_lock(&table->mutex); + if (dup) { + if (port == 1) { + mutex_lock(&table->mutex); + mutex_lock(&dup_table->mutex); + } else { + mutex_lock(&dup_table->mutex); + mutex_lock(&table->mutex); + } + } else { + mutex_lock(&table->mutex); + } err = validate_index(dev, table, index); if (err) @@ -326,9 +499,30 @@ int __mlx4_replace_mac(struct mlx4_dev *dev, u8 port, int qpn, u64 new_mac) mlx4_err(dev, "Failed adding MAC: 0x%llx\n", (unsigned long long) new_mac); table->entries[index] = 0; + } else { + if (dup) { + dup_table->entries[index] = cpu_to_be64(new_mac | MLX4_MAC_VALID); + + err = mlx4_set_port_mac_table(dev, dup_port, dup_table->entries); + if (unlikely(err)) { + mlx4_err(dev, "Failed adding duplicate MAC: 0x%llx\n", + (unsigned long long)new_mac); + dup_table->entries[index] = 0; + } + } } out: - mutex_unlock(&table->mutex); + if (dup) { + if (port == 2) { + mutex_unlock(&table->mutex); + mutex_unlock(&dup_table->mutex); + } else { + mutex_unlock(&dup_table->mutex); + mutex_unlock(&table->mutex); + } + } else { + mutex_unlock(&table->mutex); + } return err; } EXPORT_SYMBOL_GPL(__mlx4_replace_mac); @@ -380,8 +574,28 @@ int __mlx4_register_vlan(struct mlx4_dev *dev, u8 port, u16 vlan, struct mlx4_vlan_table *table = &mlx4_priv(dev)->port[port].vlan_table; int i, err = 0; int free = -1; - - mutex_lock(&table->mutex); + int free_for_dup = -1; + bool dup = mlx4_is_mf_bonded(dev); + u8 dup_port = (port == 1) ? 2 : 1; + struct mlx4_vlan_table *dup_table = &mlx4_priv(dev)->port[dup_port].vlan_table; + bool need_mf_bond = mlx4_need_mf_bond(dev); + bool can_mf_bond = true; + + mlx4_dbg(dev, "Registering VLAN: %d for port %d %s duplicate\n", + vlan, port, + dup ? "with" : "without"); + + if (need_mf_bond) { + if (port == 1) { + mutex_lock(&table->mutex); + mutex_lock(&dup_table->mutex); + } else { + mutex_lock(&dup_table->mutex); + mutex_lock(&table->mutex); + } + } else { + mutex_lock(&table->mutex); + } if (table->total == table->max) { /* No free vlan entries */ @@ -389,22 +603,85 @@ int __mlx4_register_vlan(struct mlx4_dev *dev, u8 port, u16 vlan, goto out; } + if (need_mf_bond) { + int index_at_port = -1; + int index_at_dup_port = -1; + + for (i = MLX4_VLAN_REGULAR; i < MLX4_MAX_VLAN_NUM; i++) { + if ((vlan == (MLX4_VLAN_MASK & be32_to_cpu(table->entries[i])))) + index_at_port = i; + if ((vlan == (MLX4_VLAN_MASK & be32_to_cpu(dup_table->entries[i])))) + index_at_dup_port = i; + } + /* check that same vlan is not in the tables at different indices */ + if ((index_at_port != index_at_dup_port) && + (index_at_port >= 0) && + (index_at_dup_port >= 0)) + can_mf_bond = false; + + /* If the vlan is already in the primary table, the slot must be + * available in the duplicate table as well. + */ + if (index_at_port >= 0 && index_at_dup_port < 0 && + dup_table->refs[index_at_port]) { + can_mf_bond = false; + } + /* If the vlan is already in the duplicate table, check that the + * corresponding index is not occupied in the primary table, or + * the primary table already contains the vlan at the same index. + * Otherwise, you cannot bond (primary contains a different vlan + * at that index). 
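+ * As with the MAC table, a failed check only disables mirroring; the VLAN itself is still registered.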
+ */ + if (index_at_dup_port >= 0) { + if (!table->refs[index_at_dup_port] || + (vlan == (MLX4_VLAN_MASK & be32_to_cpu(dup_table->entries[index_at_dup_port])))) + free_for_dup = index_at_dup_port; + else + can_mf_bond = false; + } + } + for (i = MLX4_VLAN_REGULAR; i < MLX4_MAX_VLAN_NUM; i++) { - if (free < 0 && (table->refs[i] == 0)) { - free = i; - continue; + if (!table->refs[i]) { + if (free < 0) + free = i; + if (free_for_dup < 0 && need_mf_bond && can_mf_bond) { + if (!dup_table->refs[i]) + free_for_dup = i; + } } - if (table->refs[i] && + if ((table->refs[i] || table->is_dup[i]) && (vlan == (MLX4_VLAN_MASK & be32_to_cpu(table->entries[i])))) { /* Vlan already registered, increase references count */ + mlx4_dbg(dev, "vlan %u is already registered.\n", vlan); *index = i; ++table->refs[i]; + if (dup) { + u16 dup_vlan = MLX4_VLAN_MASK & be32_to_cpu(dup_table->entries[i]); + + if (dup_vlan != vlan || !dup_table->is_dup[i]) { + mlx4_warn(dev, "register vlan: expected duplicate vlan %u on port %d index %d\n", + vlan, dup_port, i); + } + } goto out; } } + if (need_mf_bond && (free_for_dup < 0)) { + if (dup) { + mlx4_warn(dev, "Fail to allocate duplicate VLAN table entry\n"); + mlx4_warn(dev, "High Availability for virtual functions may not work as expected\n"); + dup = false; + } + can_mf_bond = false; + } + + if (need_mf_bond && can_mf_bond) + free = free_for_dup; + if (free < 0) { err = -ENOMEM; goto out; @@ -412,6 +689,7 @@ int __mlx4_register_vlan(struct mlx4_dev *dev, u8 port, u16 vlan, /* Register new VLAN */ table->refs[free] = 1; + table->is_dup[free] = false; table->entries[free] = cpu_to_be32(vlan | MLX4_VLAN_VALID); err = mlx4_set_port_vlan_table(dev, port, table->entries); @@ -421,11 +699,35 @@ int __mlx4_register_vlan(struct mlx4_dev *dev, u8 port, u16 vlan, table->entries[free] = 0; goto out; } + ++table->total; + if (dup) { + dup_table->refs[free] = 0; + dup_table->is_dup[free] = true; + dup_table->entries[free] = cpu_to_be32(vlan | MLX4_VLAN_VALID); + + err = mlx4_set_port_vlan_table(dev, dup_port, dup_table->entries); + if (unlikely(err)) { + mlx4_warn(dev, "Failed adding duplicate vlan: %u\n", vlan); + dup_table->is_dup[free] = false; + dup_table->entries[free] = 0; + goto out; + } + ++dup_table->total; + } *index = free; - ++table->total; out: - mutex_unlock(&table->mutex); + if (need_mf_bond) { + if (port == 2) { + mutex_unlock(&table->mutex); + mutex_unlock(&dup_table->mutex); + } else { + mutex_unlock(&dup_table->mutex); + mutex_unlock(&table->mutex); + } + } else { + mutex_unlock(&table->mutex); + } return err; } @@ -455,8 +757,22 @@ void __mlx4_unregister_vlan(struct mlx4_dev *dev, u8 port, u16 vlan) { struct mlx4_vlan_table *table = &mlx4_priv(dev)->port[port].vlan_table; int index; + bool dup = mlx4_is_mf_bonded(dev); + u8 dup_port = (port == 1) ? 
2 : 1; + struct mlx4_vlan_table *dup_table = &mlx4_priv(dev)->port[dup_port].vlan_table; + + if (dup) { + if (port == 1) { + mutex_lock(&table->mutex); + mutex_lock(&dup_table->mutex); + } else { + mutex_lock(&dup_table->mutex); + mutex_lock(&table->mutex); + } + } else { + mutex_lock(&table->mutex); + } - mutex_lock(&table->mutex); if (mlx4_find_cached_vlan(dev, port, vlan, &index)) { mlx4_warn(dev, "vlan 0x%x is not in the vlan table\n", vlan); goto out; @@ -467,16 +783,38 @@ void __mlx4_unregister_vlan(struct mlx4_dev *dev, u8 port, u16 vlan) goto out; } - if (--table->refs[index]) { + if (--table->refs[index] || table->is_dup[index]) { mlx4_dbg(dev, "Have %d more references for index %d, no need to modify vlan table\n", table->refs[index], index); + if (!table->refs[index]) + dup_table->is_dup[index] = false; goto out; } table->entries[index] = 0; - mlx4_set_port_vlan_table(dev, port, table->entries); + if (mlx4_set_port_vlan_table(dev, port, table->entries)) + mlx4_warn(dev, "Fail to set vlan in port %d during unregister\n", port); --table->total; + if (dup) { + dup_table->is_dup[index] = false; + if (dup_table->refs[index]) + goto out; + dup_table->entries[index] = 0; + if (mlx4_set_port_vlan_table(dev, dup_port, dup_table->entries)) + mlx4_warn(dev, "Fail to set vlan in duplicate port %d during unregister\n", dup_port); + --dup_table->total; + } out: - mutex_unlock(&table->mutex); + if (dup) { + if (port == 2) { + mutex_unlock(&table->mutex); + mutex_unlock(&dup_table->mutex); + } else { + mutex_unlock(&dup_table->mutex); + mutex_unlock(&table->mutex); + } + } else { + mutex_unlock(&table->mutex); + } } void mlx4_unregister_vlan(struct mlx4_dev *dev, u8 port, u16 vlan) @@ -495,6 +833,220 @@ void mlx4_unregister_vlan(struct mlx4_dev *dev, u8 port, u16 vlan) } EXPORT_SYMBOL_GPL(mlx4_unregister_vlan); +int mlx4_bond_mac_table(struct mlx4_dev *dev) +{ + struct mlx4_mac_table *t1 = &mlx4_priv(dev)->port[1].mac_table; + struct mlx4_mac_table *t2 = &mlx4_priv(dev)->port[2].mac_table; + int ret = 0; + int i; + bool update1 = false; + bool update2 = false; + + mutex_lock(&t1->mutex); + mutex_lock(&t2->mutex); + for (i = 0; i < MLX4_MAX_MAC_NUM; i++) { + if ((t1->entries[i] != t2->entries[i]) && + t1->entries[i] && t2->entries[i]) { + mlx4_warn(dev, "can't duplicate entry %d in mac table\n", i); + ret = -EINVAL; + goto unlock; + } + } + + for (i = 0; i < MLX4_MAX_MAC_NUM; i++) { + if (t1->entries[i] && !t2->entries[i]) { + t2->entries[i] = t1->entries[i]; + t2->is_dup[i] = true; + update2 = true; + } else if (!t1->entries[i] && t2->entries[i]) { + t1->entries[i] = t2->entries[i]; + t1->is_dup[i] = true; + update1 = true; + } else if (t1->entries[i] && t2->entries[i]) { + t1->is_dup[i] = true; + t2->is_dup[i] = true; + } + } + + if (update1) { + ret = mlx4_set_port_mac_table(dev, 1, t1->entries); + if (ret) + mlx4_warn(dev, "failed to set MAC table for port 1 (%d)\n", ret); + } + if (!ret && update2) { + ret = mlx4_set_port_mac_table(dev, 2, t2->entries); + if (ret) + mlx4_warn(dev, "failed to set MAC table for port 2 (%d)\n", ret); + } + + if (ret) + mlx4_warn(dev, "failed to create mirror MAC tables\n"); +unlock: + mutex_unlock(&t2->mutex); + mutex_unlock(&t1->mutex); + return ret; +} + +int mlx4_unbond_mac_table(struct mlx4_dev *dev) +{ + struct mlx4_mac_table *t1 = &mlx4_priv(dev)->port[1].mac_table; + struct mlx4_mac_table *t2 = &mlx4_priv(dev)->port[2].mac_table; + int ret = 0; + int ret1; + int i; + bool update1 = false; + bool update2 = false; + + mutex_lock(&t1->mutex); + 
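/* port 1 table before port 2 table: the same order the register/unregister paths take the two mutexes in, so the bond/unbond paths cannot deadlock against them. */ +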
mutex_lock(&t2->mutex); + for (i = 0; i < MLX4_MAX_MAC_NUM; i++) { + if (t1->entries[i] != t2->entries[i]) { + mlx4_warn(dev, "mac table is in an unexpected state when trying to unbond\n"); + ret = -EINVAL; + goto unlock; + } + } + + for (i = 0; i < MLX4_MAX_MAC_NUM; i++) { + if (!t1->entries[i]) + continue; + t1->is_dup[i] = false; + if (!t1->refs[i]) { + t1->entries[i] = 0; + update1 = true; + } + t2->is_dup[i] = false; + if (!t2->refs[i]) { + t2->entries[i] = 0; + update2 = true; + } + } + + if (update1) { + ret = mlx4_set_port_mac_table(dev, 1, t1->entries); + if (ret) + mlx4_warn(dev, "failed to unmirror MAC tables for port 1(%d)\n", ret); + } + if (update2) { + ret1 = mlx4_set_port_mac_table(dev, 2, t2->entries); + if (ret1) { + mlx4_warn(dev, "failed to unmirror MAC tables for port 2(%d)\n", ret1); + ret = ret1; + } + } +unlock: + mutex_unlock(&t2->mutex); + mutex_unlock(&t1->mutex); + return ret; +} + +int mlx4_bond_vlan_table(struct mlx4_dev *dev) +{ + struct mlx4_vlan_table *t1 = &mlx4_priv(dev)->port[1].vlan_table; + struct mlx4_vlan_table *t2 = &mlx4_priv(dev)->port[2].vlan_table; + int ret = 0; + int i; + bool update1 = false; + bool update2 = false; + + mutex_lock(&t1->mutex); + mutex_lock(&t2->mutex); + for (i = 0; i < MLX4_MAX_VLAN_NUM; i++) { + if ((t1->entries[i] != t2->entries[i]) && + t1->entries[i] && t2->entries[i]) { + mlx4_warn(dev, "can't duplicate entry %d in vlan table\n", i); + ret = -EINVAL; + goto unlock; + } + } + + for (i = 0; i < MLX4_MAX_VLAN_NUM; i++) { + if (t1->entries[i] && !t2->entries[i]) { + t2->entries[i] = t1->entries[i]; + t2->is_dup[i] = true; + update2 = true; + } else if (!t1->entries[i] && t2->entries[i]) { + t1->entries[i] = t2->entries[i]; + t1->is_dup[i] = true; + update1 = true; + } else if (t1->entries[i] && t2->entries[i]) { + t1->is_dup[i] = true; + t2->is_dup[i] = true; + } + } + + if (update1) { + ret = mlx4_set_port_vlan_table(dev, 1, t1->entries); + if (ret) + mlx4_warn(dev, "failed to set VLAN table for port 1 (%d)\n", ret); + } + if (!ret && update2) { + ret = mlx4_set_port_vlan_table(dev, 2, t2->entries); + if (ret) + mlx4_warn(dev, "failed to set VLAN table for port 2 (%d)\n", ret); + } + + if (ret) + mlx4_warn(dev, "failed to create mirror VLAN tables\n"); +unlock: + mutex_unlock(&t2->mutex); + mutex_unlock(&t1->mutex); + return ret; +} + +int mlx4_unbond_vlan_table(struct mlx4_dev *dev) +{ + struct mlx4_vlan_table *t1 = &mlx4_priv(dev)->port[1].vlan_table; + struct mlx4_vlan_table *t2 = &mlx4_priv(dev)->port[2].vlan_table; + int ret = 0; + int ret1; + int i; + bool update1 = false; + bool update2 = false; + + mutex_lock(&t1->mutex); + mutex_lock(&t2->mutex); + for (i = 0; i < MLX4_MAX_VLAN_NUM; i++) { + if (t1->entries[i] != t2->entries[i]) { + mlx4_warn(dev, "vlan table is in an unexpected state when trying to unbond\n"); + ret = -EINVAL; + goto unlock; + } + } + + for (i = 0; i < MLX4_MAX_VLAN_NUM; i++) { + if (!t1->entries[i]) + continue; + t1->is_dup[i] = false; + if (!t1->refs[i]) { + t1->entries[i] = 0; + update1 = true; + } + t2->is_dup[i] = false; + if (!t2->refs[i]) { + t2->entries[i] = 0; + update2 = true; + } + } + + if (update1) { + ret = mlx4_set_port_vlan_table(dev, 1, t1->entries); + if (ret) + mlx4_warn(dev, "failed to unmirror VLAN tables for port 1(%d)\n", ret); + } + if (update2) { + ret1 = mlx4_set_port_vlan_table(dev, 2, t2->entries); + if (ret1) { + mlx4_warn(dev, "failed to unmirror VLAN tables for port 2(%d)\n", ret1); + ret = ret1; + } + } +unlock: + mutex_unlock(&t2->mutex); + 
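/* dropped in reverse of the acquisition order */ +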
mutex_unlock(&t1->mutex); + return ret; +} + int mlx4_get_port_ib_caps(struct mlx4_dev *dev, u8 port, __be32 *caps) { struct mlx4_cmd_mailbox *inmailbox, *outmailbox; diff --git a/include/linux/mlx4/driver.h b/include/linux/mlx4/driver.h index 5a06d969338e..2e8af001c5da 100644 --- a/include/linux/mlx4/driver.h +++ b/include/linux/mlx4/driver.h @@ -75,6 +75,11 @@ static inline int mlx4_is_bonded(struct mlx4_dev *dev) return !!(dev->flags & MLX4_FLAG_BONDED); } +static inline int mlx4_is_mf_bonded(struct mlx4_dev *dev) +{ + return (mlx4_is_bonded(dev) && mlx4_is_mfunc(dev)); +} + struct mlx4_port_map { u8 port1; u8 port2; -- cgit v1.2.3-71-gd317 From ea3793ee29d3621faf857fa8ef5425e9ff9a756d Mon Sep 17 00:00:00 2001 From: Rainer Weikusat Date: Sun, 6 Dec 2015 21:11:34 +0000 Subject: core: enable more fine-grained datagram reception control The __skb_recv_datagram routine in core/datagram.c provides a general skb reception facility intended to be used by protocol modules providing datagram sockets. It encompasses both the actual recvmsg code and a surrounding 'sleep until data is available' loop. This is inconvenient if a protocol module has to use additional locking in order to maintain some per-socket state the generic datagram socket code is unaware of (as the af_unix code does). The patch below moves the recvmsg proper code into a new __skb_try_recv_datagram routine which doesn't sleep and renames wait_for_more_packets to __skb_wait_for_more_packets, both routines being exported interfaces. The original __skb_recv_datagram routine is reimplemented on top of these two functions such that its user-visible behaviour remains unchanged. Signed-off-by: Rainer Weikusat Signed-off-by: David S. Miller --- include/linux/skbuff.h | 6 ++++ net/core/datagram.c | 77 +++++++++++++++++++++++++++++++------------------- 2 files changed, 54 insertions(+), 29 deletions(-) (limited to 'include/linux') diff --git a/include/linux/skbuff.h b/include/linux/skbuff.h index c9c394bf0771..9b9b9ead7bb3 100644 --- a/include/linux/skbuff.h +++ b/include/linux/skbuff.h @@ -2785,6 +2785,12 @@ static inline void skb_frag_list_init(struct sk_buff *skb) #define skb_walk_frags(skb, iter) \ for (iter = skb_shinfo(skb)->frag_list; iter; iter = iter->next) + +int __skb_wait_for_more_packets(struct sock *sk, int *err, long *timeo_p, + const struct sk_buff *skb); +struct sk_buff *__skb_try_recv_datagram(struct sock *sk, unsigned flags, + int *peeked, int *off, int *err, + struct sk_buff **last); struct sk_buff *__skb_recv_datagram(struct sock *sk, unsigned flags, int *peeked, int *off, int *err); struct sk_buff *skb_recv_datagram(struct sock *sk, unsigned flags, int noblock, diff --git a/net/core/datagram.c b/net/core/datagram.c index d62af69ad844..7daff66d3d0b 100644 --- a/net/core/datagram.c +++ b/net/core/datagram.c @@ -83,8 +83,8 @@ static int receiver_wake_function(wait_queue_t *wait, unsigned int mode, int syn /* * Wait for the last received packet to be different from skb */ -static int wait_for_more_packets(struct sock *sk, int *err, long *timeo_p, - const struct sk_buff *skb) +int __skb_wait_for_more_packets(struct sock *sk, int *err, long *timeo_p, + const struct sk_buff *skb) { int error; DEFINE_WAIT_FUNC(wait, receiver_wake_function); @@ -130,6 +130,7 @@ out_noerr: error = 1; goto out; } +EXPORT_SYMBOL(__skb_wait_for_more_packets); static struct sk_buff *skb_set_peeked(struct sk_buff *skb) { @@ -161,13 +162,15 @@ done: } /** - * __skb_recv_datagram - Receive
a datagram skbuff * @sk: socket * @flags: MSG_ flags * @peeked: returns non-zero if this packet has been seen before * @off: an offset in bytes to peek skb from. Returns an offset * within an skb where data actually starts * @err: error code returned + * @last: set to last peeked message to inform the wait function + * what to look for when peeking * * Get a datagram skbuff, understands the peeking, nonblocking wakeups * and possible races. This replaces identical code in packet, raw and @@ -175,9 +178,11 @@ done: * the long standing peek and read race for datagram sockets. If you * alter this routine remember it must be re-entrant. * - * This function will lock the socket if a skb is returned, so the caller - * needs to unlock the socket in that case (usually by calling - * skb_free_datagram) + * This function will lock the socket if a skb is returned, so + * the caller needs to unlock the socket in that case (usually by + * calling skb_free_datagram). Returns NULL with *err set to + * -EAGAIN if no data was available or to some other value if an + * error was detected. * * * It does not lock socket since today. This function is * * free of race conditions. This measure should/can improve @@ -191,13 +196,13 @@ done: * quite explicitly by POSIX 1003.1g, don't change them without having * the standard around please. */ -struct sk_buff *__skb_recv_datagram(struct sock *sk, unsigned int flags, - int *peeked, int *off, int *err) +struct sk_buff *__skb_try_recv_datagram(struct sock *sk, unsigned int flags, + int *peeked, int *off, int *err, + struct sk_buff **last) { struct sk_buff_head *queue = &sk->sk_receive_queue; - struct sk_buff *skb, *last; + struct sk_buff *skb; unsigned long cpu_flags; - long timeo; /* * Caller is allowed not to check sk->sk_err before skb_recv_datagram() */ @@ -206,8 +211,6 @@ struct sk_buff *__skb_recv_datagram(struct sock *sk, unsigned int flags, if (error) goto no_packet; - timeo = sock_rcvtimeo(sk, flags & MSG_DONTWAIT); - do { /* Again only user level code calls this function, so nothing * interrupt level will suddenly eat the receive_queue. 
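The reimplementation below also shows the intended calling pattern for protocols with private locking: run the non-sleeping half under the protocol's own lock, drop that lock, and only then sleep. A minimal sketch of such a caller (proto_sk() and its iolock mutex are hypothetical stand-ins for a protocol's private state; error handling elided): static struct sk_buff *proto_recv_skb(struct sock *sk, unsigned int flags, int *err) { long timeo = sock_rcvtimeo(sk, flags & MSG_DONTWAIT); struct sk_buff *skb, *last; int peeked, off = 0; do { mutex_lock(&proto_sk(sk)->iolock); /* protocol-private state is stable here */ skb = __skb_try_recv_datagram(sk, flags, &peeked, &off, err, &last); mutex_unlock(&proto_sk(sk)->iolock); /* never sleep while holding it */ if (skb) return skb; if (*err != -EAGAIN) break; } while (timeo && !__skb_wait_for_more_packets(sk, err, &timeo, last)); return NULL; }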
@@ -217,10 +220,10 @@ struct sk_buff *__skb_recv_datagram(struct sock *sk, unsigned int flags, */ int _off = *off; - last = (struct sk_buff *)queue; + *last = (struct sk_buff *)queue; spin_lock_irqsave(&queue->lock, cpu_flags); skb_queue_walk(queue, skb) { - last = skb; + *last = skb; *peeked = skb->peeked; if (flags & MSG_PEEK) { if (_off >= skb->len && (skb->len || _off || @@ -231,8 +234,11 @@ struct sk_buff *__skb_recv_datagram(struct sock *sk, unsigned int flags, skb = skb_set_peeked(skb); error = PTR_ERR(skb); - if (IS_ERR(skb)) - goto unlock_err; + if (IS_ERR(skb)) { + spin_unlock_irqrestore(&queue->lock, + cpu_flags); + goto no_packet; + } atomic_inc(&skb->users); } else @@ -242,25 +248,38 @@ struct sk_buff *__skb_recv_datagram(struct sock *sk, unsigned int flags, *off = _off; return skb; } + spin_unlock_irqrestore(&queue->lock, cpu_flags); + } while (sk_can_busy_loop(sk) && + sk_busy_loop(sk, flags & MSG_DONTWAIT)); - if (sk_can_busy_loop(sk) && - sk_busy_loop(sk, flags & MSG_DONTWAIT)) - continue; + error = -EAGAIN; - /* User doesn't want to wait */ - error = -EAGAIN; - if (!timeo) - goto no_packet; +no_packet: + *err = error; + return NULL; +} +EXPORT_SYMBOL(__skb_try_recv_datagram); - } while (!wait_for_more_packets(sk, err, &timeo, last)); +struct sk_buff *__skb_recv_datagram(struct sock *sk, unsigned int flags, + int *peeked, int *off, int *err) +{ + struct sk_buff *skb, *last; + long timeo; - return NULL; + timeo = sock_rcvtimeo(sk, flags & MSG_DONTWAIT); + + do { + skb = __skb_try_recv_datagram(sk, flags, peeked, off, err, + &last); + if (skb) + return skb; + + if (*err != -EAGAIN) + break; + } while (timeo && + !__skb_wait_for_more_packets(sk, err, &timeo, last)); -unlock_err: - spin_unlock_irqrestore(&queue->lock, cpu_flags); -no_packet: - *err = error; return NULL; } EXPORT_SYMBOL(__skb_recv_datagram); -- cgit v1.2.3-71-gd317 From 8ac2837c89c8c0fcad557e4380aeef80580390f9 Mon Sep 17 00:00:00 2001 From: Stefan Hajnoczi Date: Wed, 9 Dec 2015 10:51:12 +0800 Subject: Revert "Merge branch 'vsock-virtio'" This reverts commit 0d76d6e8b2507983a2cae4c09880798079007421 and merge commit c402293bd76fbc93e52ef8c0947ab81eea3ae019, reversing changes made to c89359a42e2a49656451569c382eed63e781153c. The virtio-vsock device specification is not finalized yet. Michael Tsirkin voiced concerns about merging this code when the hardware interface (and possibly the userspace interface) could still change. Signed-off-by: Stefan Hajnoczi Signed-off-by: David S.
Miller --- drivers/vhost/Kconfig | 4 - drivers/vhost/Kconfig.vsock | 7 - drivers/vhost/Makefile | 4 - drivers/vhost/vsock.c | 630 --------------- drivers/vhost/vsock.h | 4 - include/linux/virtio_vsock.h | 209 ----- include/net/af_vsock.h | 2 - include/uapi/linux/virtio_ids.h | 1 - include/uapi/linux/virtio_vsock.h | 89 --- net/vmw_vsock/Kconfig | 18 - net/vmw_vsock/Makefile | 2 - net/vmw_vsock/af_vsock.c | 70 -- net/vmw_vsock/virtio_transport.c | 466 ----------- net/vmw_vsock/virtio_transport_common.c | 1272 ------------------------------- 14 files changed, 2778 deletions(-) delete mode 100644 drivers/vhost/Kconfig.vsock delete mode 100644 drivers/vhost/vsock.c delete mode 100644 drivers/vhost/vsock.h delete mode 100644 include/linux/virtio_vsock.h delete mode 100644 include/uapi/linux/virtio_vsock.h delete mode 100644 net/vmw_vsock/virtio_transport.c delete mode 100644 net/vmw_vsock/virtio_transport_common.c (limited to 'include/linux') diff --git a/drivers/vhost/Kconfig b/drivers/vhost/Kconfig index 81449bfc8d3b..533eaf04f12f 100644 --- a/drivers/vhost/Kconfig +++ b/drivers/vhost/Kconfig @@ -47,7 +47,3 @@ config VHOST_CROSS_ENDIAN_LEGACY adds some overhead, it is disabled by default. If unsure, say "N". - -if STAGING -source "drivers/vhost/Kconfig.vsock" -endif diff --git a/drivers/vhost/Kconfig.vsock b/drivers/vhost/Kconfig.vsock deleted file mode 100644 index 3491865d3eb9..000000000000 --- a/drivers/vhost/Kconfig.vsock +++ /dev/null @@ -1,7 +0,0 @@ -config VHOST_VSOCK - tristate "vhost virtio-vsock driver" - depends on VSOCKETS && EVENTFD - select VIRTIO_VSOCKETS_COMMON - default n - ---help--- - Say M here to enable the vhost-vsock for virtio-vsock guests diff --git a/drivers/vhost/Makefile b/drivers/vhost/Makefile index 6b012b986b57..e0441c34db1c 100644 --- a/drivers/vhost/Makefile +++ b/drivers/vhost/Makefile @@ -4,9 +4,5 @@ vhost_net-y := net.o obj-$(CONFIG_VHOST_SCSI) += vhost_scsi.o vhost_scsi-y := scsi.o -obj-$(CONFIG_VHOST_VSOCK) += vhost_vsock.o -vhost_vsock-y := vsock.o - obj-$(CONFIG_VHOST_RING) += vringh.o - obj-$(CONFIG_VHOST) += vhost.o diff --git a/drivers/vhost/vsock.c b/drivers/vhost/vsock.c deleted file mode 100644 index 64bcb10bb901..000000000000 --- a/drivers/vhost/vsock.c +++ /dev/null @@ -1,630 +0,0 @@ -/* - * vhost transport for vsock - * - * Copyright (C) 2013-2015 Red Hat, Inc. - * Author: Asias He - * Stefan Hajnoczi - * - * This work is licensed under the terms of the GNU GPL, version 2. - */ -#include -#include -#include -#include -#include -#include - -#include -#include "vhost.h" -#include "vsock.h" - -#define VHOST_VSOCK_DEFAULT_HOST_CID 2 - -static int vhost_transport_socket_init(struct vsock_sock *vsk, - struct vsock_sock *psk); - -enum { - VHOST_VSOCK_FEATURES = VHOST_FEATURES, -}; - -/* Used to track all the vhost_vsock instances on the system. 
*/ -static LIST_HEAD(vhost_vsock_list); -static DEFINE_MUTEX(vhost_vsock_mutex); - -struct vhost_vsock_virtqueue { - struct vhost_virtqueue vq; -}; - -struct vhost_vsock { - /* Vhost device */ - struct vhost_dev dev; - /* Vhost vsock virtqueue*/ - struct vhost_vsock_virtqueue vqs[VSOCK_VQ_MAX]; - /* Link to global vhost_vsock_list*/ - struct list_head list; - /* Head for pkt from host to guest */ - struct list_head send_pkt_list; - /* Work item to send pkt */ - struct vhost_work send_pkt_work; - /* Wait queue for send pkt */ - wait_queue_head_t queue_wait; - /* Used for global tx buf limitation */ - u32 total_tx_buf; - /* Guest contex id this vhost_vsock instance handles */ - u32 guest_cid; -}; - -static u32 vhost_transport_get_local_cid(void) -{ - return VHOST_VSOCK_DEFAULT_HOST_CID; -} - -static struct vhost_vsock *vhost_vsock_get(u32 guest_cid) -{ - struct vhost_vsock *vsock; - - mutex_lock(&vhost_vsock_mutex); - list_for_each_entry(vsock, &vhost_vsock_list, list) { - if (vsock->guest_cid == guest_cid) { - mutex_unlock(&vhost_vsock_mutex); - return vsock; - } - } - mutex_unlock(&vhost_vsock_mutex); - - return NULL; -} - -static void -vhost_transport_do_send_pkt(struct vhost_vsock *vsock, - struct vhost_virtqueue *vq) -{ - bool added = false; - - mutex_lock(&vq->mutex); - vhost_disable_notify(&vsock->dev, vq); - for (;;) { - struct virtio_vsock_pkt *pkt; - struct iov_iter iov_iter; - unsigned out, in; - struct sock *sk; - size_t nbytes; - size_t len; - int head; - - if (list_empty(&vsock->send_pkt_list)) { - vhost_enable_notify(&vsock->dev, vq); - break; - } - - head = vhost_get_vq_desc(vq, vq->iov, ARRAY_SIZE(vq->iov), - &out, &in, NULL, NULL); - pr_debug("%s: head = %d\n", __func__, head); - if (head < 0) - break; - - if (head == vq->num) { - if (unlikely(vhost_enable_notify(&vsock->dev, vq))) { - vhost_disable_notify(&vsock->dev, vq); - continue; - } - break; - } - - pkt = list_first_entry(&vsock->send_pkt_list, - struct virtio_vsock_pkt, list); - list_del_init(&pkt->list); - - if (out) { - virtio_transport_free_pkt(pkt); - vq_err(vq, "Expected 0 output buffers, got %u\n", out); - break; - } - - len = iov_length(&vq->iov[out], in); - iov_iter_init(&iov_iter, READ, &vq->iov[out], in, len); - - nbytes = copy_to_iter(&pkt->hdr, sizeof(pkt->hdr), &iov_iter); - if (nbytes != sizeof(pkt->hdr)) { - virtio_transport_free_pkt(pkt); - vq_err(vq, "Faulted on copying pkt hdr\n"); - break; - } - - nbytes = copy_to_iter(pkt->buf, pkt->len, &iov_iter); - if (nbytes != pkt->len) { - virtio_transport_free_pkt(pkt); - vq_err(vq, "Faulted on copying pkt buf\n"); - break; - } - - vhost_add_used(vq, head, pkt->len); /* TODO should this be sizeof(pkt->hdr) + pkt->len? 
*/ - added = true; - - virtio_transport_dec_tx_pkt(pkt); - vsock->total_tx_buf -= pkt->len; - - sk = sk_vsock(pkt->trans->vsk); - /* Release refcnt taken in vhost_transport_send_pkt */ - sock_put(sk); - - virtio_transport_free_pkt(pkt); - } - if (added) - vhost_signal(&vsock->dev, vq); - mutex_unlock(&vq->mutex); - - if (added) - wake_up(&vsock->queue_wait); -} - -static void vhost_transport_send_pkt_work(struct vhost_work *work) -{ - struct vhost_virtqueue *vq; - struct vhost_vsock *vsock; - - vsock = container_of(work, struct vhost_vsock, send_pkt_work); - vq = &vsock->vqs[VSOCK_VQ_RX].vq; - - vhost_transport_do_send_pkt(vsock, vq); -} - -static int -vhost_transport_send_pkt(struct vsock_sock *vsk, - struct virtio_vsock_pkt_info *info) -{ - u32 src_cid, src_port, dst_cid, dst_port; - struct virtio_transport *trans; - struct virtio_vsock_pkt *pkt; - struct vhost_virtqueue *vq; - struct vhost_vsock *vsock; - u32 pkt_len = info->pkt_len; - DEFINE_WAIT(wait); - - src_cid = vhost_transport_get_local_cid(); - src_port = vsk->local_addr.svm_port; - if (!info->remote_cid) { - dst_cid = vsk->remote_addr.svm_cid; - dst_port = vsk->remote_addr.svm_port; - } else { - dst_cid = info->remote_cid; - dst_port = info->remote_port; - } - - /* Find the vhost_vsock according to guest context id */ - vsock = vhost_vsock_get(dst_cid); - if (!vsock) - return -ENODEV; - - trans = vsk->trans; - vq = &vsock->vqs[VSOCK_VQ_RX].vq; - - /* we can send less than pkt_len bytes */ - if (pkt_len > VIRTIO_VSOCK_DEFAULT_RX_BUF_SIZE) - pkt_len = VIRTIO_VSOCK_DEFAULT_RX_BUF_SIZE; - - /* virtio_transport_get_credit might return less than pkt_len credit */ - pkt_len = virtio_transport_get_credit(trans, pkt_len); - - /* Do not send zero length OP_RW pkt*/ - if (pkt_len == 0 && info->op == VIRTIO_VSOCK_OP_RW) - return pkt_len; - - /* Respect global tx buf limitation */ - mutex_lock(&vq->mutex); - while (pkt_len + vsock->total_tx_buf > VIRTIO_VSOCK_MAX_TX_BUF_SIZE) { - prepare_to_wait_exclusive(&vsock->queue_wait, &wait, - TASK_UNINTERRUPTIBLE); - mutex_unlock(&vq->mutex); - schedule(); - mutex_lock(&vq->mutex); - finish_wait(&vsock->queue_wait, &wait); - } - vsock->total_tx_buf += pkt_len; - mutex_unlock(&vq->mutex); - - pkt = virtio_transport_alloc_pkt(vsk, info, pkt_len, - src_cid, src_port, - dst_cid, dst_port); - if (!pkt) { - mutex_lock(&vq->mutex); - vsock->total_tx_buf -= pkt_len; - mutex_unlock(&vq->mutex); - virtio_transport_put_credit(trans, pkt_len); - return -ENOMEM; - } - - pr_debug("%s:info->pkt_len= %d\n", __func__, pkt_len); - /* Released in vhost_transport_do_send_pkt */ - sock_hold(&trans->vsk->sk); - virtio_transport_inc_tx_pkt(pkt); - - /* Queue it up in vhost work */ - mutex_lock(&vq->mutex); - list_add_tail(&pkt->list, &vsock->send_pkt_list); - vhost_work_queue(&vsock->dev, &vsock->send_pkt_work); - mutex_unlock(&vq->mutex); - - return pkt_len; -} - -static struct virtio_transport_pkt_ops vhost_ops = { - .send_pkt = vhost_transport_send_pkt, -}; - -static struct virtio_vsock_pkt * -vhost_vsock_alloc_pkt(struct vhost_virtqueue *vq, - unsigned int out, unsigned int in) -{ - struct virtio_vsock_pkt *pkt; - struct iov_iter iov_iter; - size_t nbytes; - size_t len; - - if (in != 0) { - vq_err(vq, "Expected 0 input buffers, got %u\n", in); - return NULL; - } - - pkt = kzalloc(sizeof(*pkt), GFP_KERNEL); - if (!pkt) - return NULL; - - len = iov_length(vq->iov, out); - iov_iter_init(&iov_iter, WRITE, vq->iov, out, len); - - nbytes = copy_from_iter(&pkt->hdr, sizeof(pkt->hdr), &iov_iter); - if (nbytes != 
sizeof(pkt->hdr)) { - vq_err(vq, "Expected %zu bytes for pkt->hdr, got %zu bytes\n", - sizeof(pkt->hdr), nbytes); - kfree(pkt); - return NULL; - } - - if (le16_to_cpu(pkt->hdr.type) == VIRTIO_VSOCK_TYPE_DGRAM) - pkt->len = le32_to_cpu(pkt->hdr.len) & 0XFFFF; - else if (le16_to_cpu(pkt->hdr.type) == VIRTIO_VSOCK_TYPE_STREAM) - pkt->len = le32_to_cpu(pkt->hdr.len); - - /* No payload */ - if (!pkt->len) - return pkt; - - /* The pkt is too big */ - if (pkt->len > VIRTIO_VSOCK_MAX_PKT_BUF_SIZE) { - kfree(pkt); - return NULL; - } - - pkt->buf = kmalloc(pkt->len, GFP_KERNEL); - if (!pkt->buf) { - kfree(pkt); - return NULL; - } - - nbytes = copy_from_iter(pkt->buf, pkt->len, &iov_iter); - if (nbytes != pkt->len) { - vq_err(vq, "Expected %u byte payload, got %zu bytes\n", - pkt->len, nbytes); - virtio_transport_free_pkt(pkt); - return NULL; - } - - return pkt; -} - -static void vhost_vsock_handle_ctl_kick(struct vhost_work *work) -{ - struct vhost_virtqueue *vq = container_of(work, struct vhost_virtqueue, - poll.work); - struct vhost_vsock *vsock = container_of(vq->dev, struct vhost_vsock, - dev); - - pr_debug("%s vq=%p, vsock=%p\n", __func__, vq, vsock); -} - -static void vhost_vsock_handle_tx_kick(struct vhost_work *work) -{ - struct vhost_virtqueue *vq = container_of(work, struct vhost_virtqueue, - poll.work); - struct vhost_vsock *vsock = container_of(vq->dev, struct vhost_vsock, - dev); - struct virtio_vsock_pkt *pkt; - int head; - unsigned int out, in; - bool added = false; - u32 len; - - mutex_lock(&vq->mutex); - vhost_disable_notify(&vsock->dev, vq); - for (;;) { - head = vhost_get_vq_desc(vq, vq->iov, ARRAY_SIZE(vq->iov), - &out, &in, NULL, NULL); - if (head < 0) - break; - - if (head == vq->num) { - if (unlikely(vhost_enable_notify(&vsock->dev, vq))) { - vhost_disable_notify(&vsock->dev, vq); - continue; - } - break; - } - - pkt = vhost_vsock_alloc_pkt(vq, out, in); - if (!pkt) { - vq_err(vq, "Faulted on pkt\n"); - continue; - } - - len = pkt->len; - - /* Only accept correctly addressed packets */ - if (le32_to_cpu(pkt->hdr.src_cid) == vsock->guest_cid && - le32_to_cpu(pkt->hdr.dst_cid) == vhost_transport_get_local_cid()) - virtio_transport_recv_pkt(pkt); - else - virtio_transport_free_pkt(pkt); - - vhost_add_used(vq, head, len); - added = true; - } - if (added) - vhost_signal(&vsock->dev, vq); - mutex_unlock(&vq->mutex); -} - -static void vhost_vsock_handle_rx_kick(struct vhost_work *work) -{ - struct vhost_virtqueue *vq = container_of(work, struct vhost_virtqueue, - poll.work); - struct vhost_vsock *vsock = container_of(vq->dev, struct vhost_vsock, - dev); - - vhost_transport_do_send_pkt(vsock, vq); -} - -static int vhost_vsock_dev_open(struct inode *inode, struct file *file) -{ - struct vhost_virtqueue **vqs; - struct vhost_vsock *vsock; - int ret; - - vsock = kzalloc(sizeof(*vsock), GFP_KERNEL); - if (!vsock) - return -ENOMEM; - - pr_debug("%s:vsock=%p\n", __func__, vsock); - - vqs = kmalloc(VSOCK_VQ_MAX * sizeof(*vqs), GFP_KERNEL); - if (!vqs) { - ret = -ENOMEM; - goto out; - } - - vqs[VSOCK_VQ_CTRL] = &vsock->vqs[VSOCK_VQ_CTRL].vq; - vqs[VSOCK_VQ_TX] = &vsock->vqs[VSOCK_VQ_TX].vq; - vqs[VSOCK_VQ_RX] = &vsock->vqs[VSOCK_VQ_RX].vq; - vsock->vqs[VSOCK_VQ_CTRL].vq.handle_kick = vhost_vsock_handle_ctl_kick; - vsock->vqs[VSOCK_VQ_TX].vq.handle_kick = vhost_vsock_handle_tx_kick; - vsock->vqs[VSOCK_VQ_RX].vq.handle_kick = vhost_vsock_handle_rx_kick; - - vhost_dev_init(&vsock->dev, vqs, VSOCK_VQ_MAX); - - file->private_data = vsock; - init_waitqueue_head(&vsock->queue_wait); - 
INIT_LIST_HEAD(&vsock->send_pkt_list); - vhost_work_init(&vsock->send_pkt_work, vhost_transport_send_pkt_work); - - mutex_lock(&vhost_vsock_mutex); - list_add_tail(&vsock->list, &vhost_vsock_list); - mutex_unlock(&vhost_vsock_mutex); - return 0; - -out: - kfree(vsock); - return ret; -} - -static void vhost_vsock_flush(struct vhost_vsock *vsock) -{ - int i; - - for (i = 0; i < VSOCK_VQ_MAX; i++) - vhost_poll_flush(&vsock->vqs[i].vq.poll); - vhost_work_flush(&vsock->dev, &vsock->send_pkt_work); -} - -static int vhost_vsock_dev_release(struct inode *inode, struct file *file) -{ - struct vhost_vsock *vsock = file->private_data; - - mutex_lock(&vhost_vsock_mutex); - list_del(&vsock->list); - mutex_unlock(&vhost_vsock_mutex); - - vhost_dev_stop(&vsock->dev); - vhost_vsock_flush(vsock); - vhost_dev_cleanup(&vsock->dev, false); - kfree(vsock->dev.vqs); - kfree(vsock); - return 0; -} - -static int vhost_vsock_set_cid(struct vhost_vsock *vsock, u32 guest_cid) -{ - struct vhost_vsock *other; - - /* Refuse reserved CIDs */ - if (guest_cid <= VMADDR_CID_HOST) { - return -EINVAL; - } - - /* Refuse if CID is already in use */ - other = vhost_vsock_get(guest_cid); - if (other && other != vsock) { - return -EADDRINUSE; - } - - mutex_lock(&vhost_vsock_mutex); - vsock->guest_cid = guest_cid; - pr_debug("%s:guest_cid=%d\n", __func__, guest_cid); - mutex_unlock(&vhost_vsock_mutex); - - return 0; -} - -static int vhost_vsock_set_features(struct vhost_vsock *vsock, u64 features) -{ - struct vhost_virtqueue *vq; - int i; - - if (features & ~VHOST_VSOCK_FEATURES) - return -EOPNOTSUPP; - - mutex_lock(&vsock->dev.mutex); - if ((features & (1 << VHOST_F_LOG_ALL)) && - !vhost_log_access_ok(&vsock->dev)) { - mutex_unlock(&vsock->dev.mutex); - return -EFAULT; - } - - for (i = 0; i < VSOCK_VQ_MAX; i++) { - vq = &vsock->vqs[i].vq; - mutex_lock(&vq->mutex); - vq->acked_features = features; - mutex_unlock(&vq->mutex); - } - mutex_unlock(&vsock->dev.mutex); - return 0; -} - -static long vhost_vsock_dev_ioctl(struct file *f, unsigned int ioctl, - unsigned long arg) -{ - struct vhost_vsock *vsock = f->private_data; - void __user *argp = (void __user *)arg; - u64 __user *featurep = argp; - u32 __user *cidp = argp; - u32 guest_cid; - u64 features; - int r; - - switch (ioctl) { - case VHOST_VSOCK_SET_GUEST_CID: - if (get_user(guest_cid, cidp)) - return -EFAULT; - return vhost_vsock_set_cid(vsock, guest_cid); - case VHOST_GET_FEATURES: - features = VHOST_VSOCK_FEATURES; - if (copy_to_user(featurep, &features, sizeof(features))) - return -EFAULT; - return 0; - case VHOST_SET_FEATURES: - if (copy_from_user(&features, featurep, sizeof(features))) - return -EFAULT; - return vhost_vsock_set_features(vsock, features); - default: - mutex_lock(&vsock->dev.mutex); - r = vhost_dev_ioctl(&vsock->dev, ioctl, argp); - if (r == -ENOIOCTLCMD) - r = vhost_vring_ioctl(&vsock->dev, ioctl, argp); - else - vhost_vsock_flush(vsock); - mutex_unlock(&vsock->dev.mutex); - return r; - } -} - -static const struct file_operations vhost_vsock_fops = { - .owner = THIS_MODULE, - .open = vhost_vsock_dev_open, - .release = vhost_vsock_dev_release, - .llseek = noop_llseek, - .unlocked_ioctl = vhost_vsock_dev_ioctl, -}; - -static struct miscdevice vhost_vsock_misc = { - .minor = MISC_DYNAMIC_MINOR, - .name = "vhost-vsock", - .fops = &vhost_vsock_fops, -}; - -static int -vhost_transport_socket_init(struct vsock_sock *vsk, struct vsock_sock *psk) -{ - struct virtio_transport *trans; - int ret; - - ret = virtio_transport_do_socket_init(vsk, psk); - if (ret) - return 
ret; - - trans = vsk->trans; - trans->ops = &vhost_ops; - - return ret; -} - -static struct vsock_transport vhost_transport = { - .get_local_cid = vhost_transport_get_local_cid, - - .init = vhost_transport_socket_init, - .destruct = virtio_transport_destruct, - .release = virtio_transport_release, - .connect = virtio_transport_connect, - .shutdown = virtio_transport_shutdown, - - .dgram_enqueue = virtio_transport_dgram_enqueue, - .dgram_dequeue = virtio_transport_dgram_dequeue, - .dgram_bind = virtio_transport_dgram_bind, - .dgram_allow = virtio_transport_dgram_allow, - - .stream_enqueue = virtio_transport_stream_enqueue, - .stream_dequeue = virtio_transport_stream_dequeue, - .stream_has_data = virtio_transport_stream_has_data, - .stream_has_space = virtio_transport_stream_has_space, - .stream_rcvhiwat = virtio_transport_stream_rcvhiwat, - .stream_is_active = virtio_transport_stream_is_active, - .stream_allow = virtio_transport_stream_allow, - - .notify_poll_in = virtio_transport_notify_poll_in, - .notify_poll_out = virtio_transport_notify_poll_out, - .notify_recv_init = virtio_transport_notify_recv_init, - .notify_recv_pre_block = virtio_transport_notify_recv_pre_block, - .notify_recv_pre_dequeue = virtio_transport_notify_recv_pre_dequeue, - .notify_recv_post_dequeue = virtio_transport_notify_recv_post_dequeue, - .notify_send_init = virtio_transport_notify_send_init, - .notify_send_pre_block = virtio_transport_notify_send_pre_block, - .notify_send_pre_enqueue = virtio_transport_notify_send_pre_enqueue, - .notify_send_post_enqueue = virtio_transport_notify_send_post_enqueue, - - .set_buffer_size = virtio_transport_set_buffer_size, - .set_min_buffer_size = virtio_transport_set_min_buffer_size, - .set_max_buffer_size = virtio_transport_set_max_buffer_size, - .get_buffer_size = virtio_transport_get_buffer_size, - .get_min_buffer_size = virtio_transport_get_min_buffer_size, - .get_max_buffer_size = virtio_transport_get_max_buffer_size, -}; - -static int __init vhost_vsock_init(void) -{ - int ret; - - ret = vsock_core_init(&vhost_transport); - if (ret < 0) - return ret; - return misc_register(&vhost_vsock_misc); -}; - -static void __exit vhost_vsock_exit(void) -{ - misc_deregister(&vhost_vsock_misc); - vsock_core_exit(); -}; - -module_init(vhost_vsock_init); -module_exit(vhost_vsock_exit); -MODULE_LICENSE("GPL v2"); -MODULE_AUTHOR("Asias He"); -MODULE_DESCRIPTION("vhost transport for vsock "); diff --git a/drivers/vhost/vsock.h b/drivers/vhost/vsock.h deleted file mode 100644 index 0ddb107b86ca..000000000000 --- a/drivers/vhost/vsock.h +++ /dev/null @@ -1,4 +0,0 @@ -#ifndef VHOST_VSOCK_H -#define VHOST_VSOCK_H -#define VHOST_VSOCK_SET_GUEST_CID _IOW(VHOST_VIRTIO, 0x60, __u32) -#endif diff --git a/include/linux/virtio_vsock.h b/include/linux/virtio_vsock.h deleted file mode 100644 index a5f3ecc038f7..000000000000 --- a/include/linux/virtio_vsock.h +++ /dev/null @@ -1,209 +0,0 @@ -/* - * This header, excluding the #ifdef __KERNEL__ part, is BSD licensed so - * anyone can use the definitions to implement compatible drivers/servers: - * - * - * Redistribution and use in source and binary forms, with or without - * modification, are permitted provided that the following conditions - * are met: - * 1. Redistributions of source code must retain the above copyright - * notice, this list of conditions and the following disclaimer. - * 2. 
Redistributions in binary form must reproduce the above copyright - * notice, this list of conditions and the following disclaimer in the - * documentation and/or other materials provided with the distribution. - * 3. Neither the name of IBM nor the names of its contributors - * may be used to endorse or promote products derived from this software - * without specific prior written permission. - * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS ``AS IS'' - * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE - * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE - * ARE DISCLAIMED. IN NO EVENT SHALL IBM OR CONTRIBUTORS BE LIABLE - * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL - * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS - * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) - * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT - * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY - * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF - * SUCH DAMAGE. - * - * Copyright (C) Red Hat, Inc., 2013-2015 - * Copyright (C) Asias He , 2013 - * Copyright (C) Stefan Hajnoczi , 2015 - */ - -#ifndef _LINUX_VIRTIO_VSOCK_H -#define _LINUX_VIRTIO_VSOCK_H - -#include -#include -#include - -#define VIRTIO_VSOCK_DEFAULT_MIN_BUF_SIZE 128 -#define VIRTIO_VSOCK_DEFAULT_BUF_SIZE (1024 * 256) -#define VIRTIO_VSOCK_DEFAULT_MAX_BUF_SIZE (1024 * 256) -#define VIRTIO_VSOCK_DEFAULT_RX_BUF_SIZE (1024 * 4) -#define VIRTIO_VSOCK_MAX_BUF_SIZE 0xFFFFFFFFUL -#define VIRTIO_VSOCK_MAX_PKT_BUF_SIZE (1024 * 64) -#define VIRTIO_VSOCK_MAX_TX_BUF_SIZE (1024 * 1024 * 16) -#define VIRTIO_VSOCK_MAX_DGRAM_SIZE (1024 * 64) - -struct vsock_transport_recv_notify_data; -struct vsock_transport_send_notify_data; -struct sockaddr_vm; -struct vsock_sock; - -enum { - VSOCK_VQ_CTRL = 0, - VSOCK_VQ_RX = 1, /* for host to guest data */ - VSOCK_VQ_TX = 2, /* for guest to host data */ - VSOCK_VQ_MAX = 3, -}; - -/* virtio transport socket state */ -struct virtio_transport { - struct virtio_transport_pkt_ops *ops; - struct vsock_sock *vsk; - - u32 buf_size; - u32 buf_size_min; - u32 buf_size_max; - - struct mutex tx_lock; - struct mutex rx_lock; - - struct list_head rx_queue; - u32 rx_bytes; - - /* Protected by trans->tx_lock */ - u32 tx_cnt; - u32 buf_alloc; - u32 peer_fwd_cnt; - u32 peer_buf_alloc; - /* Protected by trans->rx_lock */ - u32 fwd_cnt; - - /* Protected by sk_lock */ - u16 dgram_id; - struct list_head incomplete_dgrams; /* dgram fragments */ -}; - -struct virtio_vsock_pkt { - struct virtio_vsock_hdr hdr; - struct virtio_transport *trans; - struct work_struct work; - struct list_head list; - void *buf; - u32 len; - u32 off; -}; - -struct virtio_vsock_pkt_info { - u32 remote_cid, remote_port; - struct msghdr *msg; - u32 pkt_len; - u16 type; - u16 op; - u32 flags; - u16 dgram_id; - u16 dgram_len; -}; - -struct virtio_transport_pkt_ops { - int (*send_pkt)(struct vsock_sock *vsk, - struct virtio_vsock_pkt_info *info); -}; - -void virtio_vsock_dumppkt(const char *func, - const struct virtio_vsock_pkt *pkt); - -struct sock * -virtio_transport_get_pending(struct sock *listener, - struct virtio_vsock_pkt *pkt); -struct virtio_vsock_pkt * -virtio_transport_alloc_pkt(struct vsock_sock *vsk, - struct virtio_vsock_pkt_info *info, - size_t len, - u32 src_cid, - u32 src_port, - u32 dst_cid, - u32 dst_port); -ssize_t 
-virtio_transport_stream_dequeue(struct vsock_sock *vsk, - struct msghdr *msg, - size_t len, - int type); -int -virtio_transport_dgram_dequeue(struct vsock_sock *vsk, - struct msghdr *msg, - size_t len, int flags); - -s64 virtio_transport_stream_has_data(struct vsock_sock *vsk); -s64 virtio_transport_stream_has_space(struct vsock_sock *vsk); - -int virtio_transport_do_socket_init(struct vsock_sock *vsk, - struct vsock_sock *psk); -u64 virtio_transport_get_buffer_size(struct vsock_sock *vsk); -u64 virtio_transport_get_min_buffer_size(struct vsock_sock *vsk); -u64 virtio_transport_get_max_buffer_size(struct vsock_sock *vsk); -void virtio_transport_set_buffer_size(struct vsock_sock *vsk, u64 val); -void virtio_transport_set_min_buffer_size(struct vsock_sock *vsk, u64 val); -void virtio_transport_set_max_buffer_size(struct vsock_sock *vs, u64 val); -int -virtio_transport_notify_poll_in(struct vsock_sock *vsk, - size_t target, - bool *data_ready_now); -int -virtio_transport_notify_poll_out(struct vsock_sock *vsk, - size_t target, - bool *space_available_now); - -int virtio_transport_notify_recv_init(struct vsock_sock *vsk, - size_t target, struct vsock_transport_recv_notify_data *data); -int virtio_transport_notify_recv_pre_block(struct vsock_sock *vsk, - size_t target, struct vsock_transport_recv_notify_data *data); -int virtio_transport_notify_recv_pre_dequeue(struct vsock_sock *vsk, - size_t target, struct vsock_transport_recv_notify_data *data); -int virtio_transport_notify_recv_post_dequeue(struct vsock_sock *vsk, - size_t target, ssize_t copied, bool data_read, - struct vsock_transport_recv_notify_data *data); -int virtio_transport_notify_send_init(struct vsock_sock *vsk, - struct vsock_transport_send_notify_data *data); -int virtio_transport_notify_send_pre_block(struct vsock_sock *vsk, - struct vsock_transport_send_notify_data *data); -int virtio_transport_notify_send_pre_enqueue(struct vsock_sock *vsk, - struct vsock_transport_send_notify_data *data); -int virtio_transport_notify_send_post_enqueue(struct vsock_sock *vsk, - ssize_t written, struct vsock_transport_send_notify_data *data); - -u64 virtio_transport_stream_rcvhiwat(struct vsock_sock *vsk); -bool virtio_transport_stream_is_active(struct vsock_sock *vsk); -bool virtio_transport_stream_allow(u32 cid, u32 port); -int virtio_transport_dgram_bind(struct vsock_sock *vsk, - struct sockaddr_vm *addr); -bool virtio_transport_dgram_allow(u32 cid, u32 port); - -int virtio_transport_connect(struct vsock_sock *vsk); - -int virtio_transport_shutdown(struct vsock_sock *vsk, int mode); - -void virtio_transport_release(struct vsock_sock *vsk); - -ssize_t -virtio_transport_stream_enqueue(struct vsock_sock *vsk, - struct msghdr *msg, - size_t len); -int -virtio_transport_dgram_enqueue(struct vsock_sock *vsk, - struct sockaddr_vm *remote_addr, - struct msghdr *msg, - size_t len); - -void virtio_transport_destruct(struct vsock_sock *vsk); - -void virtio_transport_recv_pkt(struct virtio_vsock_pkt *pkt); -void virtio_transport_free_pkt(struct virtio_vsock_pkt *pkt); -void virtio_transport_inc_tx_pkt(struct virtio_vsock_pkt *pkt); -void virtio_transport_dec_tx_pkt(struct virtio_vsock_pkt *pkt); -u32 virtio_transport_get_credit(struct virtio_transport *trans, u32 wanted); -void virtio_transport_put_credit(struct virtio_transport *trans, u32 credit); -#endif /* _LINUX_VIRTIO_VSOCK_H */ diff --git a/include/net/af_vsock.h b/include/net/af_vsock.h index a0c8fa2ababf..e9eb2d6791b3 100644 --- a/include/net/af_vsock.h +++ b/include/net/af_vsock.h @@ 
-175,10 +175,8 @@ void vsock_insert_connected(struct vsock_sock *vsk); void vsock_remove_bound(struct vsock_sock *vsk); void vsock_remove_connected(struct vsock_sock *vsk); struct sock *vsock_find_bound_socket(struct sockaddr_vm *addr); -struct sock *vsock_find_unbound_socket(struct sockaddr_vm *addr); struct sock *vsock_find_connected_socket(struct sockaddr_vm *src, struct sockaddr_vm *dst); void vsock_for_each_connected_socket(void (*fn)(struct sock *sk)); -int vsock_bind_dgram_generic(struct vsock_sock *vsk, struct sockaddr_vm *addr); #endif /* __AF_VSOCK_H__ */ diff --git a/include/uapi/linux/virtio_ids.h b/include/uapi/linux/virtio_ids.h index 16dcf5d06cd7..77925f587b15 100644 --- a/include/uapi/linux/virtio_ids.h +++ b/include/uapi/linux/virtio_ids.h @@ -39,7 +39,6 @@ #define VIRTIO_ID_9P 9 /* 9p virtio console */ #define VIRTIO_ID_RPROC_SERIAL 11 /* virtio remoteproc serial link */ #define VIRTIO_ID_CAIF 12 /* Virtio caif */ -#define VIRTIO_ID_VSOCK 13 /* virtio vsock transport */ #define VIRTIO_ID_GPU 16 /* virtio GPU */ #define VIRTIO_ID_INPUT 18 /* virtio input */ diff --git a/include/uapi/linux/virtio_vsock.h b/include/uapi/linux/virtio_vsock.h deleted file mode 100644 index 8cf9b5682628..000000000000 --- a/include/uapi/linux/virtio_vsock.h +++ /dev/null @@ -1,89 +0,0 @@ -/* - * This header, excluding the #ifdef __KERNEL__ part, is BSD licensed so - * anyone can use the definitions to implement compatible drivers/servers: - * - * - * Redistribution and use in source and binary forms, with or without - * modification, are permitted provided that the following conditions - * are met: - * 1. Redistributions of source code must retain the above copyright - * notice, this list of conditions and the following disclaimer. - * 2. Redistributions in binary form must reproduce the above copyright - * notice, this list of conditions and the following disclaimer in the - * documentation and/or other materials provided with the distribution. - * 3. Neither the name of IBM nor the names of its contributors - * may be used to endorse or promote products derived from this software - * without specific prior written permission. - * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS ``AS IS'' - * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE - * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE - * ARE DISCLAIMED. IN NO EVENT SHALL IBM OR CONTRIBUTORS BE LIABLE - * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL - * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS - * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) - * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT - * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY - * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF - * SUCH DAMAGE. 
- * - * Copyright (C) Red Hat, Inc., 2013-2015 - * Copyright (C) Asias He , 2013 - * Copyright (C) Stefan Hajnoczi , 2015 - */ - -#ifndef _UAPI_LINUX_VIRTIO_VSOCK_H -#define _UAPI_LINUX_VIRTIO_VOSCK_H - -#include -#include -#include - -struct virtio_vsock_config { - __le32 guest_cid; - __le32 max_virtqueue_pairs; -}; - -struct virtio_vsock_hdr { - __le32 src_cid; - __le32 src_port; - __le32 dst_cid; - __le32 dst_port; - __le32 len; - __le16 type; /* enum virtio_vsock_type */ - __le16 op; /* enum virtio_vsock_op */ - __le32 flags; - __le32 buf_alloc; - __le32 fwd_cnt; -}; - -enum virtio_vsock_type { - VIRTIO_VSOCK_TYPE_STREAM = 1, - VIRTIO_VSOCK_TYPE_DGRAM = 2, -}; - -enum virtio_vsock_op { - VIRTIO_VSOCK_OP_INVALID = 0, - - /* Connect operations */ - VIRTIO_VSOCK_OP_REQUEST = 1, - VIRTIO_VSOCK_OP_RESPONSE = 2, - VIRTIO_VSOCK_OP_ACK = 3, - VIRTIO_VSOCK_OP_RST = 4, - VIRTIO_VSOCK_OP_SHUTDOWN = 5, - - /* To send payload */ - VIRTIO_VSOCK_OP_RW = 6, - - /* Tell the peer our credit info */ - VIRTIO_VSOCK_OP_CREDIT_UPDATE = 7, - /* Request the peer to send the credit info to us */ - VIRTIO_VSOCK_OP_CREDIT_REQUEST = 8, -}; - -/* VIRTIO_VSOCK_OP_SHUTDOWN flags values */ -enum virtio_vsock_shutdown { - VIRTIO_VSOCK_SHUTDOWN_RCV = 1, - VIRTIO_VSOCK_SHUTDOWN_SEND = 2, -}; - -#endif /* _UAPI_LINUX_VIRTIO_VSOCK_H */ diff --git a/net/vmw_vsock/Kconfig b/net/vmw_vsock/Kconfig index 74e0bc887a33..14810abedc2e 100644 --- a/net/vmw_vsock/Kconfig +++ b/net/vmw_vsock/Kconfig @@ -26,21 +26,3 @@ config VMWARE_VMCI_VSOCKETS To compile this driver as a module, choose M here: the module will be called vmw_vsock_vmci_transport. If unsure, say N. - -config VIRTIO_VSOCKETS - tristate "virtio transport for Virtual Sockets" - depends on VSOCKETS && VIRTIO - select VIRTIO_VSOCKETS_COMMON - help - This module implements a virtio transport for Virtual Sockets. - - Enable this transport if your Virtual Machine runs on Qemu/KVM. - - To compile this driver as a module, choose M here: the module - will be called virtio_vsock_transport. If unsure, say N. - -config VIRTIO_VSOCKETS_COMMON - tristate - ---help--- - This option is selected by any driver which needs to access - the virtio_vsock. 
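For VIRTIO_VSOCK_TYPE_DGRAM packets, the hdr.len and hdr.flags fields defined above are each split into two 16-bit halves: len carries the total datagram length (high) and this fragment's payload length (low), while flags carries the datagram id (high) and the fragment's byte offset (low). The same shifts and masks reappear in virtio_transport_alloc_pkt(), virtio_transport_recv_dgram() and virtio_transport_dgram_enqueue() later in this diff. Below is a minimal sketch of that encoding with hypothetical helper names; values are shown host-endian for brevity, whereas the driver itself converts through cpu_to_le32()/le32_to_cpu().

/* Illustrative sketch, not from the patch: datagram framing packed into
 * virtio_vsock_hdr.len and virtio_vsock_hdr.flags.
 */
#include <stdint.h>

/* hdr.len: total datagram length (high 16) | fragment payload length (low 16) */
static inline uint32_t vsock_dgram_pack_len(uint16_t dgram_len, uint16_t pkt_len)
{
	return ((uint32_t)dgram_len << 16) | pkt_len;
}

/* hdr.flags: datagram id (high 16) | fragment byte offset (low 16) */
static inline uint32_t vsock_dgram_pack_flags(uint16_t dgram_id, uint16_t pkt_off)
{
	return ((uint32_t)dgram_id << 16) | pkt_off;
}

static inline void vsock_dgram_unpack(uint32_t len, uint32_t flags,
				      uint16_t *dgram_len, uint16_t *pkt_len,
				      uint16_t *dgram_id, uint16_t *pkt_off)
{
	*dgram_len = len >> 16;
	*pkt_len = len & 0xFFFF;
	*dgram_id = flags >> 16;
	*pkt_off = flags & 0xFFFF;
}

The 16-bit halves are also why the transport caps datagrams at VIRTIO_VSOCK_MAX_DGRAM_SIZE (64KB): neither a datagram's length nor a fragment offset can exceed 0xFFFF.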
diff --git a/net/vmw_vsock/Makefile b/net/vmw_vsock/Makefile index cf4c29439081..2ce52d70f224 100644 --- a/net/vmw_vsock/Makefile +++ b/net/vmw_vsock/Makefile @@ -1,7 +1,5 @@ obj-$(CONFIG_VSOCKETS) += vsock.o obj-$(CONFIG_VMWARE_VMCI_VSOCKETS) += vmw_vsock_vmci_transport.o -obj-$(CONFIG_VIRTIO_VSOCKETS) += virtio_transport.o -obj-$(CONFIG_VIRTIO_VSOCKETS_COMMON) += virtio_transport_common.o vsock-y += af_vsock.o vsock_addr.o diff --git a/net/vmw_vsock/af_vsock.c b/net/vmw_vsock/af_vsock.c index 77247a2b670b..7fd1220fbfa0 100644 --- a/net/vmw_vsock/af_vsock.c +++ b/net/vmw_vsock/af_vsock.c @@ -223,17 +223,6 @@ static struct sock *__vsock_find_bound_socket(struct sockaddr_vm *addr) return NULL; } -static struct sock *__vsock_find_unbound_socket(struct sockaddr_vm *addr) -{ - struct vsock_sock *vsk; - - list_for_each_entry(vsk, vsock_unbound_sockets, bound_table) - if (addr->svm_port == vsk->local_addr.svm_port) - return sk_vsock(vsk); - - return NULL; -} - static struct sock *__vsock_find_connected_socket(struct sockaddr_vm *src, struct sockaddr_vm *dst) { @@ -309,21 +298,6 @@ struct sock *vsock_find_bound_socket(struct sockaddr_vm *addr) } EXPORT_SYMBOL_GPL(vsock_find_bound_socket); -struct sock *vsock_find_unbound_socket(struct sockaddr_vm *addr) -{ - struct sock *sk; - - spin_lock_bh(&vsock_table_lock); - sk = __vsock_find_unbound_socket(addr); - if (sk) - sock_hold(sk); - - spin_unlock_bh(&vsock_table_lock); - - return sk; -} -EXPORT_SYMBOL_GPL(vsock_find_unbound_socket); - struct sock *vsock_find_connected_socket(struct sockaddr_vm *src, struct sockaddr_vm *dst) { @@ -558,50 +532,6 @@ static int __vsock_bind_stream(struct vsock_sock *vsk, return 0; } -int vsock_bind_dgram_generic(struct vsock_sock *vsk, struct sockaddr_vm *addr) -{ - static u32 port = LAST_RESERVED_PORT + 1; - struct sockaddr_vm new_addr; - - vsock_addr_init(&new_addr, addr->svm_cid, addr->svm_port); - - if (addr->svm_port == VMADDR_PORT_ANY) { - bool found = false; - unsigned int i; - - for (i = 0; i < MAX_PORT_RETRIES; i++) { - if (port <= LAST_RESERVED_PORT) - port = LAST_RESERVED_PORT + 1; - - new_addr.svm_port = port++; - - if (!__vsock_find_unbound_socket(&new_addr)) { - found = true; - break; - } - } - - if (!found) - return -EADDRNOTAVAIL; - } else { - /* If port is in reserved range, ensure caller - * has necessary privileges. - */ - if (addr->svm_port <= LAST_RESERVED_PORT && - !capable(CAP_NET_BIND_SERVICE)) { - return -EACCES; - } - - if (__vsock_find_unbound_socket(&new_addr)) - return -EADDRINUSE; - } - - vsock_addr_init(&vsk->local_addr, new_addr.svm_cid, new_addr.svm_port); - - return 0; -} -EXPORT_SYMBOL_GPL(vsock_bind_dgram_generic); - static int __vsock_bind_dgram(struct vsock_sock *vsk, struct sockaddr_vm *addr) { diff --git a/net/vmw_vsock/virtio_transport.c b/net/vmw_vsock/virtio_transport.c deleted file mode 100644 index df65dca55fa1..000000000000 --- a/net/vmw_vsock/virtio_transport.c +++ /dev/null @@ -1,466 +0,0 @@ -/* - * virtio transport for vsock - * - * Copyright (C) 2013-2015 Red Hat, Inc. - * Author: Asias He - * Stefan Hajnoczi - * - * Some of the code is take from Gerd Hoffmann 's - * early virtio-vsock proof-of-concept bits. - * - * This work is licensed under the terms of the GNU GPL, version 2. 
- */ -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include - -static struct workqueue_struct *virtio_vsock_workqueue; -static struct virtio_vsock *the_virtio_vsock; -static DEFINE_MUTEX(the_virtio_vsock_mutex); /* protects the_virtio_vsock */ -static void virtio_vsock_rx_fill(struct virtio_vsock *vsock); - -struct virtio_vsock { - /* Virtio device */ - struct virtio_device *vdev; - /* Virtio virtqueue */ - struct virtqueue *vqs[VSOCK_VQ_MAX]; - /* Wait queue for send pkt */ - wait_queue_head_t queue_wait; - /* Work item to send pkt */ - struct work_struct tx_work; - /* Work item to recv pkt */ - struct work_struct rx_work; - /* Mutex to protect send pkt*/ - struct mutex tx_lock; - /* Mutex to protect recv pkt*/ - struct mutex rx_lock; - /* Number of recv buffers */ - int rx_buf_nr; - /* Number of max recv buffers */ - int rx_buf_max_nr; - /* Used for global tx buf limitation */ - u32 total_tx_buf; - /* Guest context id, just like guest ip address */ - u32 guest_cid; -}; - -static struct virtio_vsock *virtio_vsock_get(void) -{ - return the_virtio_vsock; -} - -static u32 virtio_transport_get_local_cid(void) -{ - struct virtio_vsock *vsock = virtio_vsock_get(); - - return vsock->guest_cid; -} - -static int -virtio_transport_send_pkt(struct vsock_sock *vsk, - struct virtio_vsock_pkt_info *info) -{ - u32 src_cid, src_port, dst_cid, dst_port; - int ret, in_sg = 0, out_sg = 0; - struct virtio_transport *trans; - struct virtio_vsock_pkt *pkt; - struct virtio_vsock *vsock; - struct scatterlist hdr, buf, *sgs[2]; - struct virtqueue *vq; - u32 pkt_len = info->pkt_len; - DEFINE_WAIT(wait); - - vsock = virtio_vsock_get(); - if (!vsock) - return -ENODEV; - - src_cid = virtio_transport_get_local_cid(); - src_port = vsk->local_addr.svm_port; - if (!info->remote_cid) { - dst_cid = vsk->remote_addr.svm_cid; - dst_port = vsk->remote_addr.svm_port; - } else { - dst_cid = info->remote_cid; - dst_port = info->remote_port; - } - - trans = vsk->trans; - vq = vsock->vqs[VSOCK_VQ_TX]; - - if (pkt_len > VIRTIO_VSOCK_DEFAULT_RX_BUF_SIZE) - pkt_len = VIRTIO_VSOCK_DEFAULT_RX_BUF_SIZE; - pkt_len = virtio_transport_get_credit(trans, pkt_len); - /* Do not send zero length OP_RW pkt*/ - if (pkt_len == 0 && info->op == VIRTIO_VSOCK_OP_RW) - return pkt_len; - - /* Respect global tx buf limitation */ - mutex_lock(&vsock->tx_lock); - while (pkt_len + vsock->total_tx_buf > VIRTIO_VSOCK_MAX_TX_BUF_SIZE) { - prepare_to_wait_exclusive(&vsock->queue_wait, &wait, - TASK_UNINTERRUPTIBLE); - mutex_unlock(&vsock->tx_lock); - schedule(); - mutex_lock(&vsock->tx_lock); - finish_wait(&vsock->queue_wait, &wait); - } - vsock->total_tx_buf += pkt_len; - mutex_unlock(&vsock->tx_lock); - - pkt = virtio_transport_alloc_pkt(vsk, info, pkt_len, - src_cid, src_port, - dst_cid, dst_port); - if (!pkt) { - mutex_lock(&vsock->tx_lock); - vsock->total_tx_buf -= pkt_len; - mutex_unlock(&vsock->tx_lock); - virtio_transport_put_credit(trans, pkt_len); - return -ENOMEM; - } - - pr_debug("%s:info->pkt_len= %d\n", __func__, info->pkt_len); - - /* Will be released in virtio_transport_send_pkt_work */ - sock_hold(&trans->vsk->sk); - virtio_transport_inc_tx_pkt(pkt); - - /* Put pkt in the virtqueue */ - sg_init_one(&hdr, &pkt->hdr, sizeof(pkt->hdr)); - sgs[out_sg++] = &hdr; - if (info->msg && info->pkt_len > 0) { - sg_init_one(&buf, pkt->buf, pkt->len); - sgs[out_sg++] = &buf; - } - - mutex_lock(&vsock->tx_lock); - while ((ret = virtqueue_add_sgs(vq, sgs, out_sg, in_sg, pkt, - GFP_KERNEL)) < 0) { - 
prepare_to_wait_exclusive(&vsock->queue_wait, &wait, - TASK_UNINTERRUPTIBLE); - mutex_unlock(&vsock->tx_lock); - schedule(); - mutex_lock(&vsock->tx_lock); - finish_wait(&vsock->queue_wait, &wait); - } - virtqueue_kick(vq); - mutex_unlock(&vsock->tx_lock); - - return pkt_len; -} - -static struct virtio_transport_pkt_ops virtio_ops = { - .send_pkt = virtio_transport_send_pkt, -}; - -static void virtio_vsock_rx_fill(struct virtio_vsock *vsock) -{ - int buf_len = VIRTIO_VSOCK_DEFAULT_RX_BUF_SIZE; - struct virtio_vsock_pkt *pkt; - struct scatterlist hdr, buf, *sgs[2]; - struct virtqueue *vq; - int ret; - - vq = vsock->vqs[VSOCK_VQ_RX]; - - do { - pkt = kzalloc(sizeof(*pkt), GFP_KERNEL); - if (!pkt) { - pr_debug("%s: fail to allocate pkt\n", __func__); - goto out; - } - - /* TODO: use mergeable rx buffer */ - pkt->buf = kmalloc(buf_len, GFP_KERNEL); - if (!pkt->buf) { - pr_debug("%s: fail to allocate pkt->buf\n", __func__); - goto err; - } - - sg_init_one(&hdr, &pkt->hdr, sizeof(pkt->hdr)); - sgs[0] = &hdr; - - sg_init_one(&buf, pkt->buf, buf_len); - sgs[1] = &buf; - ret = virtqueue_add_sgs(vq, sgs, 0, 2, pkt, GFP_KERNEL); - if (ret) - goto err; - vsock->rx_buf_nr++; - } while (vq->num_free); - if (vsock->rx_buf_nr > vsock->rx_buf_max_nr) - vsock->rx_buf_max_nr = vsock->rx_buf_nr; -out: - virtqueue_kick(vq); - return; -err: - virtqueue_kick(vq); - virtio_transport_free_pkt(pkt); - return; -} - -static void virtio_transport_send_pkt_work(struct work_struct *work) -{ - struct virtio_vsock *vsock = - container_of(work, struct virtio_vsock, tx_work); - struct virtio_vsock_pkt *pkt; - bool added = false; - struct virtqueue *vq; - unsigned int len; - struct sock *sk; - - vq = vsock->vqs[VSOCK_VQ_TX]; - mutex_lock(&vsock->tx_lock); - do { - virtqueue_disable_cb(vq); - while ((pkt = virtqueue_get_buf(vq, &len)) != NULL) { - sk = &pkt->trans->vsk->sk; - virtio_transport_dec_tx_pkt(pkt); - /* Release refcnt taken in virtio_transport_send_pkt */ - sock_put(sk); - vsock->total_tx_buf -= pkt->len; - virtio_transport_free_pkt(pkt); - added = true; - } - } while (!virtqueue_enable_cb(vq)); - mutex_unlock(&vsock->tx_lock); - - if (added) - wake_up(&vsock->queue_wait); -} - -static void virtio_transport_recv_pkt_work(struct work_struct *work) -{ - struct virtio_vsock *vsock = - container_of(work, struct virtio_vsock, rx_work); - struct virtio_vsock_pkt *pkt; - struct virtqueue *vq; - unsigned int len; - - vq = vsock->vqs[VSOCK_VQ_RX]; - mutex_lock(&vsock->rx_lock); - do { - virtqueue_disable_cb(vq); - while ((pkt = virtqueue_get_buf(vq, &len)) != NULL) { - pkt->len = len; - virtio_transport_recv_pkt(pkt); - vsock->rx_buf_nr--; - } - } while (!virtqueue_enable_cb(vq)); - - if (vsock->rx_buf_nr < vsock->rx_buf_max_nr / 2) - virtio_vsock_rx_fill(vsock); - mutex_unlock(&vsock->rx_lock); -} - -static void virtio_vsock_ctrl_done(struct virtqueue *vq) -{ -} - -static void virtio_vsock_tx_done(struct virtqueue *vq) -{ - struct virtio_vsock *vsock = vq->vdev->priv; - - if (!vsock) - return; - queue_work(virtio_vsock_workqueue, &vsock->tx_work); -} - -static void virtio_vsock_rx_done(struct virtqueue *vq) -{ - struct virtio_vsock *vsock = vq->vdev->priv; - - if (!vsock) - return; - queue_work(virtio_vsock_workqueue, &vsock->rx_work); -} - -static int -virtio_transport_socket_init(struct vsock_sock *vsk, struct vsock_sock *psk) -{ - struct virtio_transport *trans; - int ret; - - ret = virtio_transport_do_socket_init(vsk, psk); - if (ret) - return ret; - - trans = vsk->trans; - trans->ops = &virtio_ops; - return ret; -} - 
-static struct vsock_transport virtio_transport = { - .get_local_cid = virtio_transport_get_local_cid, - - .init = virtio_transport_socket_init, - .destruct = virtio_transport_destruct, - .release = virtio_transport_release, - .connect = virtio_transport_connect, - .shutdown = virtio_transport_shutdown, - - .dgram_bind = virtio_transport_dgram_bind, - .dgram_dequeue = virtio_transport_dgram_dequeue, - .dgram_enqueue = virtio_transport_dgram_enqueue, - .dgram_allow = virtio_transport_dgram_allow, - - .stream_dequeue = virtio_transport_stream_dequeue, - .stream_enqueue = virtio_transport_stream_enqueue, - .stream_has_data = virtio_transport_stream_has_data, - .stream_has_space = virtio_transport_stream_has_space, - .stream_rcvhiwat = virtio_transport_stream_rcvhiwat, - .stream_is_active = virtio_transport_stream_is_active, - .stream_allow = virtio_transport_stream_allow, - - .notify_poll_in = virtio_transport_notify_poll_in, - .notify_poll_out = virtio_transport_notify_poll_out, - .notify_recv_init = virtio_transport_notify_recv_init, - .notify_recv_pre_block = virtio_transport_notify_recv_pre_block, - .notify_recv_pre_dequeue = virtio_transport_notify_recv_pre_dequeue, - .notify_recv_post_dequeue = virtio_transport_notify_recv_post_dequeue, - .notify_send_init = virtio_transport_notify_send_init, - .notify_send_pre_block = virtio_transport_notify_send_pre_block, - .notify_send_pre_enqueue = virtio_transport_notify_send_pre_enqueue, - .notify_send_post_enqueue = virtio_transport_notify_send_post_enqueue, - - .set_buffer_size = virtio_transport_set_buffer_size, - .set_min_buffer_size = virtio_transport_set_min_buffer_size, - .set_max_buffer_size = virtio_transport_set_max_buffer_size, - .get_buffer_size = virtio_transport_get_buffer_size, - .get_min_buffer_size = virtio_transport_get_min_buffer_size, - .get_max_buffer_size = virtio_transport_get_max_buffer_size, -}; - -static int virtio_vsock_probe(struct virtio_device *vdev) -{ - vq_callback_t *callbacks[] = { - virtio_vsock_ctrl_done, - virtio_vsock_rx_done, - virtio_vsock_tx_done, - }; - const char *names[] = { - "ctrl", - "rx", - "tx", - }; - struct virtio_vsock *vsock = NULL; - u32 guest_cid; - int ret; - - ret = mutex_lock_interruptible(&the_virtio_vsock_mutex); - if (ret) - return ret; - - /* Only one virtio-vsock device per guest is supported */ - if (the_virtio_vsock) { - ret = -EBUSY; - goto out; - } - - vsock = kzalloc(sizeof(*vsock), GFP_KERNEL); - if (!vsock) { - ret = -ENOMEM; - goto out; - } - - vsock->vdev = vdev; - - ret = vsock->vdev->config->find_vqs(vsock->vdev, VSOCK_VQ_MAX, - vsock->vqs, callbacks, names); - if (ret < 0) - goto out; - - vdev->config->get(vdev, offsetof(struct virtio_vsock_config, guest_cid), - &guest_cid, sizeof(guest_cid)); - vsock->guest_cid = le32_to_cpu(guest_cid); - pr_debug("%s:guest_cid=%d\n", __func__, vsock->guest_cid); - - ret = vsock_core_init(&virtio_transport); - if (ret < 0) - goto out_vqs; - - vsock->rx_buf_nr = 0; - vsock->rx_buf_max_nr = 0; - - vdev->priv = the_virtio_vsock = vsock; - init_waitqueue_head(&vsock->queue_wait); - mutex_init(&vsock->tx_lock); - mutex_init(&vsock->rx_lock); - INIT_WORK(&vsock->rx_work, virtio_transport_recv_pkt_work); - INIT_WORK(&vsock->tx_work, virtio_transport_send_pkt_work); - - mutex_lock(&vsock->rx_lock); - virtio_vsock_rx_fill(vsock); - mutex_unlock(&vsock->rx_lock); - - mutex_unlock(&the_virtio_vsock_mutex); - return 0; - -out_vqs: - vsock->vdev->config->del_vqs(vsock->vdev); -out: - kfree(vsock); - mutex_unlock(&the_virtio_vsock_mutex); - return ret; 
-} - -static void virtio_vsock_remove(struct virtio_device *vdev) -{ - struct virtio_vsock *vsock = vdev->priv; - - mutex_lock(&the_virtio_vsock_mutex); - the_virtio_vsock = NULL; - vsock_core_exit(); - mutex_unlock(&the_virtio_vsock_mutex); - - kfree(vsock); -} - -static struct virtio_device_id id_table[] = { - { VIRTIO_ID_VSOCK, VIRTIO_DEV_ANY_ID }, - { 0 }, -}; - -static unsigned int features[] = { -}; - -static struct virtio_driver virtio_vsock_driver = { - .feature_table = features, - .feature_table_size = ARRAY_SIZE(features), - .driver.name = KBUILD_MODNAME, - .driver.owner = THIS_MODULE, - .id_table = id_table, - .probe = virtio_vsock_probe, - .remove = virtio_vsock_remove, -}; - -static int __init virtio_vsock_init(void) -{ - int ret; - - virtio_vsock_workqueue = alloc_workqueue("virtio_vsock", 0, 0); - if (!virtio_vsock_workqueue) - return -ENOMEM; - ret = register_virtio_driver(&virtio_vsock_driver); - if (ret) - destroy_workqueue(virtio_vsock_workqueue); - return ret; -} - -static void __exit virtio_vsock_exit(void) -{ - unregister_virtio_driver(&virtio_vsock_driver); - destroy_workqueue(virtio_vsock_workqueue); -} - -module_init(virtio_vsock_init); -module_exit(virtio_vsock_exit); -MODULE_LICENSE("GPL v2"); -MODULE_AUTHOR("Asias He"); -MODULE_DESCRIPTION("virtio transport for vsock"); -MODULE_DEVICE_TABLE(virtio, id_table); diff --git a/net/vmw_vsock/virtio_transport_common.c b/net/vmw_vsock/virtio_transport_common.c deleted file mode 100644 index 28f790da6f15..000000000000 --- a/net/vmw_vsock/virtio_transport_common.c +++ /dev/null @@ -1,1272 +0,0 @@ -/* - * common code for virtio vsock - * - * Copyright (C) 2013-2015 Red Hat, Inc. - * Author: Asias He - * Stefan Hajnoczi - * - * This work is licensed under the terms of the GNU GPL, version 2. 
- */ -#include -#include -#include -#include -#include -#include -#include -#include -#include - -#include -#include - -#define COOKIEBITS 24 -#define COOKIEMASK (((u32)1 << COOKIEBITS) - 1) -#define VSOCK_TIMEOUT_INIT 4 - -#define SHA_MESSAGE_WORDS 16 -#define SHA_VSOCK_WORDS 5 - -static u32 vsockcookie_secret[2][SHA_MESSAGE_WORDS - SHA_VSOCK_WORDS + - SHA_DIGEST_WORDS]; - -static DEFINE_PER_CPU(__u32[SHA_MESSAGE_WORDS + SHA_DIGEST_WORDS + - SHA_WORKSPACE_WORDS], vsock_cookie_scratch); - -static u32 cookie_hash(u32 saddr, u32 daddr, u16 sport, u16 dport, - u32 count, int c) -{ - __u32 *tmp = this_cpu_ptr(vsock_cookie_scratch); - - memcpy(tmp + SHA_VSOCK_WORDS, vsockcookie_secret[c], - sizeof(vsockcookie_secret[c])); - tmp[0] = saddr; - tmp[1] = daddr; - tmp[2] = sport; - tmp[3] = dport; - tmp[4] = count; - sha_transform(tmp + SHA_MESSAGE_WORDS, (__u8 *)tmp, - tmp + SHA_MESSAGE_WORDS + SHA_DIGEST_WORDS); - - return tmp[17]; -} - -static u32 -virtio_vsock_secure_cookie(u32 saddr, u32 daddr, u32 sport, u32 dport, - u32 count) -{ - u32 h1, h2; - - h1 = cookie_hash(saddr, daddr, sport, dport, 0, 0); - h2 = cookie_hash(saddr, daddr, sport, dport, count, 1); - - return h1 + (count << COOKIEBITS) + (h2 & COOKIEMASK); -} - -static u32 -virtio_vsock_check_cookie(u32 saddr, u32 daddr, u32 sport, u32 dport, - u32 count, u32 cookie, u32 maxdiff) -{ - u32 diff; - u32 ret; - - cookie -= cookie_hash(saddr, daddr, sport, dport, 0, 0); - - diff = (count - (cookie >> COOKIEBITS)) & ((u32)-1 >> COOKIEBITS); - pr_debug("%s: diff=%x\n", __func__, diff); - if (diff >= maxdiff) - return (u32)-1; - - ret = (cookie - - cookie_hash(saddr, daddr, sport, dport, count - diff, 1)) - & COOKIEMASK; - pr_debug("%s: ret=%x\n", __func__, diff); - - return ret; -} - -void virtio_vsock_dumppkt(const char *func, const struct virtio_vsock_pkt *pkt) -{ - pr_debug("%s: pkt=%p, op=%d, len=%d, %d:%d---%d:%d, len=%d\n", - func, pkt, - le16_to_cpu(pkt->hdr.op), - le32_to_cpu(pkt->hdr.len), - le32_to_cpu(pkt->hdr.src_cid), - le32_to_cpu(pkt->hdr.src_port), - le32_to_cpu(pkt->hdr.dst_cid), - le32_to_cpu(pkt->hdr.dst_port), - pkt->len); -} -EXPORT_SYMBOL_GPL(virtio_vsock_dumppkt); - -struct virtio_vsock_pkt * -virtio_transport_alloc_pkt(struct vsock_sock *vsk, - struct virtio_vsock_pkt_info *info, - size_t len, - u32 src_cid, - u32 src_port, - u32 dst_cid, - u32 dst_port) -{ - struct virtio_transport *trans = vsk->trans; - struct virtio_vsock_pkt *pkt; - int err; - - BUG_ON(!trans); - - pkt = kzalloc(sizeof(*pkt), GFP_KERNEL); - if (!pkt) - return NULL; - - pkt->hdr.type = cpu_to_le16(info->type); - pkt->hdr.op = cpu_to_le16(info->op); - pkt->hdr.src_cid = cpu_to_le32(src_cid); - pkt->hdr.src_port = cpu_to_le32(src_port); - pkt->hdr.dst_cid = cpu_to_le32(dst_cid); - pkt->hdr.dst_port = cpu_to_le32(dst_port); - pkt->hdr.flags = cpu_to_le32(info->flags); - pkt->len = len; - pkt->trans = trans; - if (info->type == VIRTIO_VSOCK_TYPE_DGRAM) - pkt->hdr.len = cpu_to_le32(len + (info->dgram_len << 16)); - else if (info->type == VIRTIO_VSOCK_TYPE_STREAM) - pkt->hdr.len = cpu_to_le32(len); - - if (info->msg && len > 0) { - pkt->buf = kmalloc(len, GFP_KERNEL); - if (!pkt->buf) - goto out_pkt; - err = memcpy_from_msg(pkt->buf, info->msg, len); - if (err) - goto out; - } - - return pkt; - -out: - kfree(pkt->buf); -out_pkt: - kfree(pkt); - return NULL; -} -EXPORT_SYMBOL_GPL(virtio_transport_alloc_pkt); - -struct sock * -virtio_transport_get_pending(struct sock *listener, - struct virtio_vsock_pkt *pkt) -{ - struct vsock_sock *vlistener; - struct 
vsock_sock *vpending; - struct sockaddr_vm src; - struct sockaddr_vm dst; - struct sock *pending; - - vsock_addr_init(&src, le32_to_cpu(pkt->hdr.src_cid), le32_to_cpu(pkt->hdr.src_port)); - vsock_addr_init(&dst, le32_to_cpu(pkt->hdr.dst_cid), le32_to_cpu(pkt->hdr.dst_port)); - - vlistener = vsock_sk(listener); - list_for_each_entry(vpending, &vlistener->pending_links, - pending_links) { - if (vsock_addr_equals_addr(&src, &vpending->remote_addr) && - vsock_addr_equals_addr(&dst, &vpending->local_addr)) { - pending = sk_vsock(vpending); - sock_hold(pending); - return pending; - } - } - - return NULL; -} -EXPORT_SYMBOL_GPL(virtio_transport_get_pending); - -static void virtio_transport_inc_rx_pkt(struct virtio_vsock_pkt *pkt) -{ - pkt->trans->rx_bytes += pkt->len; -} - -static void virtio_transport_dec_rx_pkt(struct virtio_vsock_pkt *pkt) -{ - pkt->trans->rx_bytes -= pkt->len; - pkt->trans->fwd_cnt += pkt->len; -} - -void virtio_transport_inc_tx_pkt(struct virtio_vsock_pkt *pkt) -{ - mutex_lock(&pkt->trans->tx_lock); - pkt->hdr.fwd_cnt = cpu_to_le32(pkt->trans->fwd_cnt); - pkt->hdr.buf_alloc = cpu_to_le32(pkt->trans->buf_alloc); - mutex_unlock(&pkt->trans->tx_lock); -} -EXPORT_SYMBOL_GPL(virtio_transport_inc_tx_pkt); - -void virtio_transport_dec_tx_pkt(struct virtio_vsock_pkt *pkt) -{ -} -EXPORT_SYMBOL_GPL(virtio_transport_dec_tx_pkt); - -u32 virtio_transport_get_credit(struct virtio_transport *trans, u32 credit) -{ - u32 ret; - - mutex_lock(&trans->tx_lock); - ret = trans->peer_buf_alloc - (trans->tx_cnt - trans->peer_fwd_cnt); - if (ret > credit) - ret = credit; - trans->tx_cnt += ret; - mutex_unlock(&trans->tx_lock); - - pr_debug("%s: ret=%d, buf_alloc=%d, peer_buf_alloc=%d," - "tx_cnt=%d, fwd_cnt=%d, peer_fwd_cnt=%d\n", __func__, - ret, trans->buf_alloc, trans->peer_buf_alloc, - trans->tx_cnt, trans->fwd_cnt, trans->peer_fwd_cnt); - - return ret; -} -EXPORT_SYMBOL_GPL(virtio_transport_get_credit); - -void virtio_transport_put_credit(struct virtio_transport *trans, u32 credit) -{ - mutex_lock(&trans->tx_lock); - trans->tx_cnt -= credit; - mutex_unlock(&trans->tx_lock); -} -EXPORT_SYMBOL_GPL(virtio_transport_put_credit); - -static int virtio_transport_send_credit_update(struct vsock_sock *vsk, int type, struct virtio_vsock_hdr *hdr) -{ - struct virtio_transport *trans = vsk->trans; - struct virtio_vsock_pkt_info info = { - .op = VIRTIO_VSOCK_OP_CREDIT_UPDATE, - .type = type, - }; - - if (hdr && type == VIRTIO_VSOCK_TYPE_DGRAM) { - info.remote_cid = le32_to_cpu(hdr->src_cid); - info.remote_port = le32_to_cpu(hdr->src_port); - } - - pr_debug("%s: sk=%p send_credit_update\n", __func__, vsk); - return trans->ops->send_pkt(vsk, &info); -} - -static int virtio_transport_send_credit_request(struct vsock_sock *vsk, int type) -{ - struct virtio_transport *trans = vsk->trans; - struct virtio_vsock_pkt_info info = { - .op = VIRTIO_VSOCK_OP_CREDIT_REQUEST, - .type = type, - }; - - pr_debug("%s: sk=%p send_credit_request\n", __func__, vsk); - return trans->ops->send_pkt(vsk, &info); -} - -static ssize_t -virtio_transport_stream_do_dequeue(struct vsock_sock *vsk, - struct msghdr *msg, - size_t len) -{ - struct virtio_transport *trans = vsk->trans; - struct virtio_vsock_pkt *pkt; - size_t bytes, total = 0; - int err = -EFAULT; - - mutex_lock(&trans->rx_lock); - while (total < len && trans->rx_bytes > 0 && - !list_empty(&trans->rx_queue)) { - pkt = list_first_entry(&trans->rx_queue, - struct virtio_vsock_pkt, list); - - bytes = len - total; - if (bytes > pkt->len - pkt->off) - bytes = pkt->len - pkt->off; - 
- err = memcpy_to_msg(msg, pkt->buf + pkt->off, bytes); - if (err) - goto out; - total += bytes; - pkt->off += bytes; - if (pkt->off == pkt->len) { - virtio_transport_dec_rx_pkt(pkt); - list_del(&pkt->list); - virtio_transport_free_pkt(pkt); - } - } - mutex_unlock(&trans->rx_lock); - - /* Send a credit pkt to peer */ - virtio_transport_send_credit_update(vsk, VIRTIO_VSOCK_TYPE_STREAM, - NULL); - - return total; - -out: - mutex_unlock(&trans->rx_lock); - if (total) - err = total; - return err; -} - -ssize_t -virtio_transport_stream_dequeue(struct vsock_sock *vsk, - struct msghdr *msg, - size_t len, int flags) -{ - if (flags & MSG_PEEK) - return -EOPNOTSUPP; - - return virtio_transport_stream_do_dequeue(vsk, msg, len); -} -EXPORT_SYMBOL_GPL(virtio_transport_stream_dequeue); - -struct dgram_skb { - struct list_head list; - struct sk_buff *skb; - u16 id; -}; - -static struct dgram_skb *dgram_id_to_skb(struct virtio_transport *trans, - u16 id) -{ - struct dgram_skb *dgram_skb; - - list_for_each_entry(dgram_skb, &trans->incomplete_dgrams, list) { - if (dgram_skb->id == id) - return dgram_skb; - } - - return NULL; -} - -static void -virtio_transport_recv_dgram(struct sock *sk, - struct virtio_vsock_pkt *pkt) -{ - struct sk_buff *skb = NULL; - struct vsock_sock *vsk; - struct virtio_transport *trans; - size_t size; - u16 dgram_id, pkt_off, dgram_len, pkt_len; - u32 flags, len; - struct dgram_skb *dgram_skb; - - vsk = vsock_sk(sk); - trans = vsk->trans; - - /* len: dgram_len | pkt_len */ - len = le32_to_cpu(pkt->hdr.len); - dgram_len = len >> 16; - pkt_len = len & 0xFFFF; - - /* flags: dgram_id | pkt_off */ - flags = le32_to_cpu(pkt->hdr.flags); - dgram_id = flags >> 16; - pkt_off = flags & 0xFFFF; - - pr_debug("%s: dgram_len=%d, pkt_len=%d, id=%d, off=%d\n", __func__, - dgram_len, pkt_len, dgram_id, pkt_off); - - dgram_skb = dgram_id_to_skb(trans, dgram_id); - if (dgram_skb) { - /* This pkt is for a existing dgram */ - skb = dgram_skb->skb; - pr_debug("%s:found skb\n", __func__); - } - - /* Packet payload must be within datagram bounds */ - if (pkt_len > VIRTIO_VSOCK_DEFAULT_RX_BUF_SIZE) - goto drop; - if (pkt_len > dgram_len) - goto drop; - if (pkt_off > dgram_len) - goto drop; - if (dgram_len - pkt_off < pkt_len) - goto drop; - - if (!skb) { - /* This pkt is for a new dgram */ - pr_debug("%s:create skb\n", __func__); - - size = sizeof(pkt->hdr) + dgram_len; - /* Attach the packet to the socket's receive queue as an sk_buff. */ - dgram_skb = kzalloc(sizeof(struct dgram_skb), GFP_ATOMIC); - if (!dgram_skb) - goto drop; - - skb = alloc_skb(size, GFP_ATOMIC); - if (!skb) { - kfree(dgram_skb); - dgram_skb = NULL; - goto drop; - } - dgram_skb->id = dgram_id; - dgram_skb->skb = skb; - list_add_tail(&dgram_skb->list, &trans->incomplete_dgrams); - - /* sk_receive_skb() will do a sock_put(), so hold here. 
*/ - sock_hold(sk); - skb_put(skb, size); - memcpy(skb->data, &pkt->hdr, sizeof(pkt->hdr)); - } - - memcpy(skb->data + sizeof(pkt->hdr) + pkt_off, pkt->buf, pkt_len); - - pr_debug("%s:C, off=%d, pkt_len=%d, dgram_len=%d\n", __func__, - pkt_off, pkt_len, dgram_len); - - /* We are done with this dgram */ - if (pkt_off + pkt_len == dgram_len) { - pr_debug("%s:dgram_id=%d is done\n", __func__, dgram_id); - list_del(&dgram_skb->list); - kfree(dgram_skb); - sk_receive_skb(sk, skb, 0); - } - virtio_transport_free_pkt(pkt); - return; - -drop: - if (dgram_skb) { - list_del(&dgram_skb->list); - kfree(dgram_skb); - kfree_skb(skb); - sock_put(sk); - } - virtio_transport_free_pkt(pkt); -} - -int -virtio_transport_dgram_dequeue(struct vsock_sock *vsk, - struct msghdr *msg, - size_t len, int flags) -{ - struct virtio_vsock_hdr *hdr; - struct sk_buff *skb; - int noblock; - int err; - int dgram_len; - - noblock = flags & MSG_DONTWAIT; - - if (flags & MSG_OOB || flags & MSG_ERRQUEUE) - return -EOPNOTSUPP; - - /* Retrieve the head sk_buff from the socket's receive queue. */ - err = 0; - skb = skb_recv_datagram(&vsk->sk, flags, noblock, &err); - if (err) - return err; - if (!skb) - return -EAGAIN; - - hdr = (struct virtio_vsock_hdr *)skb->data; - if (!hdr) - goto out; - - dgram_len = le32_to_cpu(hdr->len) >> 16; - /* Place the datagram payload in the user's iovec. */ - err = skb_copy_datagram_msg(skb, sizeof(*hdr), msg, dgram_len); - if (err) - goto out; - - if (msg->msg_name) { - /* Provide the address of the sender. */ - DECLARE_SOCKADDR(struct sockaddr_vm *, vm_addr, msg->msg_name); - vsock_addr_init(vm_addr, le32_to_cpu(hdr->src_cid), le32_to_cpu(hdr->src_port)); - msg->msg_namelen = sizeof(*vm_addr); - } - err = dgram_len; - - /* Send a credit pkt to peer */ - virtio_transport_send_credit_update(vsk, VIRTIO_VSOCK_TYPE_DGRAM, hdr); - - pr_debug("%s:done, recved =%d\n", __func__, dgram_len); -out: - skb_free_datagram(&vsk->sk, skb); - return err; -} -EXPORT_SYMBOL_GPL(virtio_transport_dgram_dequeue); - -s64 virtio_transport_stream_has_data(struct vsock_sock *vsk) -{ - struct virtio_transport *trans = vsk->trans; - s64 bytes; - - mutex_lock(&trans->rx_lock); - bytes = trans->rx_bytes; - mutex_unlock(&trans->rx_lock); - - return bytes; -} -EXPORT_SYMBOL_GPL(virtio_transport_stream_has_data); - -static s64 virtio_transport_has_space(struct vsock_sock *vsk) -{ - struct virtio_transport *trans = vsk->trans; - s64 bytes; - - bytes = trans->peer_buf_alloc - (trans->tx_cnt - trans->peer_fwd_cnt); - if (bytes < 0) - bytes = 0; - - return bytes; -} - -s64 virtio_transport_stream_has_space(struct vsock_sock *vsk) -{ - struct virtio_transport *trans = vsk->trans; - s64 bytes; - - mutex_lock(&trans->tx_lock); - bytes = virtio_transport_has_space(vsk); - mutex_unlock(&trans->tx_lock); - - pr_debug("%s: bytes=%lld\n", __func__, bytes); - - return bytes; -} -EXPORT_SYMBOL_GPL(virtio_transport_stream_has_space); - -int virtio_transport_do_socket_init(struct vsock_sock *vsk, - struct vsock_sock *psk) -{ - struct virtio_transport *trans; - - trans = kzalloc(sizeof(*trans), GFP_KERNEL); - if (!trans) - return -ENOMEM; - - vsk->trans = trans; - trans->vsk = vsk; - if (psk) { - struct virtio_transport *ptrans = psk->trans; - trans->buf_size = ptrans->buf_size; - trans->buf_size_min = ptrans->buf_size_min; - trans->buf_size_max = ptrans->buf_size_max; - trans->peer_buf_alloc = ptrans->peer_buf_alloc; - } else { - trans->buf_size = VIRTIO_VSOCK_DEFAULT_BUF_SIZE; - trans->buf_size_min = VIRTIO_VSOCK_DEFAULT_MIN_BUF_SIZE; - 
trans->buf_size_max = VIRTIO_VSOCK_DEFAULT_MAX_BUF_SIZE; - } - - trans->buf_alloc = trans->buf_size; - - pr_debug("%s: trans->buf_alloc=%d\n", __func__, trans->buf_alloc); - - mutex_init(&trans->rx_lock); - mutex_init(&trans->tx_lock); - INIT_LIST_HEAD(&trans->rx_queue); - INIT_LIST_HEAD(&trans->incomplete_dgrams); - - return 0; -} -EXPORT_SYMBOL_GPL(virtio_transport_do_socket_init); - -u64 virtio_transport_get_buffer_size(struct vsock_sock *vsk) -{ - struct virtio_transport *trans = vsk->trans; - - return trans->buf_size; -} -EXPORT_SYMBOL_GPL(virtio_transport_get_buffer_size); - -u64 virtio_transport_get_min_buffer_size(struct vsock_sock *vsk) -{ - struct virtio_transport *trans = vsk->trans; - - return trans->buf_size_min; -} -EXPORT_SYMBOL_GPL(virtio_transport_get_min_buffer_size); - -u64 virtio_transport_get_max_buffer_size(struct vsock_sock *vsk) -{ - struct virtio_transport *trans = vsk->trans; - - return trans->buf_size_max; -} -EXPORT_SYMBOL_GPL(virtio_transport_get_max_buffer_size); - -void virtio_transport_set_buffer_size(struct vsock_sock *vsk, u64 val) -{ - struct virtio_transport *trans = vsk->trans; - - if (val > VIRTIO_VSOCK_MAX_BUF_SIZE) - val = VIRTIO_VSOCK_MAX_BUF_SIZE; - if (val < trans->buf_size_min) - trans->buf_size_min = val; - if (val > trans->buf_size_max) - trans->buf_size_max = val; - trans->buf_size = val; - trans->buf_alloc = val; -} -EXPORT_SYMBOL_GPL(virtio_transport_set_buffer_size); - -void virtio_transport_set_min_buffer_size(struct vsock_sock *vsk, u64 val) -{ - struct virtio_transport *trans = vsk->trans; - - if (val > VIRTIO_VSOCK_MAX_BUF_SIZE) - val = VIRTIO_VSOCK_MAX_BUF_SIZE; - if (val > trans->buf_size) - trans->buf_size = val; - trans->buf_size_min = val; -} -EXPORT_SYMBOL_GPL(virtio_transport_set_min_buffer_size); - -void virtio_transport_set_max_buffer_size(struct vsock_sock *vsk, u64 val) -{ - struct virtio_transport *trans = vsk->trans; - - if (val > VIRTIO_VSOCK_MAX_BUF_SIZE) - val = VIRTIO_VSOCK_MAX_BUF_SIZE; - if (val < trans->buf_size) - trans->buf_size = val; - trans->buf_size_max = val; -} -EXPORT_SYMBOL_GPL(virtio_transport_set_max_buffer_size); - -int -virtio_transport_notify_poll_in(struct vsock_sock *vsk, - size_t target, - bool *data_ready_now) -{ - if (vsock_stream_has_data(vsk)) - *data_ready_now = true; - else - *data_ready_now = false; - - return 0; -} -EXPORT_SYMBOL_GPL(virtio_transport_notify_poll_in); - -int -virtio_transport_notify_poll_out(struct vsock_sock *vsk, - size_t target, - bool *space_avail_now) -{ - s64 free_space; - - free_space = vsock_stream_has_space(vsk); - if (free_space > 0) - *space_avail_now = true; - else if (free_space == 0) - *space_avail_now = false; - - return 0; -} -EXPORT_SYMBOL_GPL(virtio_transport_notify_poll_out); - -int virtio_transport_notify_recv_init(struct vsock_sock *vsk, - size_t target, struct vsock_transport_recv_notify_data *data) -{ - return 0; -} -EXPORT_SYMBOL_GPL(virtio_transport_notify_recv_init); - -int virtio_transport_notify_recv_pre_block(struct vsock_sock *vsk, - size_t target, struct vsock_transport_recv_notify_data *data) -{ - return 0; -} -EXPORT_SYMBOL_GPL(virtio_transport_notify_recv_pre_block); - -int virtio_transport_notify_recv_pre_dequeue(struct vsock_sock *vsk, - size_t target, struct vsock_transport_recv_notify_data *data) -{ - return 0; -} -EXPORT_SYMBOL_GPL(virtio_transport_notify_recv_pre_dequeue); - -int virtio_transport_notify_recv_post_dequeue(struct vsock_sock *vsk, - size_t target, ssize_t copied, bool data_read, - struct vsock_transport_recv_notify_data 
*data) -{ - return 0; -} -EXPORT_SYMBOL_GPL(virtio_transport_notify_recv_post_dequeue); - -int virtio_transport_notify_send_init(struct vsock_sock *vsk, - struct vsock_transport_send_notify_data *data) -{ - return 0; -} -EXPORT_SYMBOL_GPL(virtio_transport_notify_send_init); - -int virtio_transport_notify_send_pre_block(struct vsock_sock *vsk, - struct vsock_transport_send_notify_data *data) -{ - return 0; -} -EXPORT_SYMBOL_GPL(virtio_transport_notify_send_pre_block); - -int virtio_transport_notify_send_pre_enqueue(struct vsock_sock *vsk, - struct vsock_transport_send_notify_data *data) -{ - return 0; -} -EXPORT_SYMBOL_GPL(virtio_transport_notify_send_pre_enqueue); - -int virtio_transport_notify_send_post_enqueue(struct vsock_sock *vsk, - ssize_t written, struct vsock_transport_send_notify_data *data) -{ - return 0; -} -EXPORT_SYMBOL_GPL(virtio_transport_notify_send_post_enqueue); - -u64 virtio_transport_stream_rcvhiwat(struct vsock_sock *vsk) -{ - struct virtio_transport *trans = vsk->trans; - - return trans->buf_size; -} -EXPORT_SYMBOL_GPL(virtio_transport_stream_rcvhiwat); - -bool virtio_transport_stream_is_active(struct vsock_sock *vsk) -{ - return true; -} -EXPORT_SYMBOL_GPL(virtio_transport_stream_is_active); - -bool virtio_transport_stream_allow(u32 cid, u32 port) -{ - return true; -} -EXPORT_SYMBOL_GPL(virtio_transport_stream_allow); - -int virtio_transport_dgram_bind(struct vsock_sock *vsk, - struct sockaddr_vm *addr) -{ - return vsock_bind_dgram_generic(vsk, addr); -} -EXPORT_SYMBOL_GPL(virtio_transport_dgram_bind); - -bool virtio_transport_dgram_allow(u32 cid, u32 port) -{ - return true; -} -EXPORT_SYMBOL_GPL(virtio_transport_dgram_allow); - -int virtio_transport_connect(struct vsock_sock *vsk) -{ - struct virtio_transport *trans = vsk->trans; - struct virtio_vsock_pkt_info info = { - .op = VIRTIO_VSOCK_OP_REQUEST, - .type = VIRTIO_VSOCK_TYPE_STREAM, - }; - - pr_debug("%s: vsk=%p send_request\n", __func__, vsk); - return trans->ops->send_pkt(vsk, &info); -} -EXPORT_SYMBOL_GPL(virtio_transport_connect); - -int virtio_transport_shutdown(struct vsock_sock *vsk, int mode) -{ - struct virtio_transport *trans = vsk->trans; - struct virtio_vsock_pkt_info info = { - .op = VIRTIO_VSOCK_OP_SHUTDOWN, - .type = VIRTIO_VSOCK_TYPE_STREAM, - .flags = (mode & RCV_SHUTDOWN ? - VIRTIO_VSOCK_SHUTDOWN_RCV : 0) | - (mode & SEND_SHUTDOWN ? 
- VIRTIO_VSOCK_SHUTDOWN_SEND : 0), - }; - - pr_debug("%s: vsk=%p: send_shutdown\n", __func__, vsk); - return trans->ops->send_pkt(vsk, &info); -} -EXPORT_SYMBOL_GPL(virtio_transport_shutdown); - -void virtio_transport_release(struct vsock_sock *vsk) -{ - struct virtio_transport *trans = vsk->trans; - struct sock *sk = &vsk->sk; - struct dgram_skb *dgram_skb; - struct dgram_skb *dgram_skb_tmp; - - pr_debug("%s: vsk=%p\n", __func__, vsk); - - /* Tell other side to terminate connection */ - if (sk->sk_type == SOCK_STREAM && sk->sk_state == SS_CONNECTED) { - virtio_transport_shutdown(vsk, SHUTDOWN_MASK); - } - - /* Free incomplete dgrams */ - lock_sock(sk); - list_for_each_entry_safe(dgram_skb, dgram_skb_tmp, - &trans->incomplete_dgrams, list) { - list_del(&dgram_skb->list); - kfree_skb(dgram_skb->skb); - kfree(dgram_skb); - sock_put(sk); /* held in virtio_transport_recv_dgram() */ - } - release_sock(sk); -} -EXPORT_SYMBOL_GPL(virtio_transport_release); - -int -virtio_transport_dgram_enqueue(struct vsock_sock *vsk, - struct sockaddr_vm *remote_addr, - struct msghdr *msg, - size_t dgram_len) -{ - struct virtio_transport *trans = vsk->trans; - struct virtio_vsock_pkt_info info = { - .op = VIRTIO_VSOCK_OP_RW, - .type = VIRTIO_VSOCK_TYPE_DGRAM, - .msg = msg, - }; - size_t total_written = 0, pkt_off = 0, written; - u16 dgram_id; - - /* The max size of a single dgram we support is 64KB */ - if (dgram_len > VIRTIO_VSOCK_MAX_DGRAM_SIZE) - return -EMSGSIZE; - - info.dgram_len = dgram_len; - vsk->remote_addr = *remote_addr; - - dgram_id = trans->dgram_id++; - - /* TODO: To optimize, if we have enough credit to send the pkt already, - * do not ask the peer to send credit to use */ - virtio_transport_send_credit_request(vsk, VIRTIO_VSOCK_TYPE_DGRAM); - - while (total_written < dgram_len) { - info.pkt_len = dgram_len - total_written; - info.flags = dgram_id << 16 | pkt_off; - written = trans->ops->send_pkt(vsk, &info); - if (written < 0) - return -ENOMEM; - if (written == 0) { - /* TODO: if written = 0, we need a sleep & wakeup - * instead of sleep */ - pr_debug("%s: SHOULD WAIT written==0", __func__); - msleep(10); - } - total_written += written; - pkt_off += written; - pr_debug("%s:id=%d, dgram_len=%zu, off=%zu, total_written=%zu, written=%zu\n", - __func__, dgram_id, dgram_len, pkt_off, total_written, written); - } - - return dgram_len; -} -EXPORT_SYMBOL_GPL(virtio_transport_dgram_enqueue); - -ssize_t -virtio_transport_stream_enqueue(struct vsock_sock *vsk, - struct msghdr *msg, - size_t len) -{ - struct virtio_transport *trans = vsk->trans; - struct virtio_vsock_pkt_info info = { - .op = VIRTIO_VSOCK_OP_RW, - .type = VIRTIO_VSOCK_TYPE_STREAM, - .msg = msg, - .pkt_len = len, - }; - - return trans->ops->send_pkt(vsk, &info); -} -EXPORT_SYMBOL_GPL(virtio_transport_stream_enqueue); - -void virtio_transport_destruct(struct vsock_sock *vsk) -{ - struct virtio_transport *trans = vsk->trans; - - pr_debug("%s: vsk=%p\n", __func__, vsk); - kfree(trans); -} -EXPORT_SYMBOL_GPL(virtio_transport_destruct); - -static int virtio_transport_send_ack(struct vsock_sock *vsk, u32 cookie) -{ - struct virtio_transport *trans = vsk->trans; - struct virtio_vsock_pkt_info info = { - .op = VIRTIO_VSOCK_OP_ACK, - .type = VIRTIO_VSOCK_TYPE_STREAM, - .flags = cpu_to_le32(cookie), - }; - - pr_debug("%s: sk=%p send_offer\n", __func__, vsk); - return trans->ops->send_pkt(vsk, &info); -} - -static int virtio_transport_send_reset(struct vsock_sock *vsk, - struct virtio_vsock_pkt *pkt) -{ - struct virtio_transport *trans = vsk->trans; - 
struct virtio_vsock_pkt_info info = { - .op = VIRTIO_VSOCK_OP_RST, - .type = VIRTIO_VSOCK_TYPE_STREAM, - }; - - pr_debug("%s\n", __func__); - - /* Send RST only if the original pkt is not a RST pkt */ - if (le16_to_cpu(pkt->hdr.op) == VIRTIO_VSOCK_OP_RST) - return 0; - - return trans->ops->send_pkt(vsk, &info); -} - -static int -virtio_transport_recv_connecting(struct sock *sk, - struct virtio_vsock_pkt *pkt) -{ - struct vsock_sock *vsk = vsock_sk(sk); - int err; - int skerr; - u32 cookie; - - pr_debug("%s: vsk=%p\n", __func__, vsk); - switch (le16_to_cpu(pkt->hdr.op)) { - case VIRTIO_VSOCK_OP_RESPONSE: - cookie = le32_to_cpu(pkt->hdr.flags); - pr_debug("%s: got RESPONSE and send ACK, cookie=%x\n", __func__, cookie); - err = virtio_transport_send_ack(vsk, cookie); - if (err < 0) { - skerr = -err; - goto destroy; - } - sk->sk_state = SS_CONNECTED; - sk->sk_socket->state = SS_CONNECTED; - vsock_insert_connected(vsk); - sk->sk_state_change(sk); - break; - case VIRTIO_VSOCK_OP_INVALID: - pr_debug("%s: got invalid\n", __func__); - break; - case VIRTIO_VSOCK_OP_RST: - pr_debug("%s: got rst\n", __func__); - skerr = ECONNRESET; - err = 0; - goto destroy; - default: - pr_debug("%s: got def\n", __func__); - skerr = EPROTO; - err = -EINVAL; - goto destroy; - } - return 0; - -destroy: - virtio_transport_send_reset(vsk, pkt); - sk->sk_state = SS_UNCONNECTED; - sk->sk_err = skerr; - sk->sk_error_report(sk); - return err; -} - -static int -virtio_transport_recv_connected(struct sock *sk, - struct virtio_vsock_pkt *pkt) -{ - struct vsock_sock *vsk = vsock_sk(sk); - struct virtio_transport *trans = vsk->trans; - int err = 0; - - switch (le16_to_cpu(pkt->hdr.op)) { - case VIRTIO_VSOCK_OP_RW: - pkt->len = le32_to_cpu(pkt->hdr.len); - pkt->off = 0; - pkt->trans = trans; - - mutex_lock(&trans->rx_lock); - virtio_transport_inc_rx_pkt(pkt); - list_add_tail(&pkt->list, &trans->rx_queue); - mutex_unlock(&trans->rx_lock); - - sk->sk_data_ready(sk); - return err; - case VIRTIO_VSOCK_OP_CREDIT_UPDATE: - sk->sk_write_space(sk); - break; - case VIRTIO_VSOCK_OP_SHUTDOWN: - pr_debug("%s: got shutdown\n", __func__); - if (le32_to_cpu(pkt->hdr.flags) & VIRTIO_VSOCK_SHUTDOWN_RCV) - vsk->peer_shutdown |= RCV_SHUTDOWN; - if (le32_to_cpu(pkt->hdr.flags) & VIRTIO_VSOCK_SHUTDOWN_SEND) - vsk->peer_shutdown |= SEND_SHUTDOWN; - if (le32_to_cpu(pkt->hdr.flags)) - sk->sk_state_change(sk); - break; - case VIRTIO_VSOCK_OP_RST: - pr_debug("%s: got rst\n", __func__); - sock_set_flag(sk, SOCK_DONE); - vsk->peer_shutdown = SHUTDOWN_MASK; - if (vsock_stream_has_data(vsk) <= 0) - sk->sk_state = SS_DISCONNECTING; - sk->sk_state_change(sk); - break; - default: - err = -EINVAL; - break; - } - - virtio_transport_free_pkt(pkt); - return err; -} - -static int -virtio_transport_send_response(struct vsock_sock *vsk, - struct virtio_vsock_pkt *pkt) -{ - struct virtio_transport *trans = vsk->trans; - struct virtio_vsock_pkt_info info = { - .op = VIRTIO_VSOCK_OP_RESPONSE, - .type = VIRTIO_VSOCK_TYPE_STREAM, - .remote_cid = le32_to_cpu(pkt->hdr.src_cid), - .remote_port = le32_to_cpu(pkt->hdr.src_port), - }; - u32 cookie; - - cookie = virtio_vsock_secure_cookie(le32_to_cpu(pkt->hdr.src_cid), - le32_to_cpu(pkt->hdr.dst_cid), - le32_to_cpu(pkt->hdr.src_port), - le32_to_cpu(pkt->hdr.dst_port), - jiffies / (HZ * 60)); - info.flags = cpu_to_le32(cookie); - - pr_debug("%s: send_response, cookie=%x\n", __func__, le32_to_cpu(cookie)); - - return trans->ops->send_pkt(vsk, &info); -} - -/* Handle server socket */ -static int -virtio_transport_recv_listen(struct 
sock *sk, struct virtio_vsock_pkt *pkt) -{ - struct vsock_sock *vsk = vsock_sk(sk); - struct vsock_sock *vpending; - struct sock *pending; - int err; - u32 cookie; - - switch (le16_to_cpu(pkt->hdr.op)) { - case VIRTIO_VSOCK_OP_REQUEST: - err = virtio_transport_send_response(vsk, pkt); - if (err < 0) { - // FIXME vsk should be vpending - virtio_transport_send_reset(vsk, pkt); - return err; - } - break; - case VIRTIO_VSOCK_OP_ACK: - cookie = le32_to_cpu(pkt->hdr.flags); - err = virtio_vsock_check_cookie(le32_to_cpu(pkt->hdr.src_cid), - le32_to_cpu(pkt->hdr.dst_cid), - le32_to_cpu(pkt->hdr.src_port), - le32_to_cpu(pkt->hdr.dst_port), - jiffies / (HZ * 60), - le32_to_cpu(pkt->hdr.flags), - VSOCK_TIMEOUT_INIT); - pr_debug("%s: cookie=%x, err=%d\n", __func__, cookie, err); - if (err) - return err; - - /* So no pending socket are responsible for this pkt, create one */ - pr_debug("%s: create pending\n", __func__); - pending = __vsock_create(sock_net(sk), NULL, sk, GFP_KERNEL, - sk->sk_type, 0); - if (!pending) { - virtio_transport_send_reset(vsk, pkt); - return -ENOMEM; - } - sk->sk_ack_backlog++; - pending->sk_state = SS_CONNECTING; - - vpending = vsock_sk(pending); - vsock_addr_init(&vpending->local_addr, le32_to_cpu(pkt->hdr.dst_cid), - le32_to_cpu(pkt->hdr.dst_port)); - vsock_addr_init(&vpending->remote_addr, le32_to_cpu(pkt->hdr.src_cid), - le32_to_cpu(pkt->hdr.src_port)); - vsock_add_pending(sk, pending); - - pr_debug("%s: get pending\n", __func__); - pending = virtio_transport_get_pending(sk, pkt); - vpending = vsock_sk(pending); - lock_sock(pending); - switch (pending->sk_state) { - case SS_CONNECTING: - if (le16_to_cpu(pkt->hdr.op) != VIRTIO_VSOCK_OP_ACK) { - pr_debug("%s: op=%d != OP_ACK\n", __func__, - le16_to_cpu(pkt->hdr.op)); - virtio_transport_send_reset(vpending, pkt); - pending->sk_err = EPROTO; - pending->sk_state = SS_UNCONNECTED; - sock_put(pending); - } else { - pending->sk_state = SS_CONNECTED; - vsock_insert_connected(vpending); - - vsock_remove_pending(sk, pending); - vsock_enqueue_accept(sk, pending); - - sk->sk_data_ready(sk); - } - err = 0; - break; - default: - pr_debug("%s: sk->sk_ack_backlog=%d\n", __func__, - sk->sk_ack_backlog); - virtio_transport_send_reset(vpending, pkt); - err = -EINVAL; - break; - } - if (err < 0) - vsock_remove_pending(sk, pending); - release_sock(pending); - - /* Release refcnt obtained in virtio_transport_get_pending */ - sock_put(pending); - break; - default: - break; - } - - return 0; -} - -static void virtio_transport_space_update(struct sock *sk, - struct virtio_vsock_pkt *pkt) -{ - struct vsock_sock *vsk = vsock_sk(sk); - struct virtio_transport *trans = vsk->trans; - bool space_available; - - /* buf_alloc and fwd_cnt is always included in the hdr */ - mutex_lock(&trans->tx_lock); - trans->peer_buf_alloc = le32_to_cpu(pkt->hdr.buf_alloc); - trans->peer_fwd_cnt = le32_to_cpu(pkt->hdr.fwd_cnt); - space_available = virtio_transport_has_space(vsk); - mutex_unlock(&trans->tx_lock); - - if (space_available) - sk->sk_write_space(sk); -} - -/* We are under the virtio-vsock's vsock->rx_lock or - * vhost-vsock's vq->mutex lock */ -void virtio_transport_recv_pkt(struct virtio_vsock_pkt *pkt) -{ - struct virtio_transport *trans; - struct sockaddr_vm src, dst; - struct vsock_sock *vsk; - struct sock *sk; - - vsock_addr_init(&src, le32_to_cpu(pkt->hdr.src_cid), le32_to_cpu(pkt->hdr.src_port)); - vsock_addr_init(&dst, le32_to_cpu(pkt->hdr.dst_cid), le32_to_cpu(pkt->hdr.dst_port)); - - virtio_vsock_dumppkt(__func__, pkt); - - if 
(le16_to_cpu(pkt->hdr.type) == VIRTIO_VSOCK_TYPE_DGRAM) { - sk = vsock_find_unbound_socket(&dst); - if (!sk) - goto free_pkt; - - vsk = vsock_sk(sk); - trans = vsk->trans; - BUG_ON(!trans); - - virtio_transport_space_update(sk, pkt); - - lock_sock(sk); - switch (le16_to_cpu(pkt->hdr.op)) { - case VIRTIO_VSOCK_OP_CREDIT_UPDATE: - virtio_transport_free_pkt(pkt); - break; - case VIRTIO_VSOCK_OP_CREDIT_REQUEST: - virtio_transport_send_credit_update(vsk, VIRTIO_VSOCK_TYPE_DGRAM, - &pkt->hdr); - virtio_transport_free_pkt(pkt); - break; - case VIRTIO_VSOCK_OP_RW: - virtio_transport_recv_dgram(sk, pkt); - break; - default: - virtio_transport_free_pkt(pkt); - break; - } - release_sock(sk); - - /* Release refcnt obtained when we fetched this socket out of - * the unbound list. - */ - sock_put(sk); - return; - } else if (le16_to_cpu(pkt->hdr.type) == VIRTIO_VSOCK_TYPE_STREAM) { - /* The socket must be in connected or bound table - * otherwise send reset back - */ - sk = vsock_find_connected_socket(&src, &dst); - if (!sk) { - sk = vsock_find_bound_socket(&dst); - if (!sk) { - pr_debug("%s: can not find bound_socket\n", __func__); - virtio_vsock_dumppkt(__func__, pkt); - /* Ignore this pkt instead of sending reset back */ - /* TODO send a RST unless this packet is a RST (to avoid infinite loops) */ - goto free_pkt; - } - } - - vsk = vsock_sk(sk); - trans = vsk->trans; - BUG_ON(!trans); - - virtio_transport_space_update(sk, pkt); - - lock_sock(sk); - switch (sk->sk_state) { - case VSOCK_SS_LISTEN: - virtio_transport_recv_listen(sk, pkt); - virtio_transport_free_pkt(pkt); - break; - case SS_CONNECTING: - virtio_transport_recv_connecting(sk, pkt); - virtio_transport_free_pkt(pkt); - break; - case SS_CONNECTED: - virtio_transport_recv_connected(sk, pkt); - break; - default: - virtio_transport_free_pkt(pkt); - break; - } - release_sock(sk); - - /* Release refcnt obtained when we fetched this socket out of the - * bound or connected list. - */ - sock_put(sk); - } - return; - -free_pkt: - virtio_transport_free_pkt(pkt); -} -EXPORT_SYMBOL_GPL(virtio_transport_recv_pkt); - -void virtio_transport_free_pkt(struct virtio_vsock_pkt *pkt) -{ - kfree(pkt->buf); - kfree(pkt); -} -EXPORT_SYMBOL_GPL(virtio_transport_free_pkt); - -static int __init virtio_vsock_common_init(void) -{ - get_random_bytes(vsockcookie_secret, sizeof(vsockcookie_secret)); - return 0; -} - -static void __exit virtio_vsock_common_exit(void) -{ -} - -module_init(virtio_vsock_common_init); -module_exit(virtio_vsock_common_exit); -MODULE_LICENSE("GPL v2"); -MODULE_AUTHOR("Asias He"); -MODULE_DESCRIPTION("common code for virtio vsock"); -- cgit v1.2.3-71-gd317 From 2a56a1fec290bf0bc4676bbf4efdb3744953a3e7 Mon Sep 17 00:00:00 2001 From: Tejun Heo Date: Mon, 7 Dec 2015 17:38:52 -0500 Subject: net: wrap sock->sk_cgrp_prioidx and ->sk_classid inside a struct Introduce sock->sk_cgrp_data which is a struct sock_cgroup_data. ->sk_cgroup_prioidx and ->sk_classid are moved into it. The struct and its accessors are defined in cgroup-defs.h. This is to prepare for overloading the fields with a cgroup pointer. This patch mostly performs equivalent conversions but the followings are noteworthy. * Equality test before updating classid is removed from sock_update_classid(). This shouldn't make any noticeable difference and a similar test will be implemented on the helper side later. * sock_update_netprioidx() now takes struct sock_cgroup_data and can be moved to netprio_cgroup.h without causing include dependency loop. Moved. 
* The dummy version of sock_update_netprioidx() converted to a static inline function while at it. Signed-off-by: Tejun Heo Signed-off-by: David S. Miller --- include/linux/cgroup-defs.h | 36 ++++++++++++++++++++++++++++++++++++ include/net/cls_cgroup.h | 11 +++++------ include/net/netprio_cgroup.h | 16 +++++++++++++--- include/net/sock.h | 11 +++-------- net/Kconfig | 6 ++++++ net/core/dev.c | 3 ++- net/core/netclassid_cgroup.c | 4 ++-- net/core/netprio_cgroup.c | 3 ++- net/core/scm.c | 4 ++-- net/core/sock.c | 15 ++------------- net/netfilter/nft_meta.c | 2 +- net/netfilter/xt_cgroup.c | 3 ++- 12 files changed, 76 insertions(+), 38 deletions(-) (limited to 'include/linux') diff --git a/include/linux/cgroup-defs.h b/include/linux/cgroup-defs.h index 504d8591b6d3..ed128fed0335 100644 --- a/include/linux/cgroup-defs.h +++ b/include/linux/cgroup-defs.h @@ -542,4 +542,40 @@ static inline void cgroup_threadgroup_change_end(struct task_struct *tsk) {} #endif /* CONFIG_CGROUPS */ +#ifdef CONFIG_SOCK_CGROUP_DATA + +struct sock_cgroup_data { + u16 prioidx; + u32 classid; +}; + +static inline u16 sock_cgroup_prioidx(struct sock_cgroup_data *skcd) +{ + return skcd->prioidx; +} + +static inline u32 sock_cgroup_classid(struct sock_cgroup_data *skcd) +{ + return skcd->classid; +} + +static inline void sock_cgroup_set_prioidx(struct sock_cgroup_data *skcd, + u16 prioidx) +{ + skcd->prioidx = prioidx; +} + +static inline void sock_cgroup_set_classid(struct sock_cgroup_data *skcd, + u32 classid) +{ + skcd->classid = classid; +} + +#else /* CONFIG_SOCK_CGROUP_DATA */ + +struct sock_cgroup_data { +}; + +#endif /* CONFIG_SOCK_CGROUP_DATA */ + #endif /* _LINUX_CGROUP_DEFS_H */ diff --git a/include/net/cls_cgroup.h b/include/net/cls_cgroup.h index ccd6d8bffa4d..c0a92e2c286d 100644 --- a/include/net/cls_cgroup.h +++ b/include/net/cls_cgroup.h @@ -41,13 +41,12 @@ static inline u32 task_cls_classid(struct task_struct *p) return classid; } -static inline void sock_update_classid(struct sock *sk) +static inline void sock_update_classid(struct sock_cgroup_data *skcd) { u32 classid; classid = task_cls_classid(current); - if (classid != sk->sk_classid) - sk->sk_classid = classid; + sock_cgroup_set_classid(skcd, classid); } static inline u32 task_get_classid(const struct sk_buff *skb) @@ -64,17 +63,17 @@ static inline u32 task_get_classid(const struct sk_buff *skb) * softirqs always disables bh. */ if (in_serving_softirq()) { - /* If there is an sk_classid we'll use that. */ + /* If there is an sock_cgroup_classid we'll use that. 
*/ if (!skb->sk) return 0; - classid = skb->sk->sk_classid; + classid = sock_cgroup_classid(&skb->sk->sk_cgrp_data); } return classid; } #else /* !CONFIG_CGROUP_NET_CLASSID */ -static inline void sock_update_classid(struct sock *sk) +static inline void sock_update_classid(struct sock_cgroup_data *skcd) { } diff --git a/include/net/netprio_cgroup.h b/include/net/netprio_cgroup.h index f2a9597ff53c..604190596cde 100644 --- a/include/net/netprio_cgroup.h +++ b/include/net/netprio_cgroup.h @@ -25,8 +25,6 @@ struct netprio_map { u32 priomap[]; }; -void sock_update_netprioidx(struct sock *sk); - static inline u32 task_netprioidx(struct task_struct *p) { struct cgroup_subsys_state *css; @@ -38,13 +36,25 @@ static inline u32 task_netprioidx(struct task_struct *p) rcu_read_unlock(); return idx; } + +static inline void sock_update_netprioidx(struct sock_cgroup_data *skcd) +{ + if (in_interrupt()) + return; + + sock_cgroup_set_prioidx(skcd, task_netprioidx(current)); +} + #else /* !CONFIG_CGROUP_NET_PRIO */ + static inline u32 task_netprioidx(struct task_struct *p) { return 0; } -#define sock_update_netprioidx(sk) +static inline void sock_update_netprioidx(struct sock_cgroup_data *skcd) +{ +} #endif /* CONFIG_CGROUP_NET_PRIO */ #endif /* _NET_CLS_CGROUP_H */ diff --git a/include/net/sock.h b/include/net/sock.h index a95bcf7d6efa..0ca22b014de1 100644 --- a/include/net/sock.h +++ b/include/net/sock.h @@ -59,6 +59,7 @@ #include #include #include +#include #include #include @@ -308,8 +309,7 @@ struct cg_proto; * @sk_send_head: front of stuff to transmit * @sk_security: used by security modules * @sk_mark: generic packet mark - * @sk_cgrp_prioidx: socket group's priority map index - * @sk_classid: this socket's cgroup classid + * @sk_cgrp_data: cgroup data for this cgroup * @sk_cgrp: this socket's cgroup-specific proto data * @sk_write_pending: a write to stream socket waits to start * @sk_state_change: callback to indicate change in the state of the sock @@ -443,12 +443,7 @@ struct sock { #ifdef CONFIG_SECURITY void *sk_security; #endif -#if IS_ENABLED(CONFIG_CGROUP_NET_PRIO) - u16 sk_cgrp_prioidx; -#endif -#ifdef CONFIG_CGROUP_NET_CLASSID - u32 sk_classid; -#endif + struct sock_cgroup_data sk_cgrp_data; struct cg_proto *sk_cgrp; void (*sk_state_change)(struct sock *sk); void (*sk_data_ready)(struct sock *sk); diff --git a/net/Kconfig b/net/Kconfig index 127da94ae25e..11f8c22af34d 100644 --- a/net/Kconfig +++ b/net/Kconfig @@ -250,9 +250,14 @@ config XPS depends on SMP default y +config SOCK_CGROUP_DATA + bool + default n + config CGROUP_NET_PRIO bool "Network priority cgroup" depends on CGROUPS + select SOCK_CGROUP_DATA ---help--- Cgroup subsystem for use in assigning processes to network priorities on a per-interface basis. @@ -260,6 +265,7 @@ config CGROUP_NET_PRIO config CGROUP_NET_CLASSID bool "Network classid cgroup" depends on CGROUPS + select SOCK_CGROUP_DATA ---help--- Cgroup subsystem for use as general purpose socket classid marker that is being used in cls_cgroup and for netfilter matching. 
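A short aside before the next hunk: the point of funneling every access through the new sock_cgroup_prioidx()/sock_cgroup_classid() accessors is that the storage behind struct sock_cgroup_data can later change shape without touching any call site, which is exactly what the follow-up patch below does. Here is a minimal, self-contained sketch of that accessor pattern; the struct layout and helper names mirror this patch, while demo_sock and main() are purely illustrative stand-ins, not kernel code.

#include <stdint.h>
#include <stdio.h>

/* Same layout as the CONFIG_SOCK_CGROUP_DATA variant introduced above. */
struct sock_cgroup_data {
	uint16_t prioidx;	/* net_prio priority map index */
	uint32_t classid;	/* net_cls classid */
};

static inline uint16_t sock_cgroup_prioidx(struct sock_cgroup_data *skcd)
{
	return skcd->prioidx;
}

static inline uint32_t sock_cgroup_classid(struct sock_cgroup_data *skcd)
{
	return skcd->classid;
}

static inline void sock_cgroup_set_prioidx(struct sock_cgroup_data *skcd,
					   uint16_t prioidx)
{
	skcd->prioidx = prioidx;
}

/* Illustrative stand-in for the part of struct sock this patch touches. */
struct demo_sock {
	struct sock_cgroup_data sk_cgrp_data;
};

int main(void)
{
	struct demo_sock sk = { { 0, 0 } };

	/* Callers no longer poke a bare sk_cgrp_prioidx field directly... */
	sock_cgroup_set_prioidx(&sk.sk_cgrp_data, 7);

	/* ...so readers keep working even if the storage is overloaded
	 * later, as long as the accessors change in one place. */
	printf("prioidx=%u classid=%u\n",
	       (unsigned)sock_cgroup_prioidx(&sk.sk_cgrp_data),
	       (unsigned)sock_cgroup_classid(&sk.sk_cgrp_data));
	return 0;
}

This indirection is what allows the next patch in the series to overload the same bytes with a cgroup pointer while keeping every consumer unchanged.
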
diff --git a/net/core/dev.c b/net/core/dev.c index e5c395473eba..8f705fcedb94 100644 --- a/net/core/dev.c +++ b/net/core/dev.c @@ -2929,7 +2929,8 @@ static void skb_update_prio(struct sk_buff *skb) struct netprio_map *map = rcu_dereference_bh(skb->dev->priomap); if (!skb->priority && skb->sk && map) { - unsigned int prioidx = skb->sk->sk_cgrp_prioidx; + unsigned int prioidx = + sock_cgroup_prioidx(&skb->sk->sk_cgrp_data); if (prioidx < map->priomap_len) skb->priority = map->priomap[prioidx]; diff --git a/net/core/netclassid_cgroup.c b/net/core/netclassid_cgroup.c index 2e4df84c34a1..e60ded46b3ac 100644 --- a/net/core/netclassid_cgroup.c +++ b/net/core/netclassid_cgroup.c @@ -62,8 +62,8 @@ static int update_classid_sock(const void *v, struct file *file, unsigned n) struct socket *sock = sock_from_file(file, &err); if (sock) - sock->sk->sk_classid = (u32)(unsigned long)v; - + sock_cgroup_set_classid(&sock->sk->sk_cgrp_data, + (unsigned long)v); return 0; } diff --git a/net/core/netprio_cgroup.c b/net/core/netprio_cgroup.c index 2b9159b7a28a..de42aa7f6c77 100644 --- a/net/core/netprio_cgroup.c +++ b/net/core/netprio_cgroup.c @@ -223,7 +223,8 @@ static int update_netprio(const void *v, struct file *file, unsigned n) int err; struct socket *sock = sock_from_file(file, &err); if (sock) - sock->sk->sk_cgrp_prioidx = (u32)(unsigned long)v; + sock_cgroup_set_prioidx(&sock->sk->sk_cgrp_data, + (unsigned long)v); return 0; } diff --git a/net/core/scm.c b/net/core/scm.c index 8a1741b14302..14596fb37172 100644 --- a/net/core/scm.c +++ b/net/core/scm.c @@ -289,8 +289,8 @@ void scm_detach_fds(struct msghdr *msg, struct scm_cookie *scm) /* Bump the usage count and install the file. */ sock = sock_from_file(fp[i], &err); if (sock) { - sock_update_netprioidx(sock->sk); - sock_update_classid(sock->sk); + sock_update_netprioidx(&sock->sk->sk_cgrp_data); + sock_update_classid(&sock->sk->sk_cgrp_data); } fd_install(new_fd, get_file(fp[i])); } diff --git a/net/core/sock.c b/net/core/sock.c index 7965ef487375..947741dc43fa 100644 --- a/net/core/sock.c +++ b/net/core/sock.c @@ -1393,17 +1393,6 @@ static void sk_prot_free(struct proto *prot, struct sock *sk) module_put(owner); } -#if IS_ENABLED(CONFIG_CGROUP_NET_PRIO) -void sock_update_netprioidx(struct sock *sk) -{ - if (in_interrupt()) - return; - - sk->sk_cgrp_prioidx = task_netprioidx(current); -} -EXPORT_SYMBOL_GPL(sock_update_netprioidx); -#endif - /** * sk_alloc - All socket objects are allocated here * @net: the applicable net namespace @@ -1432,8 +1421,8 @@ struct sock *sk_alloc(struct net *net, int family, gfp_t priority, sock_net_set(sk, net); atomic_set(&sk->sk_wmem_alloc, 1); - sock_update_classid(sk); - sock_update_netprioidx(sk); + sock_update_classid(&sk->sk_cgrp_data); + sock_update_netprioidx(&sk->sk_cgrp_data); } return sk; diff --git a/net/netfilter/nft_meta.c b/net/netfilter/nft_meta.c index 9dfaf4d55ee0..1915cab7f32d 100644 --- a/net/netfilter/nft_meta.c +++ b/net/netfilter/nft_meta.c @@ -174,7 +174,7 @@ void nft_meta_get_eval(const struct nft_expr *expr, sk = skb_to_full_sk(skb); if (!sk || !sk_fullsock(sk)) goto err; - *dest = sk->sk_classid; + *dest = sock_cgroup_classid(&sk->sk_cgrp_data); break; #endif default: diff --git a/net/netfilter/xt_cgroup.c b/net/netfilter/xt_cgroup.c index a1d126f29463..54eaeb45ce99 100644 --- a/net/netfilter/xt_cgroup.c +++ b/net/netfilter/xt_cgroup.c @@ -42,7 +42,8 @@ cgroup_mt(const struct sk_buff *skb, struct xt_action_param *par) if (skb->sk == NULL || !sk_fullsock(skb->sk)) return false; - return (info->id 
== skb->sk->sk_classid) ^ info->invert; + return (info->id == sock_cgroup_classid(&skb->sk->sk_cgrp_data)) ^ + info->invert; } static struct xt_match cgroup_mt_reg __read_mostly = { -- cgit v1.2.3-71-gd317 From bd1060a1d67128bb8fbe2e1384c518912cbe54e7 Mon Sep 17 00:00:00 2001 From: Tejun Heo Date: Mon, 7 Dec 2015 17:38:53 -0500 Subject: sock, cgroup: add sock->sk_cgroup In cgroup v1, dealing with cgroup membership was difficult because the number of membership associations was unbound. As a result, cgroup v1 grew several controllers whose primary purpose is either tagging membership or pull in configuration knobs from other subsystems so that cgroup membership test can be avoided. net_cls and net_prio controllers are examples of the latter. They allow configuring network-specific attributes from cgroup side so that network subsystem can avoid testing cgroup membership; unfortunately, these are not only cumbersome but also problematic. Both net_cls and net_prio aren't properly hierarchical. Both inherit configuration from the parent on creation but there's no interaction afterwards. An ancestor doesn't restrict the behavior in its subtree in anyway and configuration changes aren't propagated downwards. Especially when combined with cgroup delegation, this is problematic because delegatees can mess up whatever network configuration implemented at the system level. net_prio would allow the delegatees to set whatever priority value regardless of CAP_NET_ADMIN and net_cls the same for classid. While it is possible to solve these issues from controller side by implementing hierarchical allowable ranges in both controllers, it would involve quite a bit of complexity in the controllers and further obfuscate network configuration as it becomes even more difficult to tell what's actually being configured looking from the network side. While not much can be done for v1 at this point, as membership handling is sane on cgroup v2, it'd be better to make cgroup matching behave like other network matches and classifiers than introducing further complications. In preparation, this patch updates sock->sk_cgrp_data handling so that it points to the v2 cgroup that sock was created in until either net_prio or net_cls is used. Once either of the two is used, sock->sk_cgrp_data reverts to its previous role of carrying prioidx and classid. This is to avoid adding yet another cgroup related field to struct sock. As the mode switching can happen at most once per boot, the switching mechanism is aimed at lowering hot path overhead. It may leak a finite, likely small, number of cgroup refs and report spurious prioidx or classid on switching; however, dynamic updates of prioidx and classid have always been racy and lossy - socks between creation and fd installation are never updated, config changes don't update existing sockets at all, and prioidx may index with dead and recycled cgroup IDs. Non-critical inaccuracies from small race windows won't make any noticeable difference. This patch doesn't make use of the pointer yet. The following patch will implement netfilter match for cgroup2 membership. v2: Use sock_cgroup_data to avoid inflating struct sock w/ another cgroup specific field. v3: Add comments explaining why sock_data_prioidx() and sock_data_classid() use different fallback values. Signed-off-by: Tejun Heo Cc: Daniel Borkmann Cc: Daniel Wagner CC: Neil Horman Signed-off-by: David S. 
Miller --- include/linux/cgroup-defs.h | 88 +++++++++++++++++++++++++++++++++++++++++--- include/linux/cgroup.h | 41 +++++++++++++++++++++ kernel/cgroup.c | 55 ++++++++++++++++++++++++++- net/core/netclassid_cgroup.c | 7 +++- net/core/netprio_cgroup.c | 7 +++- net/core/sock.c | 2 + 6 files changed, 191 insertions(+), 9 deletions(-) (limited to 'include/linux') diff --git a/include/linux/cgroup-defs.h b/include/linux/cgroup-defs.h index ed128fed0335..9dc226345e4e 100644 --- a/include/linux/cgroup-defs.h +++ b/include/linux/cgroup-defs.h @@ -544,31 +544,107 @@ static inline void cgroup_threadgroup_change_end(struct task_struct *tsk) {} #ifdef CONFIG_SOCK_CGROUP_DATA +/* + * sock_cgroup_data is embedded at sock->sk_cgrp_data and contains + * per-socket cgroup information except for memcg association. + * + * On legacy hierarchies, net_prio and net_cls controllers directly set + * attributes on each sock which can then be tested by the network layer. + * On the default hierarchy, each sock is associated with the cgroup it was + * created in and the networking layer can match the cgroup directly. + * + * To avoid carrying all three cgroup related fields separately in sock, + * sock_cgroup_data overloads (prioidx, classid) and the cgroup pointer. + * On boot, sock_cgroup_data records the cgroup that the sock was created + * in so that cgroup2 matches can be made; however, once either net_prio or + * net_cls starts being used, the area is overriden to carry prioidx and/or + * classid. The two modes are distinguished by whether the lowest bit is + * set. Clear bit indicates cgroup pointer while set bit prioidx and + * classid. + * + * While userland may start using net_prio or net_cls at any time, once + * either is used, cgroup2 matching no longer works. There is no reason to + * mix the two and this is in line with how legacy and v2 compatibility is + * handled. On mode switch, cgroup references which are already being + * pointed to by socks may be leaked. While this can be remedied by adding + * synchronization around sock_cgroup_data, given that the number of leaked + * cgroups is bound and highly unlikely to be high, this seems to be the + * better trade-off. + */ struct sock_cgroup_data { - u16 prioidx; - u32 classid; + union { +#ifdef __LITTLE_ENDIAN + struct { + u8 is_data; + u8 padding; + u16 prioidx; + u32 classid; + } __packed; +#else + struct { + u32 classid; + u16 prioidx; + u8 padding; + u8 is_data; + } __packed; +#endif + u64 val; + }; }; +/* + * There's a theoretical window where the following accessors race with + * updaters and return part of the previous pointer as the prioidx or + * classid. Such races are short-lived and the result isn't critical. + */ static inline u16 sock_cgroup_prioidx(struct sock_cgroup_data *skcd) { - return skcd->prioidx; + /* fallback to 1 which is always the ID of the root cgroup */ + return (skcd->is_data & 1) ? skcd->prioidx : 1; } static inline u32 sock_cgroup_classid(struct sock_cgroup_data *skcd) { - return skcd->classid; + /* fallback to 0 which is the unconfigured default classid */ + return (skcd->is_data & 1) ? skcd->classid : 0; } +/* + * If invoked concurrently, the updaters may clobber each other. The + * caller is responsible for synchronization. 
+ */ static inline void sock_cgroup_set_prioidx(struct sock_cgroup_data *skcd, u16 prioidx) { - skcd->prioidx = prioidx; + struct sock_cgroup_data skcd_buf = { .val = READ_ONCE(skcd->val) }; + + if (sock_cgroup_prioidx(&skcd_buf) == prioidx) + return; + + if (!(skcd_buf.is_data & 1)) { + skcd_buf.val = 0; + skcd_buf.is_data = 1; + } + + skcd_buf.prioidx = prioidx; + WRITE_ONCE(skcd->val, skcd_buf.val); /* see sock_cgroup_ptr() */ } static inline void sock_cgroup_set_classid(struct sock_cgroup_data *skcd, u32 classid) { - skcd->classid = classid; + struct sock_cgroup_data skcd_buf = { .val = READ_ONCE(skcd->val) }; + + if (sock_cgroup_classid(&skcd_buf) == classid) + return; + + if (!(skcd_buf.is_data & 1)) { + skcd_buf.val = 0; + skcd_buf.is_data = 1; + } + + skcd_buf.classid = classid; + WRITE_ONCE(skcd->val, skcd_buf.val); /* see sock_cgroup_ptr() */ } #else /* CONFIG_SOCK_CGROUP_DATA */ diff --git a/include/linux/cgroup.h b/include/linux/cgroup.h index 4c3ffab81ba7..a8ba1ea0ea5a 100644 --- a/include/linux/cgroup.h +++ b/include/linux/cgroup.h @@ -578,4 +578,45 @@ static inline int cgroup_init(void) { return 0; } #endif /* !CONFIG_CGROUPS */ +/* + * sock->sk_cgrp_data handling. For more info, see sock_cgroup_data + * definition in cgroup-defs.h. + */ +#ifdef CONFIG_SOCK_CGROUP_DATA + +#if defined(CONFIG_CGROUP_NET_PRIO) || defined(CONFIG_CGROUP_NET_CLASSID) +extern spinlock_t cgroup_sk_update_lock; +#endif + +void cgroup_sk_alloc_disable(void); +void cgroup_sk_alloc(struct sock_cgroup_data *skcd); +void cgroup_sk_free(struct sock_cgroup_data *skcd); + +static inline struct cgroup *sock_cgroup_ptr(struct sock_cgroup_data *skcd) +{ +#if defined(CONFIG_CGROUP_NET_PRIO) || defined(CONFIG_CGROUP_NET_CLASSID) + unsigned long v; + + /* + * @skcd->val is 64bit but the following is safe on 32bit too as we + * just need the lower ulong to be written and read atomically. + */ + v = READ_ONCE(skcd->val); + + if (v & 1) + return &cgrp_dfl_root.cgrp; + + return (struct cgroup *)(unsigned long)v ?: &cgrp_dfl_root.cgrp; +#else + return (struct cgroup *)(unsigned long)skcd->val; +#endif +} + +#else /* CONFIG_CGROUP_DATA */ + +static inline void cgroup_sk_alloc(struct sock_cgroup_data *skcd) {} +static inline void cgroup_sk_free(struct sock_cgroup_data *skcd) {} + +#endif /* CONFIG_CGROUP_DATA */ + #endif /* _LINUX_CGROUP_H */ diff --git a/kernel/cgroup.c b/kernel/cgroup.c index 3db5e8f5b702..4f8f7927b422 100644 --- a/kernel/cgroup.c +++ b/kernel/cgroup.c @@ -57,8 +57,8 @@ #include /* TODO: replace with more sophisticated array */ #include #include - #include +#include /* * pidlists linger the following amount before being destroyed. The goal @@ -5782,6 +5782,59 @@ struct cgroup *cgroup_get_from_path(const char *path) } EXPORT_SYMBOL_GPL(cgroup_get_from_path); +/* + * sock->sk_cgrp_data handling. For more info, see sock_cgroup_data + * definition in cgroup-defs.h. 
+ */ +#ifdef CONFIG_SOCK_CGROUP_DATA + +#if defined(CONFIG_CGROUP_NET_PRIO) || defined(CONFIG_CGROUP_NET_CLASSID) + +spinlock_t cgroup_sk_update_lock; +static bool cgroup_sk_alloc_disabled __read_mostly; + +void cgroup_sk_alloc_disable(void) +{ + if (cgroup_sk_alloc_disabled) + return; + pr_info("cgroup: disabling cgroup2 socket matching due to net_prio or net_cls activation\n"); + cgroup_sk_alloc_disabled = true; +} + +#else + +#define cgroup_sk_alloc_disabled false + +#endif + +void cgroup_sk_alloc(struct sock_cgroup_data *skcd) +{ + if (cgroup_sk_alloc_disabled) + return; + + rcu_read_lock(); + + while (true) { + struct css_set *cset; + + cset = task_css_set(current); + if (likely(cgroup_tryget(cset->dfl_cgrp))) { + skcd->val = (unsigned long)cset->dfl_cgrp; + break; + } + cpu_relax(); + } + + rcu_read_unlock(); +} + +void cgroup_sk_free(struct sock_cgroup_data *skcd) +{ + cgroup_put(sock_cgroup_ptr(skcd)); +} + +#endif /* CONFIG_SOCK_CGROUP_DATA */ + #ifdef CONFIG_CGROUP_DEBUG static struct cgroup_subsys_state * debug_css_alloc(struct cgroup_subsys_state *parent_css) diff --git a/net/core/netclassid_cgroup.c b/net/core/netclassid_cgroup.c index e60ded46b3ac..04257a0e3534 100644 --- a/net/core/netclassid_cgroup.c +++ b/net/core/netclassid_cgroup.c @@ -61,9 +61,12 @@ static int update_classid_sock(const void *v, struct file *file, unsigned n) int err; struct socket *sock = sock_from_file(file, &err); - if (sock) + if (sock) { + spin_lock(&cgroup_sk_update_lock); sock_cgroup_set_classid(&sock->sk->sk_cgrp_data, (unsigned long)v); + spin_unlock(&cgroup_sk_update_lock); + } return 0; } @@ -98,6 +101,8 @@ static int write_classid(struct cgroup_subsys_state *css, struct cftype *cft, { struct cgroup_cls_state *cs = css_cls_state(css); + cgroup_sk_alloc_disable(); + cs->classid = (u32)value; update_classid(css, (void *)(unsigned long)cs->classid); diff --git a/net/core/netprio_cgroup.c b/net/core/netprio_cgroup.c index de42aa7f6c77..053d60c33395 100644 --- a/net/core/netprio_cgroup.c +++ b/net/core/netprio_cgroup.c @@ -209,6 +209,8 @@ static ssize_t write_priomap(struct kernfs_open_file *of, if (!dev) return -ENODEV; + cgroup_sk_alloc_disable(); + rtnl_lock(); ret = netprio_set_prio(of_css(of), dev, prio); @@ -222,9 +224,12 @@ static int update_netprio(const void *v, struct file *file, unsigned n) { int err; struct socket *sock = sock_from_file(file, &err); - if (sock) + if (sock) { + spin_lock(&cgroup_sk_update_lock); sock_cgroup_set_prioidx(&sock->sk->sk_cgrp_data, (unsigned long)v); + spin_unlock(&cgroup_sk_update_lock); + } return 0; } diff --git a/net/core/sock.c b/net/core/sock.c index 947741dc43fa..1278d7b7bd9a 100644 --- a/net/core/sock.c +++ b/net/core/sock.c @@ -1363,6 +1363,7 @@ static struct sock *sk_prot_alloc(struct proto *prot, gfp_t priority, if (!try_module_get(prot->owner)) goto out_free_sec; sk_tx_queue_clear(sk); + cgroup_sk_alloc(&sk->sk_cgrp_data); } return sk; @@ -1385,6 +1386,7 @@ static void sk_prot_free(struct proto *prot, struct sock *sk) owner = prot->owner; slab = prot->slab; + cgroup_sk_free(&sk->sk_cgrp_data); security_sk_free(sk); if (slab != NULL) kmem_cache_free(slab, sk); -- cgit v1.2.3-71-gd317 From ad2c8c73d29702c3193f739390f6661f9a4ecad9 Mon Sep 17 00:00:00 2001 From: Tejun Heo Date: Wed, 9 Dec 2015 12:30:46 -0500 Subject: cgroup: fix sock_cgroup_data initialization on earlier compilers MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit sock_cgroup_data is a struct containing an anonymous union. 
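(A minimal standalone reproduction of the problem described below, for readers who want to test the compiler difference outside the kernel tree. The type and field names here are illustrative only, and the behavior claims follow the gcc-4.4.7 errors quoted in this commit message; the fix is the extra brace pair around the designated initializer.

#include <stdint.h>

/* Illustrative mock-up of a struct wrapping an anonymous union. */
struct skcd_demo {
	union {
		struct {
			uint16_t prioidx;
			uint32_t classid;
		};
		uint64_t val;
	};
};

int main(void)
{
	/*
	 * Per the report below, gcc-4.4.7 rejects the single-brace form
	 * because the designator .val names a member one brace level
	 * down, inside the anonymous union:
	 *
	 *	struct skcd_demo a = { .val = 1 };
	 */

	/* The extra pair of braces initializes the anonymous union
	 * member explicitly, which old and new gcc both accept. */
	struct skcd_demo b = { { .val = 1 } };

	return (int)b.val;
}

)
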
sock_cgroup_set_prioidx() and sock_cgroup_set_classid() were initializing a field inside the anonymous union as follows. struct sock_ccgroup_data skcd_buf = { .val = VAL }; While this is fine on more recent compilers, gcc-4.4.7 triggers the following errors. include/linux/cgroup-defs.h: In function ‘sock_cgroup_set_prioidx’: include/linux/cgroup-defs.h:619: error: unknown field ‘val’ specified in initializer include/linux/cgroup-defs.h:619: warning: missing braces around initializer include/linux/cgroup-defs.h:619: warning: (near initialization for ‘skcd_buf.’) This is because .val belongs to the anonymous union nested inside the struct but the initializer is missing the nesting. Fix it by adding an extra pair of braces. Signed-off-by: Tejun Heo Reported-by: Alaa Hleihel Fixes: bd1060a1d671 ("sock, cgroup: add sock->sk_cgroup") Signed-off-by: David S. Miller --- include/linux/cgroup-defs.h | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) (limited to 'include/linux') diff --git a/include/linux/cgroup-defs.h b/include/linux/cgroup-defs.h index 9dc226345e4e..097901a68671 100644 --- a/include/linux/cgroup-defs.h +++ b/include/linux/cgroup-defs.h @@ -616,7 +616,7 @@ static inline u32 sock_cgroup_classid(struct sock_cgroup_data *skcd) static inline void sock_cgroup_set_prioidx(struct sock_cgroup_data *skcd, u16 prioidx) { - struct sock_cgroup_data skcd_buf = { .val = READ_ONCE(skcd->val) }; + struct sock_cgroup_data skcd_buf = {{ .val = READ_ONCE(skcd->val) }}; if (sock_cgroup_prioidx(&skcd_buf) == prioidx) return; @@ -633,7 +633,7 @@ static inline void sock_cgroup_set_prioidx(struct sock_cgroup_data *skcd, static inline void sock_cgroup_set_classid(struct sock_cgroup_data *skcd, u32 classid) { - struct sock_cgroup_data skcd_buf = { .val = READ_ONCE(skcd->val) }; + struct sock_cgroup_data skcd_buf = {{ .val = READ_ONCE(skcd->val) }}; if (sock_cgroup_classid(&skcd_buf) == classid) return; -- cgit v1.2.3-71-gd317 From 899077791403ff7a2d8cfaa87bd1a82d729463e2 Mon Sep 17 00:00:00 2001 From: Arnd Bergmann Date: Tue, 8 Dec 2015 16:32:27 +0100 Subject: netcp: try to reduce type confusion in descriptors The netcp driver produces tons of warnings when CONFIG_LPAE is enabled on ARM: drivers/net/ethernet/ti/netcp_core.c: In function 'netcp_tx_map_skb': drivers/net/ethernet/ti/netcp_core.c:1084:13: warning: passing argument 1 of 'set_words' from incompatible pointer type [-Wincompatible-pointer-types] This is the result of trying to pass a pointer to a dma_addr_t to a function that expects a u32 pointer to copy that into a DMA descriptor. Looking at that code in more detail to fix the warnings, I see multiple related problems: * The conversion functions are not endian-safe, as the DMA descriptors are almost certainly fixed-endian, but the CPU is not. * On 64-bit machines, passing a pointer through a u32 variable is a bug, accessing an indirect pointer as a u32 pointer even more so. * The handling of epib and psdata mixes native-endian and device-endian data. In this patch, I try to sort out the types for most accesses here, adding le32_to_cpu/cpu_to_le32 where appropriate, and passing pointers through two 32-bit words in the descriptor padding, to make it plausible that the driver does the right thing if compiled for big-endian or 64-bit systems. Signed-off-by: Arnd Bergmann Signed-off-by: David S. 
Miller --- drivers/net/ethernet/ti/netcp_core.c | 123 ++++++++++++++++++++--------------- include/linux/soc/ti/knav_dma.h | 22 +++---- 2 files changed, 82 insertions(+), 63 deletions(-) (limited to 'include/linux') diff --git a/drivers/net/ethernet/ti/netcp_core.c b/drivers/net/ethernet/ti/netcp_core.c index e5e20e734f21..eb2585e777e1 100644 --- a/drivers/net/ethernet/ti/netcp_core.c +++ b/drivers/net/ethernet/ti/netcp_core.c @@ -109,69 +109,80 @@ module_param(netcp_debug_level, int, 0); MODULE_PARM_DESC(netcp_debug_level, "Netcp debug level (NETIF_MSG bits) (0=none,...,16=all)"); /* Helper functions - Get/Set */ -static void get_pkt_info(u32 *buff, u32 *buff_len, u32 *ndesc, +static void get_pkt_info(dma_addr_t *buff, u32 *buff_len, dma_addr_t *ndesc, struct knav_dma_desc *desc) { - *buff_len = desc->buff_len; - *buff = desc->buff; - *ndesc = desc->next_desc; + *buff_len = le32_to_cpu(desc->buff_len); + *buff = le32_to_cpu(desc->buff); + *ndesc = le32_to_cpu(desc->next_desc); } -static void get_pad_info(u32 *pad0, u32 *pad1, struct knav_dma_desc *desc) +static void get_pad_info(u32 *pad0, u32 *pad1, u32 *pad2, struct knav_dma_desc *desc) { - *pad0 = desc->pad[0]; - *pad1 = desc->pad[1]; + *pad0 = le32_to_cpu(desc->pad[0]); + *pad1 = le32_to_cpu(desc->pad[1]); + *pad2 = le32_to_cpu(desc->pad[2]); } -static void get_org_pkt_info(u32 *buff, u32 *buff_len, +static void get_pad_ptr(void **padptr, struct knav_dma_desc *desc) +{ + u64 pad64; + + pad64 = le32_to_cpu(desc->pad[0]) + + ((u64)le32_to_cpu(desc->pad[1]) << 32); + *padptr = (void *)(uintptr_t)pad64; +} + +static void get_org_pkt_info(dma_addr_t *buff, u32 *buff_len, struct knav_dma_desc *desc) { - *buff = desc->orig_buff; - *buff_len = desc->orig_len; + *buff = le32_to_cpu(desc->orig_buff); + *buff_len = le32_to_cpu(desc->orig_len); } -static void get_words(u32 *words, int num_words, u32 *desc) +static void get_words(dma_addr_t *words, int num_words, __le32 *desc) { int i; for (i = 0; i < num_words; i++) - words[i] = desc[i]; + words[i] = le32_to_cpu(desc[i]); } -static void set_pkt_info(u32 buff, u32 buff_len, u32 ndesc, +static void set_pkt_info(dma_addr_t buff, u32 buff_len, u32 ndesc, struct knav_dma_desc *desc) { - desc->buff_len = buff_len; - desc->buff = buff; - desc->next_desc = ndesc; + desc->buff_len = cpu_to_le32(buff_len); + desc->buff = cpu_to_le32(buff); + desc->next_desc = cpu_to_le32(ndesc); } static void set_desc_info(u32 desc_info, u32 pkt_info, struct knav_dma_desc *desc) { - desc->desc_info = desc_info; - desc->packet_info = pkt_info; + desc->desc_info = cpu_to_le32(desc_info); + desc->packet_info = cpu_to_le32(pkt_info); } -static void set_pad_info(u32 pad0, u32 pad1, struct knav_dma_desc *desc) +static void set_pad_info(u32 pad0, u32 pad1, u32 pad2, struct knav_dma_desc *desc) { - desc->pad[0] = pad0; - desc->pad[1] = pad1; + desc->pad[0] = cpu_to_le32(pad0); + desc->pad[1] = cpu_to_le32(pad1); + desc->pad[2] = cpu_to_le32(pad1); } -static void set_org_pkt_info(u32 buff, u32 buff_len, +static void set_org_pkt_info(dma_addr_t buff, u32 buff_len, struct knav_dma_desc *desc) { - desc->orig_buff = buff; - desc->orig_len = buff_len; + desc->orig_buff = cpu_to_le32(buff); + desc->orig_len = cpu_to_le32(buff_len); } -static void set_words(u32 *words, int num_words, u32 *desc) +static void set_words(u32 *words, int num_words, __le32 *desc) { int i; for (i = 0; i < num_words; i++) - desc[i] = words[i]; + desc[i] = cpu_to_le32(words[i]); } /* Read the e-fuse value as 32 bit values to be endian independent */ @@ -570,7 
+581,7 @@ static void netcp_free_rx_desc_chain(struct netcp_intf *netcp, dma_addr_t dma_desc, dma_buf; unsigned int buf_len, dma_sz = sizeof(*ndesc); void *buf_ptr; - u32 tmp; + u32 pad[2]; get_words(&dma_desc, 1, &desc->next_desc); @@ -580,14 +591,15 @@ static void netcp_free_rx_desc_chain(struct netcp_intf *netcp, dev_err(netcp->ndev_dev, "failed to unmap Rx desc\n"); break; } - get_pkt_info(&dma_buf, &tmp, &dma_desc, ndesc); - get_pad_info((u32 *)&buf_ptr, &tmp, ndesc); + get_pad_ptr(&buf_ptr, ndesc); dma_unmap_page(netcp->dev, dma_buf, PAGE_SIZE, DMA_FROM_DEVICE); __free_page(buf_ptr); knav_pool_desc_put(netcp->rx_pool, desc); } - get_pad_info((u32 *)&buf_ptr, &buf_len, desc); + get_pad_info(&pad[0], &pad[1], &buf_len, desc); + buf_ptr = (void *)(uintptr_t)(pad[0] + ((u64)pad[1] << 32)); + if (buf_ptr) netcp_frag_free(buf_len <= PAGE_SIZE, buf_ptr); knav_pool_desc_put(netcp->rx_pool, desc); @@ -626,7 +638,6 @@ static int netcp_process_one_rx_packet(struct netcp_intf *netcp) struct netcp_packet p_info; struct sk_buff *skb; void *org_buf_ptr; - u32 tmp; dma_desc = knav_queue_pop(netcp->rx_queue, &dma_sz); if (!dma_desc) @@ -639,7 +650,7 @@ static int netcp_process_one_rx_packet(struct netcp_intf *netcp) } get_pkt_info(&dma_buff, &buf_len, &dma_desc, desc); - get_pad_info((u32 *)&org_buf_ptr, &org_buf_len, desc); + get_pad_ptr(&org_buf_ptr, desc); if (unlikely(!org_buf_ptr)) { dev_err(netcp->ndev_dev, "NULL bufptr in desc\n"); @@ -664,6 +675,7 @@ static int netcp_process_one_rx_packet(struct netcp_intf *netcp) /* Fill in the page fragment list */ while (dma_desc) { struct page *page; + void *ptr; ndesc = knav_pool_desc_unmap(netcp->rx_pool, dma_desc, dma_sz); if (unlikely(!ndesc)) { @@ -672,14 +684,15 @@ static int netcp_process_one_rx_packet(struct netcp_intf *netcp) } get_pkt_info(&dma_buff, &buf_len, &dma_desc, ndesc); - get_pad_info((u32 *)&page, &tmp, ndesc); + get_pad_ptr(ptr, ndesc); + page = ptr; if (likely(dma_buff && buf_len && page)) { dma_unmap_page(netcp->dev, dma_buff, PAGE_SIZE, DMA_FROM_DEVICE); } else { - dev_err(netcp->ndev_dev, "Bad Rx desc dma_buff(%p), len(%d), page(%p)\n", - (void *)dma_buff, buf_len, page); + dev_err(netcp->ndev_dev, "Bad Rx desc dma_buff(%pad), len(%d), page(%p)\n", + &dma_buff, buf_len, page); goto free_desc; } @@ -750,7 +763,6 @@ static void netcp_free_rx_buf(struct netcp_intf *netcp, int fdq) unsigned int buf_len, dma_sz; dma_addr_t dma; void *buf_ptr; - u32 tmp; /* Allocate descriptor */ while ((dma = knav_queue_pop(netcp->rx_fdq[fdq], &dma_sz))) { @@ -761,7 +773,7 @@ static void netcp_free_rx_buf(struct netcp_intf *netcp, int fdq) } get_org_pkt_info(&dma, &buf_len, desc); - get_pad_info((u32 *)&buf_ptr, &tmp, desc); + get_pad_ptr(buf_ptr, desc); if (unlikely(!dma)) { dev_err(netcp->ndev_dev, "NULL orig_buff in desc\n"); @@ -813,7 +825,7 @@ static int netcp_allocate_rx_buf(struct netcp_intf *netcp, int fdq) struct page *page; dma_addr_t dma; void *bufptr; - u32 pad[2]; + u32 pad[3]; /* Allocate descriptor */ hwdesc = knav_pool_desc_get(netcp->rx_pool); @@ -830,7 +842,7 @@ static int netcp_allocate_rx_buf(struct netcp_intf *netcp, int fdq) SKB_DATA_ALIGN(sizeof(struct skb_shared_info)); bufptr = netdev_alloc_frag(primary_buf_len); - pad[1] = primary_buf_len; + pad[2] = primary_buf_len; if (unlikely(!bufptr)) { dev_warn_ratelimited(netcp->ndev_dev, @@ -842,7 +854,8 @@ static int netcp_allocate_rx_buf(struct netcp_intf *netcp, int fdq) if (unlikely(dma_mapping_error(netcp->dev, dma))) goto fail; - pad[0] = (u32)bufptr; + pad[0] = 
lower_32_bits((uintptr_t)bufptr); + pad[1] = upper_32_bits((uintptr_t)bufptr); } else { /* Allocate a secondary receive queue entry */ @@ -853,8 +866,9 @@ static int netcp_allocate_rx_buf(struct netcp_intf *netcp, int fdq) } buf_len = PAGE_SIZE; dma = dma_map_page(netcp->dev, page, 0, buf_len, DMA_TO_DEVICE); - pad[0] = (u32)page; - pad[1] = 0; + pad[0] = lower_32_bits(dma); + pad[1] = upper_32_bits(dma); + pad[2] = 0; } desc_info = KNAV_DMA_DESC_PS_INFO_IN_DESC; @@ -864,7 +878,7 @@ static int netcp_allocate_rx_buf(struct netcp_intf *netcp, int fdq) pkt_info |= (netcp->rx_queue_id & KNAV_DMA_DESC_RETQ_MASK) << KNAV_DMA_DESC_RETQ_SHIFT; set_org_pkt_info(dma, buf_len, hwdesc); - set_pad_info(pad[0], pad[1], hwdesc); + set_pad_info(pad[0], pad[1], pad[2], hwdesc); set_desc_info(desc_info, pkt_info, hwdesc); /* Push to FDQs */ @@ -935,8 +949,8 @@ static void netcp_free_tx_desc_chain(struct netcp_intf *netcp, dma_unmap_single(netcp->dev, dma_buf, buf_len, DMA_TO_DEVICE); else - dev_warn(netcp->ndev_dev, "bad Tx desc buf(%p), len(%d)\n", - (void *)dma_buf, buf_len); + dev_warn(netcp->ndev_dev, "bad Tx desc buf(%pad), len(%d)\n", + &dma_buf, buf_len); knav_pool_desc_put(netcp->tx_pool, ndesc); ndesc = NULL; @@ -953,11 +967,11 @@ static int netcp_process_tx_compl_packets(struct netcp_intf *netcp, unsigned int budget) { struct knav_dma_desc *desc; + void *ptr; struct sk_buff *skb; unsigned int dma_sz; dma_addr_t dma; int pkts = 0; - u32 tmp; while (budget--) { dma = knav_queue_pop(netcp->tx_compl_q, &dma_sz); @@ -970,7 +984,8 @@ static int netcp_process_tx_compl_packets(struct netcp_intf *netcp, continue; } - get_pad_info((u32 *)&skb, &tmp, desc); + get_pad_ptr(&ptr, desc); + skb = ptr; netcp_free_tx_desc_chain(netcp, desc, dma_sz); if (!skb) { dev_err(netcp->ndev_dev, "No skb in Tx desc\n"); @@ -1059,6 +1074,7 @@ netcp_tx_map_skb(struct sk_buff *skb, struct netcp_intf *netcp) u32 page_offset = frag->page_offset; u32 buf_len = skb_frag_size(frag); dma_addr_t desc_dma; + u32 desc_dma_32; u32 pkt_info; dma_addr = dma_map_page(dev, page, page_offset, buf_len, @@ -1075,13 +1091,13 @@ netcp_tx_map_skb(struct sk_buff *skb, struct netcp_intf *netcp) goto free_descs; } - desc_dma = knav_pool_desc_virt_to_dma(netcp->tx_pool, - (void *)ndesc); + desc_dma = knav_pool_desc_virt_to_dma(netcp->tx_pool, ndesc); pkt_info = (netcp->tx_compl_qid & KNAV_DMA_DESC_RETQ_MASK) << KNAV_DMA_DESC_RETQ_SHIFT; set_pkt_info(dma_addr, buf_len, 0, ndesc); - set_words(&desc_dma, 1, &pdesc->next_desc); + desc_dma_32 = (u32)desc_dma; + set_words(&desc_dma_32, 1, &pdesc->next_desc); pkt_len += buf_len; if (pdesc != desc) knav_pool_desc_map(netcp->tx_pool, pdesc, @@ -1173,11 +1189,14 @@ static int netcp_tx_submit_skb(struct netcp_intf *netcp, } set_words(&tmp, 1, &desc->packet_info); - set_words((u32 *)&skb, 1, &desc->pad[0]); + tmp = lower_32_bits((uintptr_t)&skb); + set_words(&tmp, 1, &desc->pad[0]); + tmp = upper_32_bits((uintptr_t)&skb); + set_words(&tmp, 1, &desc->pad[1]); if (tx_pipe->flags & SWITCH_TO_PORT_IN_TAGINFO) { tmp = tx_pipe->switch_to_port; - set_words((u32 *)&tmp, 1, &desc->tag_info); + set_words(&tmp, 1, &desc->tag_info); } /* submit packet descriptor */ diff --git a/include/linux/soc/ti/knav_dma.h b/include/linux/soc/ti/knav_dma.h index dad035c16d94..343c13ac4f71 100644 --- a/include/linux/soc/ti/knav_dma.h +++ b/include/linux/soc/ti/knav_dma.h @@ -144,17 +144,17 @@ struct knav_dma_cfg { * @psdata: Protocol specific */ struct knav_dma_desc { - u32 desc_info; - u32 tag_info; - u32 packet_info; - u32 buff_len; - u32 
buff; - u32 next_desc; - u32 orig_len; - u32 orig_buff; - u32 epib[KNAV_DMA_NUM_EPIB_WORDS]; - u32 psdata[KNAV_DMA_NUM_PS_WORDS]; - u32 pad[4]; + __le32 desc_info; + __le32 tag_info; + __le32 packet_info; + __le32 buff_len; + __le32 buff; + __le32 next_desc; + __le32 orig_len; + __le32 orig_buff; + __le32 epib[KNAV_DMA_NUM_EPIB_WORDS]; + __le32 psdata[KNAV_DMA_NUM_PS_WORDS]; + __le32 pad[4]; } ____cacheline_aligned; #if IS_ENABLED(CONFIG_KEYSTONE_NAVIGATOR_DMA) -- cgit v1.2.3-71-gd317 From 26a8145390b36cbe97a5bd0b9e97249f21af6aea Mon Sep 17 00:00:00 2001 From: Maor Gottlieb Date: Thu, 10 Dec 2015 17:12:39 +0200 Subject: net/mlx5_core: Introduce flow steering firmware commands Introduce new Flow Steering (FS) firmware commands, in-order to support the new flow steering infrastructure. Signed-off-by: Maor Gottlieb Signed-off-by: Moni Shoua Signed-off-by: Matan Barak Signed-off-by: Saeed Mahameed Signed-off-by: David S. Miller --- drivers/net/ethernet/mellanox/mlx5/core/Makefile | 2 +- drivers/net/ethernet/mellanox/mlx5/core/fs_cmd.c | 239 ++++++++++++++++++++++ drivers/net/ethernet/mellanox/mlx5/core/fs_cmd.h | 65 ++++++ drivers/net/ethernet/mellanox/mlx5/core/fs_core.h | 81 ++++++++ include/linux/mlx5/fs.h | 47 +++++ include/linux/mlx5/mlx5_ifc.h | 32 ++- 6 files changed, 455 insertions(+), 11 deletions(-) create mode 100644 drivers/net/ethernet/mellanox/mlx5/core/fs_cmd.c create mode 100644 drivers/net/ethernet/mellanox/mlx5/core/fs_cmd.h create mode 100644 drivers/net/ethernet/mellanox/mlx5/core/fs_core.h create mode 100644 include/linux/mlx5/fs.h (limited to 'include/linux') diff --git a/drivers/net/ethernet/mellanox/mlx5/core/Makefile b/drivers/net/ethernet/mellanox/mlx5/core/Makefile index a0755919ccaf..be10592e0518 100644 --- a/drivers/net/ethernet/mellanox/mlx5/core/Makefile +++ b/drivers/net/ethernet/mellanox/mlx5/core/Makefile @@ -2,7 +2,7 @@ obj-$(CONFIG_MLX5_CORE) += mlx5_core.o mlx5_core-y := main.o cmd.o debugfs.o fw.o eq.o uar.o pagealloc.o \ health.o mcg.o cq.o srq.o alloc.o qp.o port.o mr.o pd.o \ - mad.o transobj.o vport.o sriov.o + mad.o transobj.o vport.o sriov.o fs_cmd.o mlx5_core-$(CONFIG_MLX5_CORE_EN) += wq.o flow_table.o eswitch.o \ en_main.o en_flow_table.o en_ethtool.o en_tx.o en_rx.o \ en_txrx.o diff --git a/drivers/net/ethernet/mellanox/mlx5/core/fs_cmd.c b/drivers/net/ethernet/mellanox/mlx5/core/fs_cmd.c new file mode 100644 index 000000000000..5096f4f336bd --- /dev/null +++ b/drivers/net/ethernet/mellanox/mlx5/core/fs_cmd.c @@ -0,0 +1,239 @@ +/* + * Copyright (c) 2015, Mellanox Technologies. All rights reserved. + * + * This software is available to you under a choice of one of two + * licenses. You may choose to be licensed under the terms of the GNU + * General Public License (GPL) Version 2, available from the file + * COPYING in the main directory of this source tree, or the + * OpenIB.org BSD license below: + * + * Redistribution and use in source and binary forms, with or + * without modification, are permitted provided that the following + * conditions are met: + * + * - Redistributions of source code must retain the above + * copyright notice, this list of conditions and the following + * disclaimer. + * + * - Redistributions in binary form must reproduce the above + * copyright notice, this list of conditions and the following + * disclaimer in the documentation and/or other materials + * provided with the distribution. 
+ * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, + * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF + * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND + * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS + * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN + * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN + * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. + */ + +#include +#include +#include + +#include "fs_core.h" +#include "fs_cmd.h" +#include "mlx5_core.h" + +int mlx5_cmd_create_flow_table(struct mlx5_core_dev *dev, + enum fs_flow_table_type type, unsigned int level, + unsigned int log_size, unsigned int *table_id) +{ + u32 out[MLX5_ST_SZ_DW(create_flow_table_out)]; + u32 in[MLX5_ST_SZ_DW(create_flow_table_in)]; + int err; + + memset(in, 0, sizeof(in)); + + MLX5_SET(create_flow_table_in, in, opcode, + MLX5_CMD_OP_CREATE_FLOW_TABLE); + + MLX5_SET(create_flow_table_in, in, table_type, type); + MLX5_SET(create_flow_table_in, in, level, level); + MLX5_SET(create_flow_table_in, in, log_size, log_size); + + memset(out, 0, sizeof(out)); + err = mlx5_cmd_exec_check_status(dev, in, sizeof(in), out, + sizeof(out)); + + if (!err) + *table_id = MLX5_GET(create_flow_table_out, out, + table_id); + return err; +} + +int mlx5_cmd_destroy_flow_table(struct mlx5_core_dev *dev, + struct mlx5_flow_table *ft) +{ + u32 in[MLX5_ST_SZ_DW(destroy_flow_table_in)]; + u32 out[MLX5_ST_SZ_DW(destroy_flow_table_out)]; + + memset(in, 0, sizeof(in)); + memset(out, 0, sizeof(out)); + + MLX5_SET(destroy_flow_table_in, in, opcode, + MLX5_CMD_OP_DESTROY_FLOW_TABLE); + MLX5_SET(destroy_flow_table_in, in, table_type, ft->type); + MLX5_SET(destroy_flow_table_in, in, table_id, ft->id); + + return mlx5_cmd_exec_check_status(dev, in, sizeof(in), out, + sizeof(out)); +} + +int mlx5_cmd_create_flow_group(struct mlx5_core_dev *dev, + struct mlx5_flow_table *ft, + u32 *in, + unsigned int *group_id) +{ + int inlen = MLX5_ST_SZ_BYTES(create_flow_group_in); + u32 out[MLX5_ST_SZ_DW(create_flow_group_out)]; + int err; + + memset(out, 0, sizeof(out)); + + MLX5_SET(create_flow_group_in, in, opcode, + MLX5_CMD_OP_CREATE_FLOW_GROUP); + MLX5_SET(create_flow_group_in, in, table_type, ft->type); + MLX5_SET(create_flow_group_in, in, table_id, ft->id); + + err = mlx5_cmd_exec_check_status(dev, in, + inlen, out, + sizeof(out)); + if (!err) + *group_id = MLX5_GET(create_flow_group_out, out, + group_id); + + return err; +} + +int mlx5_cmd_destroy_flow_group(struct mlx5_core_dev *dev, + struct mlx5_flow_table *ft, + unsigned int group_id) +{ + u32 out[MLX5_ST_SZ_DW(destroy_flow_group_out)]; + u32 in[MLX5_ST_SZ_DW(destroy_flow_group_in)]; + + memset(in, 0, sizeof(in)); + memset(out, 0, sizeof(out)); + + MLX5_SET(destroy_flow_group_in, in, opcode, + MLX5_CMD_OP_DESTROY_FLOW_GROUP); + MLX5_SET(destroy_flow_group_in, in, table_type, ft->type); + MLX5_SET(destroy_flow_group_in, in, table_id, ft->id); + MLX5_SET(destroy_flow_group_in, in, group_id, group_id); + + return mlx5_cmd_exec_check_status(dev, in, sizeof(in), out, + sizeof(out)); +} + +static int mlx5_cmd_set_fte(struct mlx5_core_dev *dev, + int opmod, int modify_mask, + struct mlx5_flow_table *ft, + unsigned group_id, + struct fs_fte *fte) +{ + unsigned int inlen = MLX5_ST_SZ_BYTES(set_fte_in) + + fte->dests_size * MLX5_ST_SZ_BYTES(dest_format_struct); + u32 out[MLX5_ST_SZ_DW(set_fte_out)]; + struct mlx5_flow_rule *dst; + void *in_flow_context; + void 
*in_match_value; + void *in_dests; + u32 *in; + int err; + + in = mlx5_vzalloc(inlen); + if (!in) { + mlx5_core_warn(dev, "failed to allocate inbox\n"); + return -ENOMEM; + } + + MLX5_SET(set_fte_in, in, opcode, MLX5_CMD_OP_SET_FLOW_TABLE_ENTRY); + MLX5_SET(set_fte_in, in, op_mod, opmod); + MLX5_SET(set_fte_in, in, modify_enable_mask, modify_mask); + MLX5_SET(set_fte_in, in, table_type, ft->type); + MLX5_SET(set_fte_in, in, table_id, ft->id); + MLX5_SET(set_fte_in, in, flow_index, fte->index); + + in_flow_context = MLX5_ADDR_OF(set_fte_in, in, flow_context); + MLX5_SET(flow_context, in_flow_context, group_id, group_id); + MLX5_SET(flow_context, in_flow_context, flow_tag, fte->flow_tag); + MLX5_SET(flow_context, in_flow_context, action, fte->action); + MLX5_SET(flow_context, in_flow_context, destination_list_size, + fte->dests_size); + in_match_value = MLX5_ADDR_OF(flow_context, in_flow_context, + match_value); + memcpy(in_match_value, &fte->val, MLX5_ST_SZ_BYTES(fte_match_param)); + + in_dests = MLX5_ADDR_OF(flow_context, in_flow_context, destination); + list_for_each_entry(dst, &fte->node.children, node.list) { + unsigned int id; + + MLX5_SET(dest_format_struct, in_dests, destination_type, + dst->dest_attr.type); + if (dst->dest_attr.type == + MLX5_FLOW_DESTINATION_TYPE_FLOW_TABLE) + id = dst->dest_attr.ft->id; + else + id = dst->dest_attr.tir_num; + MLX5_SET(dest_format_struct, in_dests, destination_id, id); + in_dests += MLX5_ST_SZ_BYTES(dest_format_struct); + } + memset(out, 0, sizeof(out)); + err = mlx5_cmd_exec_check_status(dev, in, inlen, out, + sizeof(out)); + kvfree(in); + + return err; +} + +int mlx5_cmd_create_fte(struct mlx5_core_dev *dev, + struct mlx5_flow_table *ft, + unsigned group_id, + struct fs_fte *fte) +{ + return mlx5_cmd_set_fte(dev, 0, 0, ft, group_id, fte); +} + +int mlx5_cmd_update_fte(struct mlx5_core_dev *dev, + struct mlx5_flow_table *ft, + unsigned group_id, + struct fs_fte *fte) +{ + int opmod; + int modify_mask; + int atomic_mod_cap = MLX5_CAP_FLOWTABLE(dev, + flow_table_properties_nic_receive. + flow_modify_en); + if (!atomic_mod_cap) + return -ENOTSUPP; + opmod = 1; + modify_mask = 1 << + MLX5_SET_FTE_MODIFY_ENABLE_MASK_DESTINATION_LIST; + + return mlx5_cmd_set_fte(dev, opmod, modify_mask, ft, group_id, fte); +} + +int mlx5_cmd_delete_fte(struct mlx5_core_dev *dev, + struct mlx5_flow_table *ft, + unsigned int index) +{ + u32 out[MLX5_ST_SZ_DW(delete_fte_out)]; + u32 in[MLX5_ST_SZ_DW(delete_fte_in)]; + int err; + + memset(in, 0, sizeof(in)); + memset(out, 0, sizeof(out)); + + MLX5_SET(delete_fte_in, in, opcode, MLX5_CMD_OP_DELETE_FLOW_TABLE_ENTRY); + MLX5_SET(delete_fte_in, in, table_type, ft->type); + MLX5_SET(delete_fte_in, in, table_id, ft->id); + MLX5_SET(delete_fte_in, in, flow_index, index); + + err = mlx5_cmd_exec_check_status(dev, in, sizeof(in), out, sizeof(out)); + + return err; +} diff --git a/drivers/net/ethernet/mellanox/mlx5/core/fs_cmd.h b/drivers/net/ethernet/mellanox/mlx5/core/fs_cmd.h new file mode 100644 index 000000000000..f39304ede186 --- /dev/null +++ b/drivers/net/ethernet/mellanox/mlx5/core/fs_cmd.h @@ -0,0 +1,65 @@ +/* + * Copyright (c) 2015, Mellanox Technologies. All rights reserved. + * + * This software is available to you under a choice of one of two + * licenses. 
You may choose to be licensed under the terms of the GNU + * General Public License (GPL) Version 2, available from the file + * COPYING in the main directory of this source tree, or the + * OpenIB.org BSD license below: + * + * Redistribution and use in source and binary forms, with or + * without modification, are permitted provided that the following + * conditions are met: + * + * - Redistributions of source code must retain the above + * copyright notice, this list of conditions and the following + * disclaimer. + * + * - Redistributions in binary form must reproduce the above + * copyright notice, this list of conditions and the following + * disclaimer in the documentation and/or other materials + * provided with the distribution. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, + * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF + * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND + * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS + * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN + * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN + * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. + */ + +#ifndef _MLX5_FS_CMD_ +#define _MLX5_FS_CMD_ + +int mlx5_cmd_create_flow_table(struct mlx5_core_dev *dev, + enum fs_flow_table_type type, unsigned int level, + unsigned int log_size, unsigned int *table_id); + +int mlx5_cmd_destroy_flow_table(struct mlx5_core_dev *dev, + struct mlx5_flow_table *ft); + +int mlx5_cmd_create_flow_group(struct mlx5_core_dev *dev, + struct mlx5_flow_table *ft, + u32 *in, unsigned int *group_id); + +int mlx5_cmd_destroy_flow_group(struct mlx5_core_dev *dev, + struct mlx5_flow_table *ft, + unsigned int group_id); + +int mlx5_cmd_create_fte(struct mlx5_core_dev *dev, + struct mlx5_flow_table *ft, + unsigned group_id, + struct fs_fte *fte); + +int mlx5_cmd_update_fte(struct mlx5_core_dev *dev, + struct mlx5_flow_table *ft, + unsigned group_id, + struct fs_fte *fte); + +int mlx5_cmd_delete_fte(struct mlx5_core_dev *dev, + struct mlx5_flow_table *ft, + unsigned int index); + +#endif diff --git a/drivers/net/ethernet/mellanox/mlx5/core/fs_core.h b/drivers/net/ethernet/mellanox/mlx5/core/fs_core.h new file mode 100644 index 000000000000..e8b34a9b147b --- /dev/null +++ b/drivers/net/ethernet/mellanox/mlx5/core/fs_core.h @@ -0,0 +1,81 @@ +/* + * Copyright (c) 2015, Mellanox Technologies. All rights reserved. + * + * This software is available to you under a choice of one of two + * licenses. You may choose to be licensed under the terms of the GNU + * General Public License (GPL) Version 2, available from the file + * COPYING in the main directory of this source tree, or the + * OpenIB.org BSD license below: + * + * Redistribution and use in source and binary forms, with or + * without modification, are permitted provided that the following + * conditions are met: + * + * - Redistributions of source code must retain the above + * copyright notice, this list of conditions and the following + * disclaimer. + * + * - Redistributions in binary form must reproduce the above + * copyright notice, this list of conditions and the following + * disclaimer in the documentation and/or other materials + * provided with the distribution. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, + * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF + * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND + * NONINFRINGEMENT. 
IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS + * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN + * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN + * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. + */ + +#ifndef _MLX5_FS_CORE_ +#define _MLX5_FS_CORE_ + +#include + +enum fs_node_type { + FS_TYPE_NAMESPACE, + FS_TYPE_PRIO, + FS_TYPE_FLOW_TABLE, + FS_TYPE_FLOW_GROUP, + FS_TYPE_FLOW_ENTRY, + FS_TYPE_FLOW_DEST +}; + +enum fs_flow_table_type { + FS_FT_NIC_RX = 0x0, +}; + +enum fs_fte_status { + FS_FTE_STATUS_EXISTING = 1UL << 0, +}; + +struct fs_node { + struct list_head list; + struct list_head children; + enum fs_node_type type; +}; + +struct mlx5_flow_rule { + struct fs_node node; + struct mlx5_flow_destination dest_attr; +}; + +struct mlx5_flow_table { + struct fs_node node; + u32 id; + enum fs_flow_table_type type; +}; + +struct fs_fte { + struct fs_node node; + u32 val[MLX5_ST_SZ_DW(fte_match_param)]; + u32 dests_size; + u32 flow_tag; + u32 index; + u32 action; +}; + +#endif diff --git a/include/linux/mlx5/fs.h b/include/linux/mlx5/fs.h new file mode 100644 index 000000000000..34fd8dc0b3e1 --- /dev/null +++ b/include/linux/mlx5/fs.h @@ -0,0 +1,47 @@ +/* + * Copyright (c) 2015, Mellanox Technologies. All rights reserved. + * + * This software is available to you under a choice of one of two + * licenses. You may choose to be licensed under the terms of the GNU + * General Public License (GPL) Version 2, available from the file + * COPYING in the main directory of this source tree, or the + * OpenIB.org BSD license below: + * + * Redistribution and use in source and binary forms, with or + * without modification, are permitted provided that the following + * conditions are met: + * + * - Redistributions of source code must retain the above + * copyright notice, this list of conditions and the following + * disclaimer. + * + * - Redistributions in binary form must reproduce the above + * copyright notice, this list of conditions and the following + * disclaimer in the documentation and/or other materials + * provided with the distribution. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, + * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF + * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND + * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS + * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN + * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN + * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. 
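
[The fs_core.h types above all embed a struct fs_node as their first member, so parent/child links stay generic and the concrete object is recovered with container_of(), exactly as mlx5_cmd_set_fte() does when it walks an FTE's destination rules. A condensed sketch of that convention, as a hypothetical debug helper:]

    /* Sketch: walk the mlx5_flow_rule children hanging off one fs_fte. */
    static void dump_fte_dests(struct fs_fte *fte)
    {
        struct mlx5_flow_rule *dst;

        list_for_each_entry(dst, &fte->node.children, node.list) {
            if (dst->dest_attr.type == MLX5_FLOW_DESTINATION_TYPE_FLOW_TABLE)
                pr_debug("fte %u -> table %u\n",
                         fte->index, dst->dest_attr.ft->id);
            else
                pr_debug("fte %u -> tir %u\n",
                         fte->index, dst->dest_attr.tir_num);
        }
    }
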
+ */ + +#ifndef _MLX5_FS_ +#define _MLX5_FS_ + +#include + +struct mlx5_flow_table; + +struct mlx5_flow_destination { + enum mlx5_flow_destination_type type; + union { + u32 tir_num; + struct mlx5_flow_table *ft; + }; +}; +#endif diff --git a/include/linux/mlx5/mlx5_ifc.h b/include/linux/mlx5/mlx5_ifc.h index f5d94495758a..131a2737cfa3 100644 --- a/include/linux/mlx5/mlx5_ifc.h +++ b/include/linux/mlx5/mlx5_ifc.h @@ -256,25 +256,27 @@ struct mlx5_ifc_flow_table_fields_supported_bits { struct mlx5_ifc_flow_table_prop_layout_bits { u8 ft_support[0x1]; - u8 reserved_0[0x1f]; + u8 reserved_0[0x2]; + u8 flow_modify_en[0x1]; + u8 reserved_1[0x1c]; - u8 reserved_1[0x2]; + u8 reserved_2[0x2]; u8 log_max_ft_size[0x6]; - u8 reserved_2[0x10]; + u8 reserved_3[0x10]; u8 max_ft_level[0x8]; - u8 reserved_3[0x20]; + u8 reserved_4[0x20]; - u8 reserved_4[0x18]; + u8 reserved_5[0x18]; u8 log_max_ft_num[0x8]; - u8 reserved_5[0x18]; + u8 reserved_6[0x18]; u8 log_max_destination[0x8]; - u8 reserved_6[0x18]; + u8 reserved_7[0x18]; u8 log_max_flow[0x8]; - u8 reserved_7[0x40]; + u8 reserved_8[0x40]; struct mlx5_ifc_flow_table_fields_supported_bits ft_field_support; @@ -2843,6 +2845,13 @@ struct mlx5_ifc_set_hca_cap_in_bits { union mlx5_ifc_hca_cap_union_bits capability; }; +enum { + MLX5_SET_FTE_MODIFY_ENABLE_MASK_ACTION = 0x0, + MLX5_SET_FTE_MODIFY_ENABLE_MASK_FLOW_TAG = 0x1, + MLX5_SET_FTE_MODIFY_ENABLE_MASK_DESTINATION_LIST = 0x2, + MLX5_SET_FTE_MODIFY_ENABLE_MASK_FLOW_COUNTERS = 0x3 +}; + struct mlx5_ifc_set_fte_out_bits { u8 status[0x8]; u8 reserved_0[0x18]; @@ -2867,11 +2876,14 @@ struct mlx5_ifc_set_fte_in_bits { u8 reserved_4[0x8]; u8 table_id[0x18]; - u8 reserved_5[0x40]; + u8 reserved_5[0x18]; + u8 modify_enable_mask[0x8]; + + u8 reserved_6[0x20]; u8 flow_index[0x20]; - u8 reserved_6[0xe0]; + u8 reserved_7[0xe0]; struct mlx5_ifc_flow_context_bits flow_context; }; -- cgit v1.2.3-71-gd317 From 2530236303d9e705db6a28eb9a10c8d79b288b37 Mon Sep 17 00:00:00 2001 From: Maor Gottlieb Date: Thu, 10 Dec 2015 17:12:43 +0200 Subject: net/mlx5_core: Flow steering tree initialization Flow steering initialization is based on a static tree that describes the flow steering layout when the driver is loaded. The initialization considers the maximum flow table level supported by the device; a minimum of two kernel flow tables (vlan and mac) is required for kernel flow table functionality. The tree structure when the driver is loaded: root_namespace(receive nic) | priority-0 (kernel priority) | namespace(kernel namespace) | priority-0 (flow tables priority) In the following patches, when the EN driver starts using the flow steering API, it creates two flow tables and their flow groups under priority-0 (flow tables priority). Signed-off-by: Maor Gottlieb Signed-off-by: Moni Shoua Signed-off-by: Matan Barak Signed-off-by: Saeed Mahameed Signed-off-by: David S. Miller --- drivers/net/ethernet/mellanox/mlx5/core/fs_core.c | 374 ++++++++++++++++++++++ drivers/net/ethernet/mellanox/mlx5/core/fs_core.h | 4 + include/linux/mlx5/driver.h | 2 + include/linux/mlx5/fs.h | 8 + 4 files changed, 388 insertions(+) (limited to 'include/linux') diff --git a/drivers/net/ethernet/mellanox/mlx5/core/fs_core.c b/drivers/net/ethernet/mellanox/mlx5/core/fs_core.c index 1828351102c5..4264e8b34b76 100644 --- a/drivers/net/ethernet/mellanox/mlx5/core/fs_core.c +++ b/drivers/net/ethernet/mellanox/mlx5/core/fs_core.c @@ -37,6 +37,54 @@ #include "fs_core.h" #include "fs_cmd.h" +#define INIT_TREE_NODE_ARRAY_SIZE(...)
(sizeof((struct init_tree_node[]){__VA_ARGS__}) /\ + sizeof(struct init_tree_node)) + +#define INIT_PRIO(min_level_val, max_ft_val,\ + start_level_val, ...) {.type = FS_TYPE_PRIO,\ + .min_ft_level = min_level_val,\ + .start_level = start_level_val,\ + .max_ft = max_ft_val,\ + .children = (struct init_tree_node[]) {__VA_ARGS__},\ + .ar_size = INIT_TREE_NODE_ARRAY_SIZE(__VA_ARGS__) \ +} + +#define ADD_PRIO(min_level_val, max_ft_val, start_level_val, ...)\ + INIT_PRIO(min_level_val, max_ft_val, start_level_val,\ + __VA_ARGS__)\ + +#define ADD_FT_PRIO(max_ft_val, start_level_val, ...)\ + INIT_PRIO(0, max_ft_val, start_level_val,\ + __VA_ARGS__)\ + +#define ADD_NS(...) {.type = FS_TYPE_NAMESPACE,\ + .children = (struct init_tree_node[]) {__VA_ARGS__},\ + .ar_size = INIT_TREE_NODE_ARRAY_SIZE(__VA_ARGS__) \ +} + +#define KERNEL_START_LEVEL 0 +#define KERNEL_P0_START_LEVEL KERNEL_START_LEVEL +#define KERNEL_MAX_FT 2 +#define KENREL_MIN_LEVEL 2 +static struct init_tree_node { + enum fs_node_type type; + struct init_tree_node *children; + int ar_size; + int min_ft_level; + int prio; + int max_ft; + int start_level; +} root_fs = { + .type = FS_TYPE_NAMESPACE, + .ar_size = 1, + .children = (struct init_tree_node[]) { + ADD_PRIO(KENREL_MIN_LEVEL, KERNEL_MAX_FT, + KERNEL_START_LEVEL, + ADD_NS(ADD_FT_PRIO(KERNEL_MAX_FT, + KERNEL_P0_START_LEVEL))), + } +}; + static void del_rule(struct fs_node *node); static void del_flow_table(struct fs_node *node); static void del_flow_group(struct fs_node *node); @@ -671,3 +719,329 @@ static void mlx5_destroy_flow_group(struct mlx5_flow_group *fg) mlx5_core_warn(get_dev(&fg->node), "Flow group %d wasn't destroyed, refcount > 1\n", fg->id); } + +static struct mlx5_flow_namespace *mlx5_get_flow_namespace(struct mlx5_core_dev *dev, + enum mlx5_flow_namespace_type type) +{ + struct mlx5_flow_root_namespace *root_ns = dev->priv.root_ns; + int prio; + static struct fs_prio *fs_prio; + struct mlx5_flow_namespace *ns; + + if (!root_ns) + return NULL; + + switch (type) { + case MLX5_FLOW_NAMESPACE_KERNEL: + prio = 0; + break; + case MLX5_FLOW_NAMESPACE_FDB: + if (dev->priv.fdb_root_ns) + return &dev->priv.fdb_root_ns->ns; + else + return NULL; + default: + return NULL; + } + + fs_prio = find_prio(&root_ns->ns, prio); + if (!fs_prio) + return NULL; + + ns = list_first_entry(&fs_prio->node.children, + typeof(*ns), + node.list); + + return ns; +} + +static struct fs_prio *fs_create_prio(struct mlx5_flow_namespace *ns, + unsigned prio, int max_ft, + int start_level) +{ + struct fs_prio *fs_prio; + + fs_prio = kzalloc(sizeof(*fs_prio), GFP_KERNEL); + if (!fs_prio) + return ERR_PTR(-ENOMEM); + + fs_prio->node.type = FS_TYPE_PRIO; + tree_init_node(&fs_prio->node, 1, NULL); + tree_add_node(&fs_prio->node, &ns->node); + fs_prio->max_ft = max_ft; + fs_prio->prio = prio; + fs_prio->start_level = start_level; + list_add_tail(&fs_prio->node.list, &ns->node.children); + + return fs_prio; +} + +static struct mlx5_flow_namespace *fs_init_namespace(struct mlx5_flow_namespace + *ns) +{ + ns->node.type = FS_TYPE_NAMESPACE; + + return ns; +} + +static struct mlx5_flow_namespace *fs_create_namespace(struct fs_prio *prio) +{ + struct mlx5_flow_namespace *ns; + + ns = kzalloc(sizeof(*ns), GFP_KERNEL); + if (!ns) + return ERR_PTR(-ENOMEM); + + fs_init_namespace(ns); + tree_init_node(&ns->node, 1, NULL); + tree_add_node(&ns->node, &prio->node); + list_add_tail(&ns->node.list, &prio->node.children); + + return ns; +} + +static int init_root_tree_recursive(int max_ft_level, struct init_tree_node 
*init_node, + struct fs_node *fs_parent_node, + struct init_tree_node *init_parent_node, + int index) +{ + struct mlx5_flow_namespace *fs_ns; + struct fs_prio *fs_prio; + struct fs_node *base; + int i; + int err; + + if (init_node->type == FS_TYPE_PRIO) { + if (init_node->min_ft_level > max_ft_level) + return -ENOTSUPP; + + fs_get_obj(fs_ns, fs_parent_node); + fs_prio = fs_create_prio(fs_ns, index, init_node->max_ft, + init_node->start_level); + if (IS_ERR(fs_prio)) + return PTR_ERR(fs_prio); + base = &fs_prio->node; + } else if (init_node->type == FS_TYPE_NAMESPACE) { + fs_get_obj(fs_prio, fs_parent_node); + fs_ns = fs_create_namespace(fs_prio); + if (IS_ERR(fs_ns)) + return PTR_ERR(fs_ns); + base = &fs_ns->node; + } else { + return -EINVAL; + } + for (i = 0; i < init_node->ar_size; i++) { + err = init_root_tree_recursive(max_ft_level, + &init_node->children[i], base, + init_node, i); + if (err) + return err; + } + + return 0; +} + +static int init_root_tree(int max_ft_level, struct init_tree_node *init_node, + struct fs_node *fs_parent_node) +{ + int i; + struct mlx5_flow_namespace *fs_ns; + int err; + + fs_get_obj(fs_ns, fs_parent_node); + for (i = 0; i < init_node->ar_size; i++) { + err = init_root_tree_recursive(max_ft_level, + &init_node->children[i], + &fs_ns->node, + init_node, i); + if (err) + return err; + } + return 0; +} + +static struct mlx5_flow_root_namespace *create_root_ns(struct mlx5_core_dev *dev, + enum fs_flow_table_type + table_type) +{ + struct mlx5_flow_root_namespace *root_ns; + struct mlx5_flow_namespace *ns; + + /* create the root namespace */ + root_ns = mlx5_vzalloc(sizeof(*root_ns)); + if (!root_ns) + return NULL; + + root_ns->dev = dev; + root_ns->table_type = table_type; + + ns = &root_ns->ns; + fs_init_namespace(ns); + tree_init_node(&ns->node, 1, NULL); + tree_add_node(&ns->node, NULL); + + return root_ns; +} + +static int init_root_ns(struct mlx5_core_dev *dev) +{ + int max_ft_level = MLX5_CAP_FLOWTABLE(dev, + flow_table_properties_nic_receive. 
+ max_ft_level); + + dev->priv.root_ns = create_root_ns(dev, FS_FT_NIC_RX); + if (IS_ERR_OR_NULL(dev->priv.root_ns)) + goto cleanup; + + if (init_root_tree(max_ft_level, &root_fs, &dev->priv.root_ns->ns.node)) + goto cleanup; + + return 0; + +cleanup: + mlx5_cleanup_fs(dev); + return -ENOMEM; +} + +static void cleanup_single_prio_root_ns(struct mlx5_core_dev *dev, + struct mlx5_flow_root_namespace *root_ns) +{ + struct fs_node *prio; + + if (!root_ns) + return; + + if (!list_empty(&root_ns->ns.node.children)) { + prio = list_first_entry(&root_ns->ns.node.children, + struct fs_node, + list); + if (tree_remove_node(prio)) + mlx5_core_warn(dev, + "Flow steering priority wasn't destroyed, refcount > 1\n"); + } + if (tree_remove_node(&root_ns->ns.node)) + mlx5_core_warn(dev, + "Flow steering namespace wasn't destroyed, refcount > 1\n"); + root_ns = NULL; +} + +static void cleanup_root_ns(struct mlx5_core_dev *dev) +{ + struct mlx5_flow_root_namespace *root_ns = dev->priv.root_ns; + struct fs_prio *iter_prio; + + if (!MLX5_CAP_GEN(dev, nic_flow_table)) + return; + + if (!root_ns) + return; + + /* stage 1 */ + fs_for_each_prio(iter_prio, &root_ns->ns) { + struct fs_node *node; + struct mlx5_flow_namespace *iter_ns; + + fs_for_each_ns_or_ft(node, iter_prio) { + if (node->type == FS_TYPE_FLOW_TABLE) + continue; + fs_get_obj(iter_ns, node); + while (!list_empty(&iter_ns->node.children)) { + struct fs_prio *obj_iter_prio2; + struct fs_node *iter_prio2 = + list_first_entry(&iter_ns->node.children, + struct fs_node, + list); + + fs_get_obj(obj_iter_prio2, iter_prio2); + if (tree_remove_node(iter_prio2)) { + mlx5_core_warn(dev, + "Priority %d wasn't destroyed, refcount > 1\n", + obj_iter_prio2->prio); + return; + } + } + } + } + + /* stage 2 */ + fs_for_each_prio(iter_prio, &root_ns->ns) { + while (!list_empty(&iter_prio->node.children)) { + struct fs_node *iter_ns = + list_first_entry(&iter_prio->node.children, + struct fs_node, + list); + if (tree_remove_node(iter_ns)) { + mlx5_core_warn(dev, + "Namespace wasn't destroyed, refcount > 1\n"); + return; + } + } + } + + /* stage 3 */ + while (!list_empty(&root_ns->ns.node.children)) { + struct fs_prio *obj_prio_node; + struct fs_node *prio_node = + list_first_entry(&root_ns->ns.node.children, + struct fs_node, + list); + + fs_get_obj(obj_prio_node, prio_node); + if (tree_remove_node(prio_node)) { + mlx5_core_warn(dev, + "Priority %d wasn't destroyed, refcount > 1\n", + obj_prio_node->prio); + return; + } + } + + if (tree_remove_node(&root_ns->ns.node)) { + mlx5_core_warn(dev, + "root namespace wasn't destroyed, refcount > 1\n"); + return; + } + + dev->priv.root_ns = NULL; +} + +void mlx5_cleanup_fs(struct mlx5_core_dev *dev) +{ + cleanup_root_ns(dev); + cleanup_single_prio_root_ns(dev, dev->priv.fdb_root_ns); +} + +static int init_fdb_root_ns(struct mlx5_core_dev *dev) +{ + struct fs_prio *prio; + + dev->priv.fdb_root_ns = create_root_ns(dev, FS_FT_FDB); + if (!dev->priv.fdb_root_ns) + return -ENOMEM; + + /* create 1 prio*/ + prio = fs_create_prio(&dev->priv.fdb_root_ns->ns, 0, 1, 0); + if (IS_ERR(prio)) { + cleanup_single_prio_root_ns(dev, dev->priv.fdb_root_ns); + return PTR_ERR(prio); + } else { + return 0; + } +} + +int mlx5_init_fs(struct mlx5_core_dev *dev) +{ + int err = 0; + + if (MLX5_CAP_GEN(dev, nic_flow_table)) { + err = init_root_ns(dev); + if (err) + return err; + } + if (MLX5_CAP_GEN(dev, eswitch_flow_table)) { + err = init_fdb_root_ns(dev); + if (err) + cleanup_root_ns(dev); + } + + return err; +} diff --git 
a/drivers/net/ethernet/mellanox/mlx5/core/fs_core.h b/drivers/net/ethernet/mellanox/mlx5/core/fs_core.h index 6c27b8ef42b7..4ebb97fd5544 100644 --- a/drivers/net/ethernet/mellanox/mlx5/core/fs_core.h +++ b/drivers/net/ethernet/mellanox/mlx5/core/fs_core.h @@ -46,6 +46,7 @@ enum fs_node_type { enum fs_flow_table_type { FS_FT_NIC_RX = 0x0, + FS_FT_FDB = 0X4, }; enum fs_fte_status { @@ -125,6 +126,9 @@ struct mlx5_flow_root_namespace { struct mlx5_core_dev *dev; }; +int mlx5_init_fs(struct mlx5_core_dev *dev); +void mlx5_cleanup_fs(struct mlx5_core_dev *dev); + #define fs_get_obj(v, _node) {v = container_of((_node), typeof(*v), node); } #define fs_list_for_each_entry(pos, root) \ diff --git a/include/linux/mlx5/driver.h b/include/linux/mlx5/driver.h index ac098b6b97bf..2fd7019f69db 100644 --- a/include/linux/mlx5/driver.h +++ b/include/linux/mlx5/driver.h @@ -502,6 +502,8 @@ struct mlx5_priv { struct mlx5_eswitch *eswitch; struct mlx5_core_sriov sriov; unsigned long pci_dev_data; + struct mlx5_flow_root_namespace *root_ns; + struct mlx5_flow_root_namespace *fdb_root_ns; }; enum mlx5_device_state { diff --git a/include/linux/mlx5/fs.h b/include/linux/mlx5/fs.h index 34fd8dc0b3e1..16ae5233dc7b 100644 --- a/include/linux/mlx5/fs.h +++ b/include/linux/mlx5/fs.h @@ -35,6 +35,13 @@ #include +#define MLX5_FS_DEFAULT_FLOW_TAG 0x0 + +enum mlx5_flow_namespace_type { + MLX5_FLOW_NAMESPACE_KERNEL, + MLX5_FLOW_NAMESPACE_FDB, +}; + struct mlx5_flow_table; struct mlx5_flow_destination { @@ -42,6 +49,7 @@ struct mlx5_flow_destination { union { u32 tir_num; struct mlx5_flow_table *ft; + u32 vport_num; }; }; #endif -- cgit v1.2.3-71-gd317 From 86d722ad2c3bd2f0536b196b7fd67ae2a7e2a492 Mon Sep 17 00:00:00 2001 From: Maor Gottlieb Date: Thu, 10 Dec 2015 17:12:44 +0200 Subject: net/mlx5: Use flow steering infrastructure for mlx5_en Expose the new flow steering API and remove the old one. A few changes are required: 1. The Ethernet flow steering follows the existing implementation, but uses the new steering API. The old flow steering implementation is removed. 2. Move the E-switch FDB management to use the new API. 3. When the driver is loaded, call mlx5_init_fs, which initializes the flow steering tree structure and opens namespaces for NIC receive and for the E-switch FDB. 4. Call mlx5_cleanup_fs when the driver is unloaded. Signed-off-by: Maor Gottlieb Signed-off-by: Moni Shoua Signed-off-by: Matan Barak Signed-off-by: Saeed Mahameed Signed-off-by: David S.
Miller --- drivers/net/ethernet/mellanox/mlx5/core/Makefile | 2 +- drivers/net/ethernet/mellanox/mlx5/core/en.h | 23 +- .../ethernet/mellanox/mlx5/core/en_flow_table.c | 824 +++++++++++++-------- drivers/net/ethernet/mellanox/mlx5/core/en_main.c | 2 +- drivers/net/ethernet/mellanox/mlx5/core/eswitch.c | 291 ++------ drivers/net/ethernet/mellanox/mlx5/core/eswitch.h | 15 +- .../net/ethernet/mellanox/mlx5/core/flow_table.c | 422 ----------- drivers/net/ethernet/mellanox/mlx5/core/fs_core.c | 26 +- drivers/net/ethernet/mellanox/mlx5/core/main.c | 9 + include/linux/mlx5/flow_table.h | 63 -- include/linux/mlx5/fs.h | 38 + 11 files changed, 633 insertions(+), 1082 deletions(-) delete mode 100644 drivers/net/ethernet/mellanox/mlx5/core/flow_table.c delete mode 100644 include/linux/mlx5/flow_table.h (limited to 'include/linux') diff --git a/drivers/net/ethernet/mellanox/mlx5/core/Makefile b/drivers/net/ethernet/mellanox/mlx5/core/Makefile index 7fc5e2388dec..11ee062965c5 100644 --- a/drivers/net/ethernet/mellanox/mlx5/core/Makefile +++ b/drivers/net/ethernet/mellanox/mlx5/core/Makefile @@ -3,6 +3,6 @@ obj-$(CONFIG_MLX5_CORE) += mlx5_core.o mlx5_core-y := main.o cmd.o debugfs.o fw.o eq.o uar.o pagealloc.o \ health.o mcg.o cq.o srq.o alloc.o qp.o port.o mr.o pd.o \ mad.o transobj.o vport.o sriov.o fs_cmd.o fs_core.o -mlx5_core-$(CONFIG_MLX5_CORE_EN) += wq.o flow_table.o eswitch.o \ +mlx5_core-$(CONFIG_MLX5_CORE_EN) += wq.o eswitch.o \ en_main.o en_flow_table.o en_ethtool.o en_tx.o en_rx.o \ en_txrx.o diff --git a/drivers/net/ethernet/mellanox/mlx5/core/en.h b/drivers/net/ethernet/mellanox/mlx5/core/en.h index 89313d46952d..f689ce580b44 100644 --- a/drivers/net/ethernet/mellanox/mlx5/core/en.h +++ b/drivers/net/ethernet/mellanox/mlx5/core/en.h @@ -64,6 +64,8 @@ #define MLX5E_UPDATE_STATS_INTERVAL 200 /* msecs */ #define MLX5E_SQ_BF_BUDGET 16 +#define MLX5E_NUM_MAIN_GROUPS 9 + static const char vport_strings[][ETH_GSTRING_LEN] = { /* vport statistics */ "rx_packets", @@ -442,7 +444,7 @@ enum mlx5e_rqt_ix { struct mlx5e_eth_addr_info { u8 addr[ETH_ALEN + 2]; u32 tt_vec; - u32 ft_ix[MLX5E_NUM_TT]; /* flow table index per traffic type */ + struct mlx5_flow_rule *ft_rule[MLX5E_NUM_TT]; }; #define MLX5E_ETH_ADDR_HASH_SIZE (1 << BITS_PER_BYTE) @@ -466,15 +468,22 @@ enum { struct mlx5e_vlan_db { unsigned long active_vlans[BITS_TO_LONGS(VLAN_N_VID)]; - u32 active_vlans_ft_ix[VLAN_N_VID]; - u32 untagged_rule_ft_ix; - u32 any_vlan_rule_ft_ix; + struct mlx5_flow_rule *active_vlans_rule[VLAN_N_VID]; + struct mlx5_flow_rule *untagged_rule; + struct mlx5_flow_rule *any_vlan_rule; bool filter_disabled; }; struct mlx5e_flow_table { - void *vlan; - void *main; + int num_groups; + struct mlx5_flow_table *t; + struct mlx5_flow_group **g; +}; + +struct mlx5e_flow_tables { + struct mlx5_flow_namespace *ns; + struct mlx5e_flow_table vlan; + struct mlx5e_flow_table main; }; struct mlx5e_priv { @@ -497,7 +506,7 @@ struct mlx5e_priv { u32 rqtn[MLX5E_NUM_RQT]; u32 tirn[MLX5E_NUM_TT]; - struct mlx5e_flow_table ft; + struct mlx5e_flow_tables fts; struct mlx5e_eth_addr_db eth_addr; struct mlx5e_vlan_db vlan; diff --git a/drivers/net/ethernet/mellanox/mlx5/core/en_flow_table.c b/drivers/net/ethernet/mellanox/mlx5/core/en_flow_table.c index 5b93c9c6e341..80d81abc4820 100644 --- a/drivers/net/ethernet/mellanox/mlx5/core/en_flow_table.c +++ b/drivers/net/ethernet/mellanox/mlx5/core/en_flow_table.c @@ -34,9 +34,11 @@ #include #include #include -#include +#include #include "en.h" +#define MLX5_SET_CFG(p, f, v) 
MLX5_SET(create_flow_group_in, p, f, v) + enum { MLX5E_FULLMATCH = 0, MLX5E_ALLMULTI = 1, @@ -103,44 +105,38 @@ static void mlx5e_del_eth_addr_from_hash(struct mlx5e_eth_addr_hash_node *hn) static void mlx5e_del_eth_addr_from_flow_table(struct mlx5e_priv *priv, struct mlx5e_eth_addr_info *ai) { - void *ft = priv->ft.main; - if (ai->tt_vec & BIT(MLX5E_TT_IPV6_IPSEC_ESP)) - mlx5_del_flow_table_entry(ft, - ai->ft_ix[MLX5E_TT_IPV6_IPSEC_ESP]); + mlx5_del_flow_rule(ai->ft_rule[MLX5E_TT_IPV6_IPSEC_ESP]); if (ai->tt_vec & BIT(MLX5E_TT_IPV4_IPSEC_ESP)) - mlx5_del_flow_table_entry(ft, - ai->ft_ix[MLX5E_TT_IPV4_IPSEC_ESP]); + mlx5_del_flow_rule(ai->ft_rule[MLX5E_TT_IPV4_IPSEC_ESP]); if (ai->tt_vec & BIT(MLX5E_TT_IPV6_IPSEC_AH)) - mlx5_del_flow_table_entry(ft, - ai->ft_ix[MLX5E_TT_IPV6_IPSEC_AH]); + mlx5_del_flow_rule(ai->ft_rule[MLX5E_TT_IPV6_IPSEC_AH]); if (ai->tt_vec & BIT(MLX5E_TT_IPV4_IPSEC_AH)) - mlx5_del_flow_table_entry(ft, - ai->ft_ix[MLX5E_TT_IPV4_IPSEC_AH]); + mlx5_del_flow_rule(ai->ft_rule[MLX5E_TT_IPV4_IPSEC_AH]); if (ai->tt_vec & BIT(MLX5E_TT_IPV6_TCP)) - mlx5_del_flow_table_entry(ft, ai->ft_ix[MLX5E_TT_IPV6_TCP]); + mlx5_del_flow_rule(ai->ft_rule[MLX5E_TT_IPV6_TCP]); if (ai->tt_vec & BIT(MLX5E_TT_IPV4_TCP)) - mlx5_del_flow_table_entry(ft, ai->ft_ix[MLX5E_TT_IPV4_TCP]); + mlx5_del_flow_rule(ai->ft_rule[MLX5E_TT_IPV4_TCP]); if (ai->tt_vec & BIT(MLX5E_TT_IPV6_UDP)) - mlx5_del_flow_table_entry(ft, ai->ft_ix[MLX5E_TT_IPV6_UDP]); + mlx5_del_flow_rule(ai->ft_rule[MLX5E_TT_IPV6_UDP]); if (ai->tt_vec & BIT(MLX5E_TT_IPV4_UDP)) - mlx5_del_flow_table_entry(ft, ai->ft_ix[MLX5E_TT_IPV4_UDP]); + mlx5_del_flow_rule(ai->ft_rule[MLX5E_TT_IPV4_UDP]); if (ai->tt_vec & BIT(MLX5E_TT_IPV6)) - mlx5_del_flow_table_entry(ft, ai->ft_ix[MLX5E_TT_IPV6]); + mlx5_del_flow_rule(ai->ft_rule[MLX5E_TT_IPV6]); if (ai->tt_vec & BIT(MLX5E_TT_IPV4)) - mlx5_del_flow_table_entry(ft, ai->ft_ix[MLX5E_TT_IPV4]); + mlx5_del_flow_rule(ai->ft_rule[MLX5E_TT_IPV4]); if (ai->tt_vec & BIT(MLX5E_TT_ANY)) - mlx5_del_flow_table_entry(ft, ai->ft_ix[MLX5E_TT_ANY]); + mlx5_del_flow_rule(ai->ft_rule[MLX5E_TT_ANY]); } static int mlx5e_get_eth_addr_type(u8 *addr) @@ -240,44 +236,34 @@ static u32 mlx5e_get_tt_vec(struct mlx5e_eth_addr_info *ai, int type) } static int __mlx5e_add_eth_addr_rule(struct mlx5e_priv *priv, - struct mlx5e_eth_addr_info *ai, int type, - void *flow_context, void *match_criteria) + struct mlx5e_eth_addr_info *ai, + int type, u32 *mc, u32 *mv) { + struct mlx5_flow_destination dest; u8 match_criteria_enable = 0; - void *match_value; - void *dest; - u8 *dmac; - u8 *match_criteria_dmac; - void *ft = priv->ft.main; - u32 *tirn = priv->tirn; - u32 *ft_ix; - u32 tt_vec; - int err; - - match_value = MLX5_ADDR_OF(flow_context, flow_context, match_value); - dmac = MLX5_ADDR_OF(fte_match_param, match_value, - outer_headers.dmac_47_16); - match_criteria_dmac = MLX5_ADDR_OF(fte_match_param, match_criteria, - outer_headers.dmac_47_16); - dest = MLX5_ADDR_OF(flow_context, flow_context, destination); - - MLX5_SET(flow_context, flow_context, action, - MLX5_FLOW_CONTEXT_ACTION_FWD_DEST); - MLX5_SET(flow_context, flow_context, destination_list_size, 1); - MLX5_SET(dest_format_struct, dest, destination_type, - MLX5_FLOW_CONTEXT_DEST_TYPE_TIR); + struct mlx5_flow_rule **rule_p; + struct mlx5_flow_table *ft = priv->fts.main.t; + u8 *mc_dmac = MLX5_ADDR_OF(fte_match_param, mc, + outer_headers.dmac_47_16); + u8 *mv_dmac = MLX5_ADDR_OF(fte_match_param, mv, + outer_headers.dmac_47_16); + u32 *tirn = priv->tirn; + u32 tt_vec; + int err = 0; + + 
dest.type = MLX5_FLOW_DESTINATION_TYPE_TIR; switch (type) { case MLX5E_FULLMATCH: match_criteria_enable = MLX5_MATCH_OUTER_HEADERS; - memset(match_criteria_dmac, 0xff, ETH_ALEN); - ether_addr_copy(dmac, ai->addr); + eth_broadcast_addr(mc_dmac); + ether_addr_copy(mv_dmac, ai->addr); break; case MLX5E_ALLMULTI: match_criteria_enable = MLX5_MATCH_OUTER_HEADERS; - match_criteria_dmac[0] = 0x01; - dmac[0] = 0x01; + mc_dmac[0] = 0x01; + mv_dmac[0] = 0x01; break; case MLX5E_PROMISC: @@ -286,190 +272,165 @@ static int __mlx5e_add_eth_addr_rule(struct mlx5e_priv *priv, tt_vec = mlx5e_get_tt_vec(ai, type); - ft_ix = &ai->ft_ix[MLX5E_TT_ANY]; if (tt_vec & BIT(MLX5E_TT_ANY)) { - MLX5_SET(dest_format_struct, dest, destination_id, - tirn[MLX5E_TT_ANY]); - err = mlx5_add_flow_table_entry(ft, match_criteria_enable, - match_criteria, flow_context, - ft_ix); - if (err) + rule_p = &ai->ft_rule[MLX5E_TT_ANY]; + dest.tir_num = tirn[MLX5E_TT_ANY]; + *rule_p = mlx5_add_flow_rule(ft, match_criteria_enable, mc, mv, + MLX5_FLOW_CONTEXT_ACTION_FWD_DEST, + MLX5_FS_DEFAULT_FLOW_TAG, &dest); + if (IS_ERR_OR_NULL(*rule_p)) goto err_del_ai; - ai->tt_vec |= BIT(MLX5E_TT_ANY); } match_criteria_enable = MLX5_MATCH_OUTER_HEADERS; - MLX5_SET_TO_ONES(fte_match_param, match_criteria, - outer_headers.ethertype); + MLX5_SET_TO_ONES(fte_match_param, mc, outer_headers.ethertype); - ft_ix = &ai->ft_ix[MLX5E_TT_IPV4]; if (tt_vec & BIT(MLX5E_TT_IPV4)) { - MLX5_SET(fte_match_param, match_value, outer_headers.ethertype, + rule_p = &ai->ft_rule[MLX5E_TT_IPV4]; + dest.tir_num = tirn[MLX5E_TT_IPV4]; + MLX5_SET(fte_match_param, mv, outer_headers.ethertype, ETH_P_IP); - MLX5_SET(dest_format_struct, dest, destination_id, - tirn[MLX5E_TT_IPV4]); - err = mlx5_add_flow_table_entry(ft, match_criteria_enable, - match_criteria, flow_context, - ft_ix); - if (err) + *rule_p = mlx5_add_flow_rule(ft, match_criteria_enable, mc, mv, + MLX5_FLOW_CONTEXT_ACTION_FWD_DEST, + MLX5_FS_DEFAULT_FLOW_TAG, &dest); + if (IS_ERR_OR_NULL(*rule_p)) goto err_del_ai; - ai->tt_vec |= BIT(MLX5E_TT_IPV4); } - ft_ix = &ai->ft_ix[MLX5E_TT_IPV6]; if (tt_vec & BIT(MLX5E_TT_IPV6)) { - MLX5_SET(fte_match_param, match_value, outer_headers.ethertype, + rule_p = &ai->ft_rule[MLX5E_TT_IPV6]; + dest.tir_num = tirn[MLX5E_TT_IPV6]; + MLX5_SET(fte_match_param, mv, outer_headers.ethertype, ETH_P_IPV6); - MLX5_SET(dest_format_struct, dest, destination_id, - tirn[MLX5E_TT_IPV6]); - err = mlx5_add_flow_table_entry(ft, match_criteria_enable, - match_criteria, flow_context, - ft_ix); - if (err) + *rule_p = mlx5_add_flow_rule(ft, match_criteria_enable, mc, mv, + MLX5_FLOW_CONTEXT_ACTION_FWD_DEST, + MLX5_FS_DEFAULT_FLOW_TAG, &dest); + if (IS_ERR_OR_NULL(*rule_p)) goto err_del_ai; - ai->tt_vec |= BIT(MLX5E_TT_IPV6); } - MLX5_SET_TO_ONES(fte_match_param, match_criteria, - outer_headers.ip_protocol); - MLX5_SET(fte_match_param, match_value, outer_headers.ip_protocol, - IPPROTO_UDP); + MLX5_SET_TO_ONES(fte_match_param, mc, outer_headers.ip_protocol); + MLX5_SET(fte_match_param, mv, outer_headers.ip_protocol, IPPROTO_UDP); - ft_ix = &ai->ft_ix[MLX5E_TT_IPV4_UDP]; if (tt_vec & BIT(MLX5E_TT_IPV4_UDP)) { - MLX5_SET(fte_match_param, match_value, outer_headers.ethertype, + rule_p = &ai->ft_rule[MLX5E_TT_IPV4_UDP]; + dest.tir_num = tirn[MLX5E_TT_IPV4_UDP]; + MLX5_SET(fte_match_param, mv, outer_headers.ethertype, ETH_P_IP); - MLX5_SET(dest_format_struct, dest, destination_id, - tirn[MLX5E_TT_IPV4_UDP]); - err = mlx5_add_flow_table_entry(ft, match_criteria_enable, - match_criteria, flow_context, - ft_ix); - 
if (err) + *rule_p = mlx5_add_flow_rule(ft, match_criteria_enable, mc, mv, + MLX5_FLOW_CONTEXT_ACTION_FWD_DEST, + MLX5_FS_DEFAULT_FLOW_TAG, &dest); + if (IS_ERR_OR_NULL(*rule_p)) goto err_del_ai; - ai->tt_vec |= BIT(MLX5E_TT_IPV4_UDP); } - ft_ix = &ai->ft_ix[MLX5E_TT_IPV6_UDP]; if (tt_vec & BIT(MLX5E_TT_IPV6_UDP)) { - MLX5_SET(fte_match_param, match_value, outer_headers.ethertype, + rule_p = &ai->ft_rule[MLX5E_TT_IPV6_UDP]; + dest.tir_num = tirn[MLX5E_TT_IPV6_UDP]; + MLX5_SET(fte_match_param, mv, outer_headers.ethertype, ETH_P_IPV6); - MLX5_SET(dest_format_struct, dest, destination_id, - tirn[MLX5E_TT_IPV6_UDP]); - err = mlx5_add_flow_table_entry(ft, match_criteria_enable, - match_criteria, flow_context, - ft_ix); - if (err) + *rule_p = mlx5_add_flow_rule(ft, match_criteria_enable, mc, mv, + MLX5_FLOW_CONTEXT_ACTION_FWD_DEST, + MLX5_FS_DEFAULT_FLOW_TAG, &dest); + if (IS_ERR_OR_NULL(*rule_p)) goto err_del_ai; - ai->tt_vec |= BIT(MLX5E_TT_IPV6_UDP); } - MLX5_SET(fte_match_param, match_value, outer_headers.ip_protocol, - IPPROTO_TCP); + MLX5_SET(fte_match_param, mv, outer_headers.ip_protocol, IPPROTO_TCP); - ft_ix = &ai->ft_ix[MLX5E_TT_IPV4_TCP]; if (tt_vec & BIT(MLX5E_TT_IPV4_TCP)) { - MLX5_SET(fte_match_param, match_value, outer_headers.ethertype, + rule_p = &ai->ft_rule[MLX5E_TT_IPV4_TCP]; + dest.tir_num = tirn[MLX5E_TT_IPV4_TCP]; + MLX5_SET(fte_match_param, mv, outer_headers.ethertype, ETH_P_IP); - MLX5_SET(dest_format_struct, dest, destination_id, - tirn[MLX5E_TT_IPV4_TCP]); - err = mlx5_add_flow_table_entry(ft, match_criteria_enable, - match_criteria, flow_context, - ft_ix); - if (err) + *rule_p = mlx5_add_flow_rule(ft, match_criteria_enable, mc, mv, + MLX5_FLOW_CONTEXT_ACTION_FWD_DEST, + MLX5_FS_DEFAULT_FLOW_TAG, &dest); + if (IS_ERR_OR_NULL(*rule_p)) goto err_del_ai; - ai->tt_vec |= BIT(MLX5E_TT_IPV4_TCP); } - ft_ix = &ai->ft_ix[MLX5E_TT_IPV6_TCP]; if (tt_vec & BIT(MLX5E_TT_IPV6_TCP)) { - MLX5_SET(fte_match_param, match_value, outer_headers.ethertype, + rule_p = &ai->ft_rule[MLX5E_TT_IPV6_TCP]; + dest.tir_num = tirn[MLX5E_TT_IPV6_TCP]; + MLX5_SET(fte_match_param, mv, outer_headers.ethertype, ETH_P_IPV6); - MLX5_SET(dest_format_struct, dest, destination_id, - tirn[MLX5E_TT_IPV6_TCP]); - err = mlx5_add_flow_table_entry(ft, match_criteria_enable, - match_criteria, flow_context, - ft_ix); - if (err) + *rule_p = mlx5_add_flow_rule(ft, match_criteria_enable, mc, mv, + MLX5_FLOW_CONTEXT_ACTION_FWD_DEST, + MLX5_FS_DEFAULT_FLOW_TAG, &dest); + if (IS_ERR_OR_NULL(*rule_p)) goto err_del_ai; ai->tt_vec |= BIT(MLX5E_TT_IPV6_TCP); } - MLX5_SET(fte_match_param, match_value, outer_headers.ip_protocol, - IPPROTO_AH); + MLX5_SET(fte_match_param, mv, outer_headers.ip_protocol, IPPROTO_AH); - ft_ix = &ai->ft_ix[MLX5E_TT_IPV4_IPSEC_AH]; if (tt_vec & BIT(MLX5E_TT_IPV4_IPSEC_AH)) { - MLX5_SET(fte_match_param, match_value, outer_headers.ethertype, + rule_p = &ai->ft_rule[MLX5E_TT_IPV4_IPSEC_AH]; + dest.tir_num = tirn[MLX5E_TT_IPV4_IPSEC_AH]; + MLX5_SET(fte_match_param, mv, outer_headers.ethertype, ETH_P_IP); - MLX5_SET(dest_format_struct, dest, destination_id, - tirn[MLX5E_TT_IPV4_IPSEC_AH]); - err = mlx5_add_flow_table_entry(ft, match_criteria_enable, - match_criteria, flow_context, - ft_ix); - if (err) + *rule_p = mlx5_add_flow_rule(ft, match_criteria_enable, mc, mv, + MLX5_FLOW_CONTEXT_ACTION_FWD_DEST, + MLX5_FS_DEFAULT_FLOW_TAG, &dest); + if (IS_ERR_OR_NULL(*rule_p)) goto err_del_ai; - ai->tt_vec |= BIT(MLX5E_TT_IPV4_IPSEC_AH); } - ft_ix = &ai->ft_ix[MLX5E_TT_IPV6_IPSEC_AH]; if (tt_vec & 
BIT(MLX5E_TT_IPV6_IPSEC_AH)) { - MLX5_SET(fte_match_param, match_value, outer_headers.ethertype, + rule_p = &ai->ft_rule[MLX5E_TT_IPV6_IPSEC_AH]; + dest.tir_num = tirn[MLX5E_TT_IPV6_IPSEC_AH]; + MLX5_SET(fte_match_param, mv, outer_headers.ethertype, ETH_P_IPV6); - MLX5_SET(dest_format_struct, dest, destination_id, - tirn[MLX5E_TT_IPV6_IPSEC_AH]); - err = mlx5_add_flow_table_entry(ft, match_criteria_enable, - match_criteria, flow_context, - ft_ix); - if (err) + *rule_p = mlx5_add_flow_rule(ft, match_criteria_enable, mc, mv, + MLX5_FLOW_CONTEXT_ACTION_FWD_DEST, + MLX5_FS_DEFAULT_FLOW_TAG, &dest); + if (IS_ERR_OR_NULL(*rule_p)) goto err_del_ai; - ai->tt_vec |= BIT(MLX5E_TT_IPV6_IPSEC_AH); } - MLX5_SET(fte_match_param, match_value, outer_headers.ip_protocol, - IPPROTO_ESP); + MLX5_SET(fte_match_param, mv, outer_headers.ip_protocol, IPPROTO_ESP); - ft_ix = &ai->ft_ix[MLX5E_TT_IPV4_IPSEC_ESP]; if (tt_vec & BIT(MLX5E_TT_IPV4_IPSEC_ESP)) { - MLX5_SET(fte_match_param, match_value, outer_headers.ethertype, + rule_p = &ai->ft_rule[MLX5E_TT_IPV4_IPSEC_ESP]; + dest.tir_num = tirn[MLX5E_TT_IPV4_IPSEC_ESP]; + MLX5_SET(fte_match_param, mv, outer_headers.ethertype, ETH_P_IP); - MLX5_SET(dest_format_struct, dest, destination_id, - tirn[MLX5E_TT_IPV4_IPSEC_ESP]); - err = mlx5_add_flow_table_entry(ft, match_criteria_enable, - match_criteria, flow_context, - ft_ix); - if (err) + *rule_p = mlx5_add_flow_rule(ft, match_criteria_enable, mc, mv, + MLX5_FLOW_CONTEXT_ACTION_FWD_DEST, + MLX5_FS_DEFAULT_FLOW_TAG, &dest); + if (IS_ERR_OR_NULL(*rule_p)) goto err_del_ai; - ai->tt_vec |= BIT(MLX5E_TT_IPV4_IPSEC_ESP); } - ft_ix = &ai->ft_ix[MLX5E_TT_IPV6_IPSEC_ESP]; if (tt_vec & BIT(MLX5E_TT_IPV6_IPSEC_ESP)) { - MLX5_SET(fte_match_param, match_value, outer_headers.ethertype, + rule_p = &ai->ft_rule[MLX5E_TT_IPV6_IPSEC_ESP]; + dest.tir_num = tirn[MLX5E_TT_IPV6_IPSEC_ESP]; + MLX5_SET(fte_match_param, mv, outer_headers.ethertype, ETH_P_IPV6); - MLX5_SET(dest_format_struct, dest, destination_id, - tirn[MLX5E_TT_IPV6_IPSEC_ESP]); - err = mlx5_add_flow_table_entry(ft, match_criteria_enable, - match_criteria, flow_context, - ft_ix); - if (err) + *rule_p = mlx5_add_flow_rule(ft, match_criteria_enable, mc, mv, + MLX5_FLOW_CONTEXT_ACTION_FWD_DEST, + MLX5_FS_DEFAULT_FLOW_TAG, &dest); + if (IS_ERR_OR_NULL(*rule_p)) goto err_del_ai; - ai->tt_vec |= BIT(MLX5E_TT_IPV6_IPSEC_ESP); } return 0; err_del_ai: + err = PTR_ERR(*rule_p); + *rule_p = NULL; mlx5e_del_eth_addr_from_flow_table(priv, ai); return err; @@ -478,27 +439,25 @@ err_del_ai: static int mlx5e_add_eth_addr_rule(struct mlx5e_priv *priv, struct mlx5e_eth_addr_info *ai, int type) { - u32 *flow_context; u32 *match_criteria; - int err; + u32 *match_value; + int err = 0; - flow_context = mlx5_vzalloc(MLX5_ST_SZ_BYTES(flow_context) + - MLX5_ST_SZ_BYTES(dest_format_struct)); - match_criteria = mlx5_vzalloc(MLX5_ST_SZ_BYTES(fte_match_param)); - if (!flow_context || !match_criteria) { + match_value = mlx5_vzalloc(MLX5_ST_SZ_BYTES(fte_match_param)); + match_criteria = mlx5_vzalloc(MLX5_ST_SZ_BYTES(fte_match_param)); + if (!match_value || !match_criteria) { netdev_err(priv->netdev, "%s: alloc failed\n", __func__); err = -ENOMEM; goto add_eth_addr_rule_out; } - err = __mlx5e_add_eth_addr_rule(priv, ai, type, flow_context, - match_criteria); - if (err) - netdev_err(priv->netdev, "%s: failed\n", __func__); + err = __mlx5e_add_eth_addr_rule(priv, ai, type, match_criteria, + match_value); add_eth_addr_rule_out: kvfree(match_criteria); - kvfree(flow_context); + kvfree(match_value); + return err; 
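
[The conversion in these hunks settles on one calling convention throughout: allocate separate match-criteria and match-value blobs of fte_match_param size, program both, and hand them to mlx5_add_flow_rule() together with the action, the flow tag and a single destination. Stripped to its core, the pattern looks like the sketch below; ft and tirn are assumed to be a valid flow table and TIR number:]

    u32 *mc = mlx5_vzalloc(MLX5_ST_SZ_BYTES(fte_match_param));
    u32 *mv = mlx5_vzalloc(MLX5_ST_SZ_BYTES(fte_match_param));
    struct mlx5_flow_destination dest;
    struct mlx5_flow_rule *rule;
    int err = 0;

    if (!mc || !mv) {
        err = -ENOMEM;
        goto out;
    }

    dest.type = MLX5_FLOW_DESTINATION_TYPE_TIR;
    dest.tir_num = tirn;

    /* match all IPv4 frames and forward them to the TIR */
    MLX5_SET_TO_ONES(fte_match_param, mc, outer_headers.ethertype);
    MLX5_SET(fte_match_param, mv, outer_headers.ethertype, ETH_P_IP);

    rule = mlx5_add_flow_rule(ft, MLX5_MATCH_OUTER_HEADERS, mc, mv,
                              MLX5_FLOW_CONTEXT_ACTION_FWD_DEST,
                              MLX5_FS_DEFAULT_FLOW_TAG, &dest);
    if (IS_ERR(rule))
        err = PTR_ERR(rule);
out:
    kvfree(mc);        /* kvfree(NULL) is a no-op */
    kvfree(mv);
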
} @@ -551,72 +510,77 @@ enum mlx5e_vlan_rule_type { MLX5E_VLAN_RULE_TYPE_MATCH_VID, }; -static int mlx5e_add_vlan_rule(struct mlx5e_priv *priv, - enum mlx5e_vlan_rule_type rule_type, u16 vid) +static int __mlx5e_add_vlan_rule(struct mlx5e_priv *priv, + enum mlx5e_vlan_rule_type rule_type, + u16 vid, u32 *mc, u32 *mv) { + struct mlx5_flow_table *ft = priv->fts.vlan.t; + struct mlx5_flow_destination dest; u8 match_criteria_enable = 0; - u32 *flow_context; - void *match_value; - void *dest; - u32 *match_criteria; - u32 *ft_ix; - int err; + struct mlx5_flow_rule **rule_p; + int err = 0; - flow_context = mlx5_vzalloc(MLX5_ST_SZ_BYTES(flow_context) + - MLX5_ST_SZ_BYTES(dest_format_struct)); - match_criteria = mlx5_vzalloc(MLX5_ST_SZ_BYTES(fte_match_param)); - if (!flow_context || !match_criteria) { - netdev_err(priv->netdev, "%s: alloc failed\n", __func__); - err = -ENOMEM; - goto add_vlan_rule_out; - } - match_value = MLX5_ADDR_OF(flow_context, flow_context, match_value); - dest = MLX5_ADDR_OF(flow_context, flow_context, destination); - - MLX5_SET(flow_context, flow_context, action, - MLX5_FLOW_CONTEXT_ACTION_FWD_DEST); - MLX5_SET(flow_context, flow_context, destination_list_size, 1); - MLX5_SET(dest_format_struct, dest, destination_type, - MLX5_FLOW_CONTEXT_DEST_TYPE_FLOW_TABLE); - MLX5_SET(dest_format_struct, dest, destination_id, - mlx5_get_flow_table_id(priv->ft.main)); + dest.type = MLX5_FLOW_DESTINATION_TYPE_FLOW_TABLE; + dest.ft = priv->fts.main.t; match_criteria_enable = MLX5_MATCH_OUTER_HEADERS; - MLX5_SET_TO_ONES(fte_match_param, match_criteria, - outer_headers.vlan_tag); + MLX5_SET_TO_ONES(fte_match_param, mc, outer_headers.vlan_tag); switch (rule_type) { case MLX5E_VLAN_RULE_TYPE_UNTAGGED: - ft_ix = &priv->vlan.untagged_rule_ft_ix; + rule_p = &priv->vlan.untagged_rule; break; case MLX5E_VLAN_RULE_TYPE_ANY_VID: - ft_ix = &priv->vlan.any_vlan_rule_ft_ix; - MLX5_SET(fte_match_param, match_value, outer_headers.vlan_tag, - 1); + rule_p = &priv->vlan.any_vlan_rule; + MLX5_SET(fte_match_param, mv, outer_headers.vlan_tag, 1); break; default: /* MLX5E_VLAN_RULE_TYPE_MATCH_VID */ - err = mlx5e_vport_context_update_vlans(priv); - if (err) - goto add_vlan_rule_out; - - ft_ix = &priv->vlan.active_vlans_ft_ix[vid]; - MLX5_SET(fte_match_param, match_value, outer_headers.vlan_tag, - 1); - MLX5_SET_TO_ONES(fte_match_param, match_criteria, - outer_headers.first_vid); - MLX5_SET(fte_match_param, match_value, outer_headers.first_vid, - vid); + rule_p = &priv->vlan.active_vlans_rule[vid]; + MLX5_SET(fte_match_param, mv, outer_headers.vlan_tag, 1); + MLX5_SET_TO_ONES(fte_match_param, mc, outer_headers.first_vid); + MLX5_SET(fte_match_param, mv, outer_headers.first_vid, vid); break; } - err = mlx5_add_flow_table_entry(priv->ft.vlan, match_criteria_enable, - match_criteria, flow_context, ft_ix); - if (err) - netdev_err(priv->netdev, "%s: failed\n", __func__); + *rule_p = mlx5_add_flow_rule(ft, match_criteria_enable, mc, mv, + MLX5_FLOW_CONTEXT_ACTION_FWD_DEST, + MLX5_FS_DEFAULT_FLOW_TAG, + &dest); + + if (IS_ERR(*rule_p)) { + err = PTR_ERR(*rule_p); + *rule_p = NULL; + netdev_err(priv->netdev, "%s: add rule failed\n", __func__); + } + + return err; +} + +static int mlx5e_add_vlan_rule(struct mlx5e_priv *priv, + enum mlx5e_vlan_rule_type rule_type, u16 vid) +{ + u32 *match_criteria; + u32 *match_value; + int err = 0; + + match_value = mlx5_vzalloc(MLX5_ST_SZ_BYTES(fte_match_param)); + match_criteria = mlx5_vzalloc(MLX5_ST_SZ_BYTES(fte_match_param)); + if (!match_value || !match_criteria) { + 
netdev_err(priv->netdev, "%s: alloc failed\n", __func__); + err = -ENOMEM; + goto add_vlan_rule_out; + } + + if (rule_type == MLX5E_VLAN_RULE_TYPE_MATCH_VID) + mlx5e_vport_context_update_vlans(priv); + + err = __mlx5e_add_vlan_rule(priv, rule_type, vid, match_criteria, + match_value); add_vlan_rule_out: kvfree(match_criteria); - kvfree(flow_context); + kvfree(match_value); + return err; } @@ -625,16 +589,23 @@ static void mlx5e_del_vlan_rule(struct mlx5e_priv *priv, { switch (rule_type) { case MLX5E_VLAN_RULE_TYPE_UNTAGGED: - mlx5_del_flow_table_entry(priv->ft.vlan, - priv->vlan.untagged_rule_ft_ix); + if (priv->vlan.untagged_rule) { + mlx5_del_flow_rule(priv->vlan.untagged_rule); + priv->vlan.untagged_rule = NULL; + } break; case MLX5E_VLAN_RULE_TYPE_ANY_VID: - mlx5_del_flow_table_entry(priv->ft.vlan, - priv->vlan.any_vlan_rule_ft_ix); + if (priv->vlan.any_vlan_rule) { + mlx5_del_flow_rule(priv->vlan.any_vlan_rule); + priv->vlan.any_vlan_rule = NULL; + } break; case MLX5E_VLAN_RULE_TYPE_MATCH_VID: - mlx5_del_flow_table_entry(priv->ft.vlan, - priv->vlan.active_vlans_ft_ix[vid]); + mlx5e_vport_context_update_vlans(priv); + if (priv->vlan.active_vlans_rule[vid]) { + mlx5_del_flow_rule(priv->vlan.active_vlans_rule[vid]); + priv->vlan.active_vlans_rule[vid] = NULL; + } mlx5e_vport_context_update_vlans(priv); break; } @@ -889,151 +860,358 @@ void mlx5e_set_rx_mode_work(struct work_struct *work) mlx5e_vport_context_update(priv); } +static void mlx5e_destroy_groups(struct mlx5e_flow_table *ft) +{ + int i; + + for (i = ft->num_groups - 1; i >= 0; i--) { + if (!IS_ERR_OR_NULL(ft->g[i])) + mlx5_destroy_flow_group(ft->g[i]); + ft->g[i] = NULL; + } + ft->num_groups = 0; +} + void mlx5e_init_eth_addr(struct mlx5e_priv *priv) { ether_addr_copy(priv->eth_addr.broadcast.addr, priv->netdev->broadcast); } -static int mlx5e_create_main_flow_table(struct mlx5e_priv *priv) +#define MLX5E_MAIN_GROUP0_SIZE BIT(3) +#define MLX5E_MAIN_GROUP1_SIZE BIT(1) +#define MLX5E_MAIN_GROUP2_SIZE BIT(0) +#define MLX5E_MAIN_GROUP3_SIZE BIT(14) +#define MLX5E_MAIN_GROUP4_SIZE BIT(13) +#define MLX5E_MAIN_GROUP5_SIZE BIT(11) +#define MLX5E_MAIN_GROUP6_SIZE BIT(2) +#define MLX5E_MAIN_GROUP7_SIZE BIT(1) +#define MLX5E_MAIN_GROUP8_SIZE BIT(0) +#define MLX5E_MAIN_TABLE_SIZE (MLX5E_MAIN_GROUP0_SIZE +\ + MLX5E_MAIN_GROUP1_SIZE +\ + MLX5E_MAIN_GROUP2_SIZE +\ + MLX5E_MAIN_GROUP3_SIZE +\ + MLX5E_MAIN_GROUP4_SIZE +\ + MLX5E_MAIN_GROUP5_SIZE +\ + MLX5E_MAIN_GROUP6_SIZE +\ + MLX5E_MAIN_GROUP7_SIZE +\ + MLX5E_MAIN_GROUP8_SIZE) + +static int __mlx5e_create_main_groups(struct mlx5e_flow_table *ft, u32 *in, + int inlen) { - struct mlx5_flow_table_group *g; - u8 *dmac; + u8 *mc = MLX5_ADDR_OF(create_flow_group_in, in, match_criteria); + u8 *dmac = MLX5_ADDR_OF(create_flow_group_in, in, + match_criteria.outer_headers.dmac_47_16); + int err; + int ix = 0; + + memset(in, 0, inlen); + MLX5_SET_CFG(in, match_criteria_enable, MLX5_MATCH_OUTER_HEADERS); + MLX5_SET_TO_ONES(fte_match_param, mc, outer_headers.ethertype); + MLX5_SET_TO_ONES(fte_match_param, mc, outer_headers.ip_protocol); + MLX5_SET_CFG(in, start_flow_index, ix); + ix += MLX5E_MAIN_GROUP0_SIZE; + MLX5_SET_CFG(in, end_flow_index, ix - 1); + ft->g[ft->num_groups] = mlx5_create_flow_group(ft->t, in); + if (IS_ERR(ft->g[ft->num_groups])) + goto err_destroy_groups; + ft->num_groups++; + + memset(in, 0, inlen); + MLX5_SET_CFG(in, match_criteria_enable, MLX5_MATCH_OUTER_HEADERS); + MLX5_SET_TO_ONES(fte_match_param, mc, outer_headers.ethertype); + MLX5_SET_CFG(in, start_flow_index, ix); + ix += 
MLX5E_MAIN_GROUP1_SIZE; + MLX5_SET_CFG(in, end_flow_index, ix - 1); + ft->g[ft->num_groups] = mlx5_create_flow_group(ft->t, in); + if (IS_ERR(ft->g[ft->num_groups])) + goto err_destroy_groups; + ft->num_groups++; + + memset(in, 0, inlen); + MLX5_SET_CFG(in, start_flow_index, ix); + ix += MLX5E_MAIN_GROUP2_SIZE; + MLX5_SET_CFG(in, end_flow_index, ix - 1); + ft->g[ft->num_groups] = mlx5_create_flow_group(ft->t, in); + if (IS_ERR(ft->g[ft->num_groups])) + goto err_destroy_groups; + ft->num_groups++; + + memset(in, 0, inlen); + MLX5_SET_CFG(in, match_criteria_enable, MLX5_MATCH_OUTER_HEADERS); + MLX5_SET_TO_ONES(fte_match_param, mc, outer_headers.ethertype); + MLX5_SET_TO_ONES(fte_match_param, mc, outer_headers.ip_protocol); + eth_broadcast_addr(dmac); + MLX5_SET_CFG(in, start_flow_index, ix); + ix += MLX5E_MAIN_GROUP3_SIZE; + MLX5_SET_CFG(in, end_flow_index, ix - 1); + ft->g[ft->num_groups] = mlx5_create_flow_group(ft->t, in); + if (IS_ERR(ft->g[ft->num_groups])) + goto err_destroy_groups; + ft->num_groups++; + + memset(in, 0, inlen); + MLX5_SET_CFG(in, match_criteria_enable, MLX5_MATCH_OUTER_HEADERS); + MLX5_SET_TO_ONES(fte_match_param, mc, outer_headers.ethertype); + eth_broadcast_addr(dmac); + MLX5_SET_CFG(in, start_flow_index, ix); + ix += MLX5E_MAIN_GROUP4_SIZE; + MLX5_SET_CFG(in, end_flow_index, ix - 1); + ft->g[ft->num_groups] = mlx5_create_flow_group(ft->t, in); + if (IS_ERR(ft->g[ft->num_groups])) + goto err_destroy_groups; + ft->num_groups++; + + memset(in, 0, inlen); + MLX5_SET_CFG(in, match_criteria_enable, MLX5_MATCH_OUTER_HEADERS); + eth_broadcast_addr(dmac); + MLX5_SET_CFG(in, start_flow_index, ix); + ix += MLX5E_MAIN_GROUP5_SIZE; + MLX5_SET_CFG(in, end_flow_index, ix - 1); + ft->g[ft->num_groups] = mlx5_create_flow_group(ft->t, in); + if (IS_ERR(ft->g[ft->num_groups])) + goto err_destroy_groups; + ft->num_groups++; + + memset(in, 0, inlen); + MLX5_SET_CFG(in, match_criteria_enable, MLX5_MATCH_OUTER_HEADERS); + MLX5_SET_TO_ONES(fte_match_param, mc, outer_headers.ethertype); + MLX5_SET_TO_ONES(fte_match_param, mc, outer_headers.ip_protocol); + dmac[0] = 0x01; + MLX5_SET_CFG(in, start_flow_index, ix); + ix += MLX5E_MAIN_GROUP6_SIZE; + MLX5_SET_CFG(in, end_flow_index, ix - 1); + ft->g[ft->num_groups] = mlx5_create_flow_group(ft->t, in); + if (IS_ERR(ft->g[ft->num_groups])) + goto err_destroy_groups; + ft->num_groups++; + + memset(in, 0, inlen); + MLX5_SET_CFG(in, match_criteria_enable, MLX5_MATCH_OUTER_HEADERS); + MLX5_SET_TO_ONES(fte_match_param, mc, outer_headers.ethertype); + dmac[0] = 0x01; + MLX5_SET_CFG(in, start_flow_index, ix); + ix += MLX5E_MAIN_GROUP7_SIZE; + MLX5_SET_CFG(in, end_flow_index, ix - 1); + ft->g[ft->num_groups] = mlx5_create_flow_group(ft->t, in); + if (IS_ERR(ft->g[ft->num_groups])) + goto err_destroy_groups; + ft->num_groups++; + + memset(in, 0, inlen); + MLX5_SET_CFG(in, match_criteria_enable, MLX5_MATCH_OUTER_HEADERS); + dmac[0] = 0x01; + MLX5_SET_CFG(in, start_flow_index, ix); + ix += MLX5E_MAIN_GROUP8_SIZE; + MLX5_SET_CFG(in, end_flow_index, ix - 1); + ft->g[ft->num_groups] = mlx5_create_flow_group(ft->t, in); + if (IS_ERR(ft->g[ft->num_groups])) + goto err_destroy_groups; + ft->num_groups++; + + return 0; + +err_destroy_groups: + err = PTR_ERR(ft->g[ft->num_groups]); + ft->g[ft->num_groups] = NULL; + mlx5e_destroy_groups(ft); + + return err; +} - g = kcalloc(9, sizeof(*g), GFP_KERNEL); - if (!g) +static int mlx5e_create_main_groups(struct mlx5e_flow_table *ft) +{ + u32 *in; + int inlen = MLX5_ST_SZ_BYTES(create_flow_group_in); + int err; + + in = 
mlx5_vzalloc(inlen); + if (!in) return -ENOMEM; - g[0].log_sz = 3; - g[0].match_criteria_enable = MLX5_MATCH_OUTER_HEADERS; - MLX5_SET_TO_ONES(fte_match_param, g[0].match_criteria, - outer_headers.ethertype); - MLX5_SET_TO_ONES(fte_match_param, g[0].match_criteria, - outer_headers.ip_protocol); - - g[1].log_sz = 1; - g[1].match_criteria_enable = MLX5_MATCH_OUTER_HEADERS; - MLX5_SET_TO_ONES(fte_match_param, g[1].match_criteria, - outer_headers.ethertype); - - g[2].log_sz = 0; - - g[3].log_sz = 14; - g[3].match_criteria_enable = MLX5_MATCH_OUTER_HEADERS; - dmac = MLX5_ADDR_OF(fte_match_param, g[3].match_criteria, - outer_headers.dmac_47_16); - memset(dmac, 0xff, ETH_ALEN); - MLX5_SET_TO_ONES(fte_match_param, g[3].match_criteria, - outer_headers.ethertype); - MLX5_SET_TO_ONES(fte_match_param, g[3].match_criteria, - outer_headers.ip_protocol); - - g[4].log_sz = 13; - g[4].match_criteria_enable = MLX5_MATCH_OUTER_HEADERS; - dmac = MLX5_ADDR_OF(fte_match_param, g[4].match_criteria, - outer_headers.dmac_47_16); - memset(dmac, 0xff, ETH_ALEN); - MLX5_SET_TO_ONES(fte_match_param, g[4].match_criteria, - outer_headers.ethertype); - - g[5].log_sz = 11; - g[5].match_criteria_enable = MLX5_MATCH_OUTER_HEADERS; - dmac = MLX5_ADDR_OF(fte_match_param, g[5].match_criteria, - outer_headers.dmac_47_16); - memset(dmac, 0xff, ETH_ALEN); - - g[6].log_sz = 2; - g[6].match_criteria_enable = MLX5_MATCH_OUTER_HEADERS; - dmac = MLX5_ADDR_OF(fte_match_param, g[6].match_criteria, - outer_headers.dmac_47_16); - dmac[0] = 0x01; - MLX5_SET_TO_ONES(fte_match_param, g[6].match_criteria, - outer_headers.ethertype); - MLX5_SET_TO_ONES(fte_match_param, g[6].match_criteria, - outer_headers.ip_protocol); - - g[7].log_sz = 1; - g[7].match_criteria_enable = MLX5_MATCH_OUTER_HEADERS; - dmac = MLX5_ADDR_OF(fte_match_param, g[7].match_criteria, - outer_headers.dmac_47_16); - dmac[0] = 0x01; - MLX5_SET_TO_ONES(fte_match_param, g[7].match_criteria, - outer_headers.ethertype); + err = __mlx5e_create_main_groups(ft, in, inlen); - g[8].log_sz = 0; - g[8].match_criteria_enable = MLX5_MATCH_OUTER_HEADERS; - dmac = MLX5_ADDR_OF(fte_match_param, g[8].match_criteria, - outer_headers.dmac_47_16); - dmac[0] = 0x01; - priv->ft.main = mlx5_create_flow_table(priv->mdev, 1, - MLX5_FLOW_TABLE_TYPE_NIC_RCV, - 9, g); - kfree(g); + kvfree(in); + return err; +} - return priv->ft.main ? 
0 : -ENOMEM; +static int mlx5e_create_main_flow_table(struct mlx5e_priv *priv) +{ + struct mlx5e_flow_table *ft = &priv->fts.main; + int err; + + ft->num_groups = 0; + ft->t = mlx5_create_flow_table(priv->fts.ns, 0, MLX5E_MAIN_TABLE_SIZE); + + if (IS_ERR(ft->t)) { + err = PTR_ERR(ft->t); + ft->t = NULL; + return err; + } + ft->g = kcalloc(MLX5E_NUM_MAIN_GROUPS, sizeof(*ft->g), GFP_KERNEL); + if (!ft->g) { + err = -ENOMEM; + goto err_destroy_main_flow_table; + } + + err = mlx5e_create_main_groups(ft); + if (err) + goto err_free_g; + return 0; + +err_free_g: + kfree(ft->g); + +err_destroy_main_flow_table: + mlx5_destroy_flow_table(ft->t); + ft->t = NULL; + + return err; +} + +static void mlx5e_destroy_flow_table(struct mlx5e_flow_table *ft) +{ + mlx5e_destroy_groups(ft); + kfree(ft->g); + mlx5_destroy_flow_table(ft->t); + ft->t = NULL; } static void mlx5e_destroy_main_flow_table(struct mlx5e_priv *priv) { - mlx5_destroy_flow_table(priv->ft.main); + mlx5e_destroy_flow_table(&priv->fts.main); } -static int mlx5e_create_vlan_flow_table(struct mlx5e_priv *priv) +#define MLX5E_NUM_VLAN_GROUPS 2 +#define MLX5E_VLAN_GROUP0_SIZE BIT(12) +#define MLX5E_VLAN_GROUP1_SIZE BIT(1) +#define MLX5E_VLAN_TABLE_SIZE (MLX5E_VLAN_GROUP0_SIZE +\ + MLX5E_VLAN_GROUP1_SIZE) + +static int __mlx5e_create_vlan_groups(struct mlx5e_flow_table *ft, u32 *in, + int inlen) +{ + int err; + int ix = 0; + u8 *mc = MLX5_ADDR_OF(create_flow_group_in, in, match_criteria); + + memset(in, 0, inlen); + MLX5_SET_CFG(in, match_criteria_enable, MLX5_MATCH_OUTER_HEADERS); + MLX5_SET_TO_ONES(fte_match_param, mc, outer_headers.vlan_tag); + MLX5_SET_TO_ONES(fte_match_param, mc, outer_headers.first_vid); + MLX5_SET_CFG(in, start_flow_index, ix); + ix += MLX5E_VLAN_GROUP0_SIZE; + MLX5_SET_CFG(in, end_flow_index, ix - 1); + ft->g[ft->num_groups] = mlx5_create_flow_group(ft->t, in); + if (IS_ERR(ft->g[ft->num_groups])) + goto err_destroy_groups; + ft->num_groups++; + + memset(in, 0, inlen); + MLX5_SET_CFG(in, match_criteria_enable, MLX5_MATCH_OUTER_HEADERS); + MLX5_SET_TO_ONES(fte_match_param, mc, outer_headers.vlan_tag); + MLX5_SET_CFG(in, start_flow_index, ix); + ix += MLX5E_VLAN_GROUP1_SIZE; + MLX5_SET_CFG(in, end_flow_index, ix - 1); + ft->g[ft->num_groups] = mlx5_create_flow_group(ft->t, in); + if (IS_ERR(ft->g[ft->num_groups])) + goto err_destroy_groups; + ft->num_groups++; + + return 0; + +err_destroy_groups: + err = PTR_ERR(ft->g[ft->num_groups]); + ft->g[ft->num_groups] = NULL; + mlx5e_destroy_groups(ft); + + return err; +} + +static int mlx5e_create_vlan_groups(struct mlx5e_flow_table *ft) { - struct mlx5_flow_table_group *g; + u32 *in; + int inlen = MLX5_ST_SZ_BYTES(create_flow_group_in); + int err; - g = kcalloc(2, sizeof(*g), GFP_KERNEL); - if (!g) + in = mlx5_vzalloc(inlen); + if (!in) return -ENOMEM; - g[0].log_sz = 12; - g[0].match_criteria_enable = MLX5_MATCH_OUTER_HEADERS; - MLX5_SET_TO_ONES(fte_match_param, g[0].match_criteria, - outer_headers.vlan_tag); - MLX5_SET_TO_ONES(fte_match_param, g[0].match_criteria, - outer_headers.first_vid); - - /* untagged + any vlan id */ - g[1].log_sz = 1; - g[1].match_criteria_enable = MLX5_MATCH_OUTER_HEADERS; - MLX5_SET_TO_ONES(fte_match_param, g[1].match_criteria, - outer_headers.vlan_tag); - - priv->ft.vlan = mlx5_create_flow_table(priv->mdev, 0, - MLX5_FLOW_TABLE_TYPE_NIC_RCV, - 2, g); - - kfree(g); - return priv->ft.vlan ? 
0 : -ENOMEM; + err = __mlx5e_create_vlan_groups(ft, in, inlen); + + kvfree(in); + return err; +} + +static int mlx5e_create_vlan_flow_table(struct mlx5e_priv *priv) +{ + struct mlx5e_flow_table *ft = &priv->fts.vlan; + int err; + + ft->num_groups = 0; + ft->t = mlx5_create_flow_table(priv->fts.ns, 0, MLX5E_VLAN_TABLE_SIZE); + + if (IS_ERR(ft->t)) { + err = PTR_ERR(ft->t); + ft->t = NULL; + return err; + } + ft->g = kcalloc(MLX5E_NUM_VLAN_GROUPS, sizeof(*ft->g), GFP_KERNEL); + if (!ft->g) { + err = -ENOMEM; + goto err_destroy_vlan_flow_table; + } + + err = mlx5e_create_vlan_groups(ft); + if (err) + goto err_free_g; + + return 0; + +err_free_g: + kfree(ft->g); + +err_destroy_vlan_flow_table: + mlx5_destroy_flow_table(ft->t); + ft->t = NULL; + + return err; } static void mlx5e_destroy_vlan_flow_table(struct mlx5e_priv *priv) { - mlx5_destroy_flow_table(priv->ft.vlan); + mlx5e_destroy_flow_table(&priv->fts.vlan); } int mlx5e_create_flow_tables(struct mlx5e_priv *priv) { int err; - err = mlx5e_create_main_flow_table(priv); + priv->fts.ns = mlx5_get_flow_namespace(priv->mdev, + MLX5_FLOW_NAMESPACE_KERNEL); + + if (!priv->fts.ns) + return -EINVAL; + + err = mlx5e_create_vlan_flow_table(priv); if (err) return err; - err = mlx5e_create_vlan_flow_table(priv); + err = mlx5e_create_main_flow_table(priv); if (err) - goto err_destroy_main_flow_table; + goto err_destroy_vlan_flow_table; err = mlx5e_add_vlan_rule(priv, MLX5E_VLAN_RULE_TYPE_UNTAGGED, 0); if (err) - goto err_destroy_vlan_flow_table; + goto err_destroy_main_flow_table; return 0; -err_destroy_vlan_flow_table: - mlx5e_destroy_vlan_flow_table(priv); - err_destroy_main_flow_table: mlx5e_destroy_main_flow_table(priv); +err_destroy_vlan_flow_table: + mlx5e_destroy_vlan_flow_table(priv); return err; } @@ -1041,6 +1219,6 @@ err_destroy_main_flow_table: void mlx5e_destroy_flow_tables(struct mlx5e_priv *priv) { mlx5e_del_vlan_rule(priv, MLX5E_VLAN_RULE_TYPE_UNTAGGED, 0); - mlx5e_destroy_vlan_flow_table(priv); mlx5e_destroy_main_flow_table(priv); + mlx5e_destroy_vlan_flow_table(priv); } diff --git a/drivers/net/ethernet/mellanox/mlx5/core/en_main.c b/drivers/net/ethernet/mellanox/mlx5/core/en_main.c index a20be56df553..d4601a564699 100644 --- a/drivers/net/ethernet/mellanox/mlx5/core/en_main.c +++ b/drivers/net/ethernet/mellanox/mlx5/core/en_main.c @@ -30,7 +30,7 @@ * SOFTWARE. 
*/ -#include +#include #include "en.h" #include "eswitch.h" diff --git a/drivers/net/ethernet/mellanox/mlx5/core/eswitch.c b/drivers/net/ethernet/mellanox/mlx5/core/eswitch.c index d8939e597c54..bc3d9f8a75c1 100644 --- a/drivers/net/ethernet/mellanox/mlx5/core/eswitch.c +++ b/drivers/net/ethernet/mellanox/mlx5/core/eswitch.c @@ -34,7 +34,7 @@ #include #include #include -#include +#include #include "mlx5_core.h" #include "eswitch.h" @@ -321,220 +321,6 @@ static void del_l2_table_entry(struct mlx5_core_dev *dev, u32 index) free_l2_table_index(l2_table, index); } -/* E-Switch FDB flow steering */ -struct dest_node { - struct list_head list; - struct mlx5_flow_destination dest; -}; - -static int _mlx5_flow_rule_apply(struct mlx5_flow_rule *fr) -{ - bool was_valid = fr->valid; - struct dest_node *dest_n; - u32 dest_list_size = 0; - void *in_match_value; - u32 *flow_context; - u32 flow_index; - int err; - int i; - - if (list_empty(&fr->dest_list)) { - if (fr->valid) - mlx5_del_flow_table_entry(fr->ft, fr->fi); - fr->valid = false; - return 0; - } - - list_for_each_entry(dest_n, &fr->dest_list, list) - dest_list_size++; - - flow_context = mlx5_vzalloc(MLX5_ST_SZ_BYTES(flow_context) + - MLX5_ST_SZ_BYTES(dest_format_struct) * - dest_list_size); - if (!flow_context) - return -ENOMEM; - - MLX5_SET(flow_context, flow_context, flow_tag, fr->flow_tag); - MLX5_SET(flow_context, flow_context, action, fr->action); - MLX5_SET(flow_context, flow_context, destination_list_size, - dest_list_size); - - i = 0; - list_for_each_entry(dest_n, &fr->dest_list, list) { - void *dest_addr = MLX5_ADDR_OF(flow_context, flow_context, - destination[i++]); - - MLX5_SET(dest_format_struct, dest_addr, destination_type, - dest_n->dest.type); - MLX5_SET(dest_format_struct, dest_addr, destination_id, - dest_n->dest.vport_num); - } - - in_match_value = MLX5_ADDR_OF(flow_context, flow_context, match_value); - memcpy(in_match_value, fr->match_value, MLX5_ST_SZ_BYTES(fte_match_param)); - - err = mlx5_add_flow_table_entry(fr->ft, fr->match_criteria_enable, - fr->match_criteria, flow_context, - &flow_index); - if (!err) { - if (was_valid) - mlx5_del_flow_table_entry(fr->ft, fr->fi); - fr->fi = flow_index; - fr->valid = true; - } - kfree(flow_context); - return err; -} - -static int mlx5_flow_rule_add_dest(struct mlx5_flow_rule *fr, - struct mlx5_flow_destination *new_dest) -{ - struct dest_node *dest_n; - int err; - - dest_n = kzalloc(sizeof(*dest_n), GFP_KERNEL); - if (!dest_n) - return -ENOMEM; - - memcpy(&dest_n->dest, new_dest, sizeof(dest_n->dest)); - mutex_lock(&fr->mutex); - list_add(&dest_n->list, &fr->dest_list); - err = _mlx5_flow_rule_apply(fr); - if (err) { - list_del(&dest_n->list); - kfree(dest_n); - } - mutex_unlock(&fr->mutex); - return err; -} - -static int mlx5_flow_rule_del_dest(struct mlx5_flow_rule *fr, - struct mlx5_flow_destination *dest) -{ - struct dest_node *dest_n; - struct dest_node *n; - int err; - - mutex_lock(&fr->mutex); - list_for_each_entry_safe(dest_n, n, &fr->dest_list, list) { - if (dest->vport_num == dest_n->dest.vport_num) - goto found; - } - mutex_unlock(&fr->mutex); - return -ENOENT; - -found: - list_del(&dest_n->list); - err = _mlx5_flow_rule_apply(fr); - mutex_unlock(&fr->mutex); - kfree(dest_n); - - return err; -} - -static struct mlx5_flow_rule *find_fr(struct mlx5_eswitch *esw, - u8 match_criteria_enable, - u32 *match_value) -{ - struct hlist_head *hash = esw->mc_table; - struct esw_mc_addr *esw_mc; - u8 *dmac_v; - - dmac_v = MLX5_ADDR_OF(fte_match_param, match_value, - 
outer_headers.dmac_47_16); - - /* UNICAST FULL MATCH */ - if (!is_multicast_ether_addr(dmac_v)) - return NULL; - - /* MULTICAST FULL MATCH */ - esw_mc = l2addr_hash_find(hash, dmac_v, struct esw_mc_addr); - - return esw_mc ? esw_mc->uplink_rule : NULL; -} - -static struct mlx5_flow_rule *alloc_fr(void *ft, - u8 match_criteria_enable, - u32 *match_criteria, - u32 *match_value, - u32 action, - u32 flow_tag) -{ - struct mlx5_flow_rule *fr = kzalloc(sizeof(*fr), GFP_KERNEL); - - if (!fr) - return NULL; - - fr->match_criteria = kzalloc(MLX5_ST_SZ_BYTES(fte_match_param), GFP_KERNEL); - fr->match_value = kzalloc(MLX5_ST_SZ_BYTES(fte_match_param), GFP_KERNEL); - if (!fr->match_criteria || !fr->match_value) { - kfree(fr->match_criteria); - kfree(fr->match_value); - kfree(fr); - return NULL; - } - - memcpy(fr->match_criteria, match_criteria, MLX5_ST_SZ_BYTES(fte_match_param)); - memcpy(fr->match_value, match_value, MLX5_ST_SZ_BYTES(fte_match_param)); - fr->match_criteria_enable = match_criteria_enable; - fr->flow_tag = flow_tag; - fr->action = action; - - mutex_init(&fr->mutex); - INIT_LIST_HEAD(&fr->dest_list); - atomic_set(&fr->refcount, 0); - fr->ft = ft; - return fr; -} - -static void deref_fr(struct mlx5_flow_rule *fr) -{ - if (!atomic_dec_and_test(&fr->refcount)) - return; - - kfree(fr->match_criteria); - kfree(fr->match_value); - kfree(fr); -} - -static struct mlx5_flow_rule * -mlx5_add_flow_rule(struct mlx5_eswitch *esw, - u8 match_criteria_enable, - u32 *match_criteria, - u32 *match_value, - u32 action, - u32 flow_tag, - struct mlx5_flow_destination *dest) -{ - struct mlx5_flow_rule *fr; - int err; - - fr = find_fr(esw, match_criteria_enable, match_value); - fr = fr ? fr : alloc_fr(esw->fdb_table.fdb, match_criteria_enable, match_criteria, - match_value, action, flow_tag); - if (!fr) - return NULL; - - atomic_inc(&fr->refcount); - - err = mlx5_flow_rule_add_dest(fr, dest); - if (err) { - deref_fr(fr); - return NULL; - } - - return fr; -} - -static void mlx5_del_flow_rule(struct mlx5_flow_rule *fr, u32 vport) -{ - struct mlx5_flow_destination dest; - - dest.vport_num = vport; - mlx5_flow_rule_del_dest(fr, &dest); - deref_fr(fr); -} - /* E-Switch FDB */ static struct mlx5_flow_rule * esw_fdb_set_vport_rule(struct mlx5_eswitch *esw, u8 mac[ETH_ALEN], u32 vport) @@ -569,7 +355,7 @@ esw_fdb_set_vport_rule(struct mlx5_eswitch *esw, u8 mac[ETH_ALEN], u32 vport) "\tFDB add rule dmac_v(%pM) dmac_c(%pM) -> vport(%d)\n", dmac_v, dmac_c, vport); flow_rule = - mlx5_add_flow_rule(esw, + mlx5_add_flow_rule(esw->fdb_table.fdb, match_header, match_c, match_v, @@ -589,33 +375,61 @@ out: static int esw_create_fdb_table(struct mlx5_eswitch *esw, int nvports) { + int inlen = MLX5_ST_SZ_BYTES(create_flow_group_in); struct mlx5_core_dev *dev = esw->dev; - struct mlx5_flow_table_group g; + struct mlx5_flow_namespace *root_ns; struct mlx5_flow_table *fdb; + struct mlx5_flow_group *g; + void *match_criteria; + int table_size; + u32 *flow_group_in; u8 *dmac; + int err = 0; esw_debug(dev, "Create FDB log_max_size(%d)\n", MLX5_CAP_ESW_FLOWTABLE_FDB(dev, log_max_ft_size)); - memset(&g, 0, sizeof(g)); - /* UC MC Full match rules*/ - g.log_sz = MLX5_CAP_ESW_FLOWTABLE_FDB(dev, log_max_ft_size); - g.match_criteria_enable = MLX5_MATCH_OUTER_HEADERS; - dmac = MLX5_ADDR_OF(fte_match_param, g.match_criteria, - outer_headers.dmac_47_16); - /* Match criteria mask */ - memset(dmac, 0xff, 6); + root_ns = mlx5_get_flow_namespace(dev, MLX5_FLOW_NAMESPACE_FDB); + if (!root_ns) { + esw_warn(dev, "Failed to get FDB flow namespace\n"); + 
return -ENOMEM; + } - fdb = mlx5_create_flow_table(dev, 0, - MLX5_FLOW_TABLE_TYPE_ESWITCH, - 1, &g); - if (fdb) - esw_debug(dev, "ESW: FDB Table created fdb->id %d\n", mlx5_get_flow_table_id(fdb)); - else - esw_warn(dev, "ESW: Failed to create FDB Table\n"); + flow_group_in = mlx5_vzalloc(inlen); + if (!flow_group_in) + return -ENOMEM; + memset(flow_group_in, 0, inlen); + + table_size = BIT(MLX5_CAP_ESW_FLOWTABLE_FDB(dev, log_max_ft_size)); + fdb = mlx5_create_flow_table(root_ns, 0, table_size); + if (IS_ERR_OR_NULL(fdb)) { + err = PTR_ERR(fdb); + esw_warn(dev, "Failed to create FDB Table err %d\n", err); + goto out; + } + MLX5_SET(create_flow_group_in, flow_group_in, match_criteria_enable, + MLX5_MATCH_OUTER_HEADERS); + match_criteria = MLX5_ADDR_OF(create_flow_group_in, flow_group_in, match_criteria); + dmac = MLX5_ADDR_OF(fte_match_param, match_criteria, outer_headers.dmac_47_16); + MLX5_SET(create_flow_group_in, flow_group_in, start_flow_index, 0); + MLX5_SET(create_flow_group_in, flow_group_in, end_flow_index, table_size - 1); + eth_broadcast_addr(dmac); + + g = mlx5_create_flow_group(fdb, flow_group_in); + if (IS_ERR_OR_NULL(g)) { + err = PTR_ERR(g); + esw_warn(dev, "Failed to create flow group err(%d)\n", err); + goto out; + } + + esw->fdb_table.addr_grp = g; esw->fdb_table.fdb = fdb; - return fdb ? 0 : -ENOMEM; +out: + kvfree(flow_group_in); + if (err && !IS_ERR_OR_NULL(fdb)) + mlx5_destroy_flow_table(fdb); + return err; } static void esw_destroy_fdb_table(struct mlx5_eswitch *esw) @@ -623,10 +437,11 @@ static void esw_destroy_fdb_table(struct mlx5_eswitch *esw) if (!esw->fdb_table.fdb) return; - esw_debug(esw->dev, "Destroy FDB Table fdb(%d)\n", - mlx5_get_flow_table_id(esw->fdb_table.fdb)); + esw_debug(esw->dev, "Destroy FDB Table\n"); + mlx5_destroy_flow_group(esw->fdb_table.addr_grp); mlx5_destroy_flow_table(esw->fdb_table.fdb); esw->fdb_table.fdb = NULL; + esw->fdb_table.addr_grp = NULL; } /* E-Switch vport UC/MC lists management */ @@ -689,7 +504,7 @@ static int esw_del_uc_addr(struct mlx5_eswitch *esw, struct vport_addr *vaddr) del_l2_table_entry(esw->dev, esw_uc->table_index); if (vaddr->flow_rule) - mlx5_del_flow_rule(vaddr->flow_rule, vport); + mlx5_del_flow_rule(vaddr->flow_rule); vaddr->flow_rule = NULL; l2addr_hash_del(esw_uc); @@ -750,14 +565,14 @@ static int esw_del_mc_addr(struct mlx5_eswitch *esw, struct vport_addr *vaddr) esw_mc->uplink_rule); if (vaddr->flow_rule) - mlx5_del_flow_rule(vaddr->flow_rule, vport); + mlx5_del_flow_rule(vaddr->flow_rule); vaddr->flow_rule = NULL; if (--esw_mc->refcnt) return 0; if (esw_mc->uplink_rule) - mlx5_del_flow_rule(esw_mc->uplink_rule, UPLINK_VPORT); + mlx5_del_flow_rule(esw_mc->uplink_rule); l2addr_hash_del(esw_mc); return 0; diff --git a/drivers/net/ethernet/mellanox/mlx5/core/eswitch.h b/drivers/net/ethernet/mellanox/mlx5/core/eswitch.h index 02ff3eade026..3416a428f70f 100644 --- a/drivers/net/ethernet/mellanox/mlx5/core/eswitch.h +++ b/drivers/net/ethernet/mellanox/mlx5/core/eswitch.h @@ -88,20 +88,6 @@ struct l2addr_node { kfree(ptr); \ }) -struct mlx5_flow_rule { - void *ft; - u32 fi; - u8 match_criteria_enable; - u32 *match_criteria; - u32 *match_value; - u32 action; - u32 flow_tag; - bool valid; - atomic_t refcount; - struct mutex mutex; /* protect flow rule updates */ - struct list_head dest_list; -}; - struct mlx5_vport { struct mlx5_core_dev *dev; int vport; @@ -126,6 +112,7 @@ struct mlx5_l2_table { struct mlx5_eswitch_fdb { void *fdb; + struct mlx5_flow_group *addr_grp; }; struct mlx5_eswitch {
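A note on the esw_create_fdb_table() hunk above: flow_group_in comes from mlx5_vzalloc(), which falls back from kzalloc() to vzalloc(), so the inbox must be released with kvfree() rather than kfree() — the en_flow_table.c hunks earlier in this patch follow the same pairing. A minimal sketch of that allocation contract (illustrative function name, not from the patch):

#include <linux/errno.h>
#include <linux/mm.h>                   /* kvfree() */
#include <linux/mlx5/driver.h>          /* mlx5_vzalloc(), MLX5_ST_SZ_BYTES() */
#include <linux/mlx5/mlx5_ifc.h>        /* create_flow_group_in layout */

static int example_inbox_lifetime(void)
{
        u32 *in = mlx5_vzalloc(MLX5_ST_SZ_BYTES(create_flow_group_in));

        if (!in)
                return -ENOMEM;
        /* ... fill the create_flow_group_in inbox, execute the command ... */
        kvfree(in);     /* not kfree(): the buffer may be vmalloc'ed */
        return 0;
}

diff --git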
a/drivers/net/ethernet/mellanox/mlx5/core/flow_table.c b/drivers/net/ethernet/mellanox/mlx5/core/flow_table.c deleted file mode 100644 index ca90b9bc3b95..000000000000 --- a/drivers/net/ethernet/mellanox/mlx5/core/flow_table.c +++ /dev/null @@ -1,422 +0,0 @@ -/* - * Copyright (c) 2013-2015, Mellanox Technologies, Ltd. All rights reserved. - * - * This software is available to you under a choice of one of two - * licenses. You may choose to be licensed under the terms of the GNU - * General Public License (GPL) Version 2, available from the file - * COPYING in the main directory of this source tree, or the - * OpenIB.org BSD license below: - * - * Redistribution and use in source and binary forms, with or - * without modification, are permitted provided that the following - * conditions are met: - * - * - Redistributions of source code must retain the above - * copyright notice, this list of conditions and the following - * disclaimer. - * - * - Redistributions in binary form must reproduce the above - * copyright notice, this list of conditions and the following - * disclaimer in the documentation and/or other materials - * provided with the distribution. - * - * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, - * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF - * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND - * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS - * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN - * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN - * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE - * SOFTWARE. - */ - -#include -#include -#include -#include "mlx5_core.h" - -struct mlx5_ftg { - struct mlx5_flow_table_group g; - u32 id; - u32 start_ix; -}; - -struct mlx5_flow_table { - struct mlx5_core_dev *dev; - u8 level; - u8 type; - u32 id; - struct mutex mutex; /* sync bitmap alloc */ - u16 num_groups; - struct mlx5_ftg *group; - unsigned long *bitmap; - u32 size; -}; - -static int mlx5_set_flow_entry_cmd(struct mlx5_flow_table *ft, u32 group_ix, - u32 flow_index, void *flow_context) -{ - u32 out[MLX5_ST_SZ_DW(set_fte_out)]; - u32 *in; - void *in_flow_context; - int fcdls = - MLX5_GET(flow_context, flow_context, destination_list_size) * - MLX5_ST_SZ_BYTES(dest_format_struct); - int inlen = MLX5_ST_SZ_BYTES(set_fte_in) + fcdls; - int err; - - in = mlx5_vzalloc(inlen); - if (!in) { - mlx5_core_warn(ft->dev, "failed to allocate inbox\n"); - return -ENOMEM; - } - - MLX5_SET(set_fte_in, in, table_type, ft->type); - MLX5_SET(set_fte_in, in, table_id, ft->id); - MLX5_SET(set_fte_in, in, flow_index, flow_index); - MLX5_SET(set_fte_in, in, opcode, MLX5_CMD_OP_SET_FLOW_TABLE_ENTRY); - - in_flow_context = MLX5_ADDR_OF(set_fte_in, in, flow_context); - memcpy(in_flow_context, flow_context, - MLX5_ST_SZ_BYTES(flow_context) + fcdls); - - MLX5_SET(flow_context, in_flow_context, group_id, - ft->group[group_ix].id); - - memset(out, 0, sizeof(out)); - err = mlx5_cmd_exec_check_status(ft->dev, in, inlen, out, - sizeof(out)); - kvfree(in); - - return err; -} - -static void mlx5_del_flow_entry_cmd(struct mlx5_flow_table *ft, u32 flow_index) -{ - u32 in[MLX5_ST_SZ_DW(delete_fte_in)]; - u32 out[MLX5_ST_SZ_DW(delete_fte_out)]; - - memset(in, 0, sizeof(in)); - memset(out, 0, sizeof(out)); - -#define MLX5_SET_DFTEI(p, x, v) MLX5_SET(delete_fte_in, p, x, v) - MLX5_SET_DFTEI(in, table_type, ft->type); - MLX5_SET_DFTEI(in, table_id, ft->id); - MLX5_SET_DFTEI(in, flow_index, 
flow_index); - MLX5_SET_DFTEI(in, opcode, MLX5_CMD_OP_DELETE_FLOW_TABLE_ENTRY); - - mlx5_cmd_exec_check_status(ft->dev, in, sizeof(in), out, sizeof(out)); -} - -static void mlx5_destroy_flow_group_cmd(struct mlx5_flow_table *ft, int i) -{ - u32 in[MLX5_ST_SZ_DW(destroy_flow_group_in)]; - u32 out[MLX5_ST_SZ_DW(destroy_flow_group_out)]; - - memset(in, 0, sizeof(in)); - memset(out, 0, sizeof(out)); - -#define MLX5_SET_DFGI(p, x, v) MLX5_SET(destroy_flow_group_in, p, x, v) - MLX5_SET_DFGI(in, table_type, ft->type); - MLX5_SET_DFGI(in, table_id, ft->id); - MLX5_SET_DFGI(in, opcode, MLX5_CMD_OP_DESTROY_FLOW_GROUP); - MLX5_SET_DFGI(in, group_id, ft->group[i].id); - mlx5_cmd_exec_check_status(ft->dev, in, sizeof(in), out, sizeof(out)); -} - -static int mlx5_create_flow_group_cmd(struct mlx5_flow_table *ft, int i) -{ - u32 out[MLX5_ST_SZ_DW(create_flow_group_out)]; - u32 *in; - void *in_match_criteria; - int inlen = MLX5_ST_SZ_BYTES(create_flow_group_in); - struct mlx5_flow_table_group *g = &ft->group[i].g; - u32 start_ix = ft->group[i].start_ix; - u32 end_ix = start_ix + (1 << g->log_sz) - 1; - int err; - - in = mlx5_vzalloc(inlen); - if (!in) { - mlx5_core_warn(ft->dev, "failed to allocate inbox\n"); - return -ENOMEM; - } - in_match_criteria = MLX5_ADDR_OF(create_flow_group_in, in, - match_criteria); - - memset(out, 0, sizeof(out)); - -#define MLX5_SET_CFGI(p, x, v) MLX5_SET(create_flow_group_in, p, x, v) - MLX5_SET_CFGI(in, table_type, ft->type); - MLX5_SET_CFGI(in, table_id, ft->id); - MLX5_SET_CFGI(in, opcode, MLX5_CMD_OP_CREATE_FLOW_GROUP); - MLX5_SET_CFGI(in, start_flow_index, start_ix); - MLX5_SET_CFGI(in, end_flow_index, end_ix); - MLX5_SET_CFGI(in, match_criteria_enable, g->match_criteria_enable); - - memcpy(in_match_criteria, g->match_criteria, - MLX5_ST_SZ_BYTES(fte_match_param)); - - err = mlx5_cmd_exec_check_status(ft->dev, in, inlen, out, - sizeof(out)); - if (!err) - ft->group[i].id = MLX5_GET(create_flow_group_out, out, - group_id); - - kvfree(in); - - return err; -} - -static void mlx5_destroy_flow_table_groups(struct mlx5_flow_table *ft) -{ - int i; - - for (i = 0; i < ft->num_groups; i++) - mlx5_destroy_flow_group_cmd(ft, i); -} - -static int mlx5_create_flow_table_groups(struct mlx5_flow_table *ft) -{ - int err; - int i; - - for (i = 0; i < ft->num_groups; i++) { - err = mlx5_create_flow_group_cmd(ft, i); - if (err) - goto err_destroy_flow_table_groups; - } - - return 0; - -err_destroy_flow_table_groups: - for (i--; i >= 0; i--) - mlx5_destroy_flow_group_cmd(ft, i); - - return err; -} - -static int mlx5_create_flow_table_cmd(struct mlx5_flow_table *ft) -{ - u32 in[MLX5_ST_SZ_DW(create_flow_table_in)]; - u32 out[MLX5_ST_SZ_DW(create_flow_table_out)]; - int err; - - memset(in, 0, sizeof(in)); - - MLX5_SET(create_flow_table_in, in, table_type, ft->type); - MLX5_SET(create_flow_table_in, in, level, ft->level); - MLX5_SET(create_flow_table_in, in, log_size, order_base_2(ft->size)); - - MLX5_SET(create_flow_table_in, in, opcode, - MLX5_CMD_OP_CREATE_FLOW_TABLE); - - memset(out, 0, sizeof(out)); - err = mlx5_cmd_exec_check_status(ft->dev, in, sizeof(in), out, - sizeof(out)); - if (err) - return err; - - ft->id = MLX5_GET(create_flow_table_out, out, table_id); - - return 0; -} - -static void mlx5_destroy_flow_table_cmd(struct mlx5_flow_table *ft) -{ - u32 in[MLX5_ST_SZ_DW(destroy_flow_table_in)]; - u32 out[MLX5_ST_SZ_DW(destroy_flow_table_out)]; - - memset(in, 0, sizeof(in)); - memset(out, 0, sizeof(out)); - -#define MLX5_SET_DFTI(p, x, v) MLX5_SET(destroy_flow_table_in, p, x, v) - 
MLX5_SET_DFTI(in, table_type, ft->type); - MLX5_SET_DFTI(in, table_id, ft->id); - MLX5_SET_DFTI(in, opcode, MLX5_CMD_OP_DESTROY_FLOW_TABLE); - - mlx5_cmd_exec_check_status(ft->dev, in, sizeof(in), out, sizeof(out)); -} - -static int mlx5_find_group(struct mlx5_flow_table *ft, u8 match_criteria_enable, - u32 *match_criteria, int *group_ix) -{ - void *mc_outer = MLX5_ADDR_OF(fte_match_param, match_criteria, - outer_headers); - void *mc_misc = MLX5_ADDR_OF(fte_match_param, match_criteria, - misc_parameters); - void *mc_inner = MLX5_ADDR_OF(fte_match_param, match_criteria, - inner_headers); - int mc_outer_sz = MLX5_ST_SZ_BYTES(fte_match_set_lyr_2_4); - int mc_misc_sz = MLX5_ST_SZ_BYTES(fte_match_set_misc); - int mc_inner_sz = MLX5_ST_SZ_BYTES(fte_match_set_lyr_2_4); - int i; - - for (i = 0; i < ft->num_groups; i++) { - struct mlx5_flow_table_group *g = &ft->group[i].g; - void *gmc_outer = MLX5_ADDR_OF(fte_match_param, - g->match_criteria, - outer_headers); - void *gmc_misc = MLX5_ADDR_OF(fte_match_param, - g->match_criteria, - misc_parameters); - void *gmc_inner = MLX5_ADDR_OF(fte_match_param, - g->match_criteria, - inner_headers); - - if (g->match_criteria_enable != match_criteria_enable) - continue; - - if (match_criteria_enable & MLX5_MATCH_OUTER_HEADERS) - if (memcmp(mc_outer, gmc_outer, mc_outer_sz)) - continue; - - if (match_criteria_enable & MLX5_MATCH_MISC_PARAMETERS) - if (memcmp(mc_misc, gmc_misc, mc_misc_sz)) - continue; - - if (match_criteria_enable & MLX5_MATCH_INNER_HEADERS) - if (memcmp(mc_inner, gmc_inner, mc_inner_sz)) - continue; - - *group_ix = i; - return 0; - } - - return -EINVAL; -} - -static int alloc_flow_index(struct mlx5_flow_table *ft, int group_ix, u32 *ix) -{ - struct mlx5_ftg *g = &ft->group[group_ix]; - int err = 0; - - mutex_lock(&ft->mutex); - - *ix = find_next_zero_bit(ft->bitmap, ft->size, g->start_ix); - if (*ix >= (g->start_ix + (1 << g->g.log_sz))) - err = -ENOSPC; - else - __set_bit(*ix, ft->bitmap); - - mutex_unlock(&ft->mutex); - - return err; -} - -static void mlx5_free_flow_index(struct mlx5_flow_table *ft, u32 ix) -{ - __clear_bit(ix, ft->bitmap); -} - -int mlx5_add_flow_table_entry(void *flow_table, u8 match_criteria_enable, - void *match_criteria, void *flow_context, - u32 *flow_index) -{ - struct mlx5_flow_table *ft = flow_table; - int group_ix; - int err; - - err = mlx5_find_group(ft, match_criteria_enable, match_criteria, - &group_ix); - if (err) { - mlx5_core_warn(ft->dev, "mlx5_find_group failed\n"); - return err; - } - - err = alloc_flow_index(ft, group_ix, flow_index); - if (err) { - mlx5_core_warn(ft->dev, "alloc_flow_index failed\n"); - return err; - } - - return mlx5_set_flow_entry_cmd(ft, group_ix, *flow_index, flow_context); -} -EXPORT_SYMBOL(mlx5_add_flow_table_entry); - -void mlx5_del_flow_table_entry(void *flow_table, u32 flow_index) -{ - struct mlx5_flow_table *ft = flow_table; - - mlx5_del_flow_entry_cmd(ft, flow_index); - mlx5_free_flow_index(ft, flow_index); -} -EXPORT_SYMBOL(mlx5_del_flow_table_entry); - -void *mlx5_create_flow_table(struct mlx5_core_dev *dev, u8 level, u8 table_type, - u16 num_groups, - struct mlx5_flow_table_group *group) -{ - struct mlx5_flow_table *ft; - u32 start_ix = 0; - u32 ft_size = 0; - void *gr; - void *bm; - int err; - int i; - - for (i = 0; i < num_groups; i++) - ft_size += (1 << group[i].log_sz); - - ft = kzalloc(sizeof(*ft), GFP_KERNEL); - gr = kcalloc(num_groups, sizeof(struct mlx5_ftg), GFP_KERNEL); - bm = kcalloc(BITS_TO_LONGS(ft_size), sizeof(uintptr_t), GFP_KERNEL); - if (!ft || !gr || !bm) - 
goto err_free_ft; - - ft->group = gr; - ft->bitmap = bm; - ft->num_groups = num_groups; - ft->level = level; - ft->type = table_type; - ft->size = ft_size; - ft->dev = dev; - mutex_init(&ft->mutex); - - for (i = 0; i < ft->num_groups; i++) { - memcpy(&ft->group[i].g, &group[i], sizeof(*group)); - ft->group[i].start_ix = start_ix; - start_ix += 1 << group[i].log_sz; - } - - err = mlx5_create_flow_table_cmd(ft); - if (err) - goto err_free_ft; - - err = mlx5_create_flow_table_groups(ft); - if (err) - goto err_destroy_flow_table_cmd; - - return ft; - -err_destroy_flow_table_cmd: - mlx5_destroy_flow_table_cmd(ft); - -err_free_ft: - mlx5_core_warn(dev, "failed to alloc flow table\n"); - kfree(bm); - kfree(gr); - kfree(ft); - - return NULL; -} -EXPORT_SYMBOL(mlx5_create_flow_table); - -void mlx5_destroy_flow_table(void *flow_table) -{ - struct mlx5_flow_table *ft = flow_table; - - mlx5_destroy_flow_table_groups(ft); - mlx5_destroy_flow_table_cmd(ft); - kfree(ft->bitmap); - kfree(ft->group); - kfree(ft); -} -EXPORT_SYMBOL(mlx5_destroy_flow_table); - -u32 mlx5_get_flow_table_id(void *flow_table) -{ - struct mlx5_flow_table *ft = flow_table; - - return ft->id; -} -EXPORT_SYMBOL(mlx5_get_flow_table_id); diff --git a/drivers/net/ethernet/mellanox/mlx5/core/fs_core.c b/drivers/net/ethernet/mellanox/mlx5/core/fs_core.c index 4264e8b34b76..f7d62fe595f6 100644 --- a/drivers/net/ethernet/mellanox/mlx5/core/fs_core.c +++ b/drivers/net/ethernet/mellanox/mlx5/core/fs_core.c @@ -436,9 +436,9 @@ static struct mlx5_flow_table *alloc_flow_table(int level, int max_fte, return ft; } -static struct mlx5_flow_table *mlx5_create_flow_table(struct mlx5_flow_namespace *ns, - int prio, - int max_fte) +struct mlx5_flow_table *mlx5_create_flow_table(struct mlx5_flow_namespace *ns, + int prio, + int max_fte) { struct mlx5_flow_table *ft; int err; @@ -491,8 +491,8 @@ unlock_prio: return ERR_PTR(err); } -static struct mlx5_flow_group *mlx5_create_flow_group(struct mlx5_flow_table *ft, - u32 *fg_in) +struct mlx5_flow_group *mlx5_create_flow_group(struct mlx5_flow_table *ft, + u32 *fg_in) { struct mlx5_flow_group *fg; struct mlx5_core_dev *dev = get_dev(&ft->node); @@ -669,7 +669,7 @@ unlock_fg: return rule; } -static struct mlx5_flow_rule * +struct mlx5_flow_rule * mlx5_add_flow_rule(struct mlx5_flow_table *ft, u8 match_criteria_enable, u32 *match_criteria, @@ -699,12 +699,12 @@ put: return rule; } -static void mlx5_del_flow_rule(struct mlx5_flow_rule *rule) +void mlx5_del_flow_rule(struct mlx5_flow_rule *rule) { tree_remove_node(&rule->node); } -static int mlx5_destroy_flow_table(struct mlx5_flow_table *ft) +int mlx5_destroy_flow_table(struct mlx5_flow_table *ft) { if (tree_remove_node(&ft->node)) mlx5_core_warn(get_dev(&ft->node), "Flow table %d wasn't destroyed, refcount > 1\n", @@ -713,15 +713,15 @@ static int mlx5_destroy_flow_table(struct mlx5_flow_table *ft) return 0; } -static void mlx5_destroy_flow_group(struct mlx5_flow_group *fg) +void mlx5_destroy_flow_group(struct mlx5_flow_group *fg) { if (tree_remove_node(&fg->node)) mlx5_core_warn(get_dev(&fg->node), "Flow group %d wasn't destroyed, refcount > 1\n", fg->id); } -static struct mlx5_flow_namespace *mlx5_get_flow_namespace(struct mlx5_core_dev *dev, - enum mlx5_flow_namespace_type type) +struct mlx5_flow_namespace *mlx5_get_flow_namespace(struct mlx5_core_dev *dev, + enum mlx5_flow_namespace_type type) { struct mlx5_flow_root_namespace *root_ns = dev->priv.root_ns; int prio; @@ -867,7 +867,7 @@ static struct mlx5_flow_root_namespace *create_root_ns(struct 
mlx5_core_dev *dev struct mlx5_flow_root_namespace *root_ns; struct mlx5_flow_namespace *ns; - /* create the root namespace */ + /* Create the root namespace */ root_ns = mlx5_vzalloc(sizeof(*root_ns)); if (!root_ns) return NULL; @@ -1018,7 +1018,7 @@ static int init_fdb_root_ns(struct mlx5_core_dev *dev) if (!dev->priv.fdb_root_ns) return -ENOMEM; - /* create 1 prio*/ + /* Create single prio */ prio = fs_create_prio(&dev->priv.fdb_root_ns->ns, 0, 1, 0); if (IS_ERR(prio)) { cleanup_single_prio_root_ns(dev, dev->priv.fdb_root_ns); diff --git a/drivers/net/ethernet/mellanox/mlx5/core/main.c b/drivers/net/ethernet/mellanox/mlx5/core/main.c index c6de3240f76f..789882b7b711 100644 --- a/drivers/net/ethernet/mellanox/mlx5/core/main.c +++ b/drivers/net/ethernet/mellanox/mlx5/core/main.c @@ -49,6 +49,7 @@ #include #include #include "mlx5_core.h" +#include "fs_core.h" #ifdef CONFIG_MLX5_CORE_EN #include "eswitch.h" #endif @@ -1055,6 +1056,11 @@ static int mlx5_load_one(struct mlx5_core_dev *dev, struct mlx5_priv *priv) mlx5_init_srq_table(dev); mlx5_init_mr_table(dev); + err = mlx5_init_fs(dev); + if (err) { + dev_err(&pdev->dev, "Failed to init flow steering\n"); + goto err_fs; + } #ifdef CONFIG_MLX5_CORE_EN err = mlx5_eswitch_init(dev); if (err) { @@ -1093,6 +1099,8 @@ err_sriov: mlx5_eswitch_cleanup(dev->priv.eswitch); #endif err_reg_dev: + mlx5_cleanup_fs(dev); +err_fs: mlx5_cleanup_mr_table(dev); mlx5_cleanup_srq_table(dev); mlx5_cleanup_qp_table(dev); @@ -1165,6 +1173,7 @@ static int mlx5_unload_one(struct mlx5_core_dev *dev, struct mlx5_priv *priv) mlx5_eswitch_cleanup(dev->priv.eswitch); #endif + mlx5_cleanup_fs(dev); mlx5_cleanup_mr_table(dev); mlx5_cleanup_srq_table(dev); mlx5_cleanup_qp_table(dev); diff --git a/include/linux/mlx5/flow_table.h b/include/linux/mlx5/flow_table.h deleted file mode 100644 index 0f2a15cf3317..000000000000 --- a/include/linux/mlx5/flow_table.h +++ /dev/null @@ -1,63 +0,0 @@ -/* - * Copyright (c) 2013-2015, Mellanox Technologies, Ltd. All rights reserved. - * - * This software is available to you under a choice of one of two - * licenses. You may choose to be licensed under the terms of the GNU - * General Public License (GPL) Version 2, available from the file - * COPYING in the main directory of this source tree, or the - * OpenIB.org BSD license below: - * - * Redistribution and use in source and binary forms, with or - * without modification, are permitted provided that the following - * conditions are met: - * - * - Redistributions of source code must retain the above - * copyright notice, this list of conditions and the following - * disclaimer. - * - * - Redistributions in binary form must reproduce the above - * copyright notice, this list of conditions and the following - * disclaimer in the documentation and/or other materials - * provided with the distribution. - * - * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, - * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF - * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND - * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS - * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN - * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN - * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE - * SOFTWARE. 
- */ - -#ifndef MLX5_FLOW_TABLE_H -#define MLX5_FLOW_TABLE_H - -#include - -struct mlx5_flow_table_group { - u8 log_sz; - u8 match_criteria_enable; - u32 match_criteria[MLX5_ST_SZ_DW(fte_match_param)]; -}; - -struct mlx5_flow_destination { - enum mlx5_flow_destination_type type; - union { - u32 tir_num; - void *ft; - u32 vport_num; - }; -}; - -void *mlx5_create_flow_table(struct mlx5_core_dev *dev, u8 level, u8 table_type, - u16 num_groups, - struct mlx5_flow_table_group *group); -void mlx5_destroy_flow_table(void *flow_table); -int mlx5_add_flow_table_entry(void *flow_table, u8 match_criteria_enable, - void *match_criteria, void *flow_context, - u32 *flow_index); -void mlx5_del_flow_table_entry(void *flow_table, u32 flow_index); -u32 mlx5_get_flow_table_id(void *flow_table); - -#endif /* MLX5_FLOW_TABLE_H */ diff --git a/include/linux/mlx5/fs.h b/include/linux/mlx5/fs.h index 16ae5233dc7b..bc7ad019afde 100644 --- a/include/linux/mlx5/fs.h +++ b/include/linux/mlx5/fs.h @@ -33,6 +33,7 @@ #ifndef _MLX5_FS_ #define _MLX5_FS_ +#include #include #define MLX5_FS_DEFAULT_FLOW_TAG 0x0 @@ -43,6 +44,9 @@ enum mlx5_flow_namespace_type { }; struct mlx5_flow_table; +struct mlx5_flow_group; +struct mlx5_flow_rule; +struct mlx5_flow_namespace; struct mlx5_flow_destination { enum mlx5_flow_destination_type type; @@ -52,4 +56,38 @@ struct mlx5_flow_destination { u32 vport_num; }; }; + +struct mlx5_flow_namespace * +mlx5_get_flow_namespace(struct mlx5_core_dev *dev, + enum mlx5_flow_namespace_type type); + +struct mlx5_flow_table * +mlx5_create_flow_table(struct mlx5_flow_namespace *ns, + int prio, + int num_flow_table_entries); +int mlx5_destroy_flow_table(struct mlx5_flow_table *ft); + +/* inbox should be set with the following values: + * start_flow_index + * end_flow_index + * match_criteria_enable + * match_criteria + */ +struct mlx5_flow_group * +mlx5_create_flow_group(struct mlx5_flow_table *ft, u32 *in); +void mlx5_destroy_flow_group(struct mlx5_flow_group *fg); + +/* Single destination per rule. + * Group ID is implied by the match criteria. + */ +struct mlx5_flow_rule * +mlx5_add_flow_rule(struct mlx5_flow_table *ft, + u8 match_criteria_enable, + u32 *match_criteria, + u32 *match_value, + u32 action, + u32 flow_tag, + struct mlx5_flow_destination *dest); +void mlx5_del_flow_rule(struct mlx5_flow_rule *fr); + #endif -- cgit v1.2.3-71-gd317
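The new flow steering API carves each table into flow groups by filling a create_flow_group_in inbox, as __mlx5e_create_vlan_groups() does above and as the comment in fs.h spells out (start_flow_index, end_flow_index, match_criteria_enable, match_criteria). A minimal sketch of one such group, mirroring the VID group of the VLAN table; names prefixed example_ are illustrative, everything else appears in the patch:

#include <linux/err.h>
#include <linux/mlx5/driver.h>
#include <linux/mlx5/mlx5_ifc.h>
#include <linux/mlx5/fs.h>

/* One flow group covering entries [start, start + size - 1] that
 * matches on VLAN-tag presence plus the first VLAN id.
 */
static struct mlx5_flow_group *
example_vid_group(struct mlx5_flow_table *ft, int start, int size)
{
        int inlen = MLX5_ST_SZ_BYTES(create_flow_group_in);
        struct mlx5_flow_group *fg;
        void *mc;
        u32 *in;

        in = mlx5_vzalloc(inlen);
        if (!in)
                return ERR_PTR(-ENOMEM);

        /* The four inbox fields the fs.h comment requires. */
        MLX5_SET(create_flow_group_in, in, match_criteria_enable,
                 MLX5_MATCH_OUTER_HEADERS);
        mc = MLX5_ADDR_OF(create_flow_group_in, in, match_criteria);
        MLX5_SET_TO_ONES(fte_match_param, mc, outer_headers.vlan_tag);
        MLX5_SET_TO_ONES(fte_match_param, mc, outer_headers.first_vid);
        MLX5_SET(create_flow_group_in, in, start_flow_index, start);
        MLX5_SET(create_flow_group_in, in, end_flow_index, start + size - 1);

        fg = mlx5_create_flow_group(ft, in);
        kvfree(in);
        return fg;
}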
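mlx5e_create_vlan_flow_table() and mlx5e_destroy_flow_table() above fix a lifecycle shape that every consumer of the new API repeats: namespace lookup, table creation, group creation, and teardown in strict reverse order. A condensed sketch of that sequence, reusing example_vid_group() from the previous sketch (same includes; the sizes follow the MLX5E_VLAN_GROUP* defines in the patch):

static int example_vlan_table(struct mlx5_core_dev *mdev)
{
        struct mlx5_flow_namespace *ns;
        struct mlx5_flow_table *ft;
        struct mlx5_flow_group *fg;
        int err;

        ns = mlx5_get_flow_namespace(mdev, MLX5_FLOW_NAMESPACE_KERNEL);
        if (!ns)
                return -EINVAL;

        /* prio 0; room for the 4096-entry VID group plus the 2-entry
         * "any vlan / untagged" group, as in MLX5E_VLAN_TABLE_SIZE.
         */
        ft = mlx5_create_flow_table(ns, 0, BIT(12) + BIT(1));
        if (IS_ERR(ft))
                return PTR_ERR(ft);

        fg = example_vid_group(ft, 0, BIT(12));
        if (IS_ERR(fg)) {
                err = PTR_ERR(fg);
                mlx5_destroy_flow_table(ft);
                return err;
        }

        /* ... create the remaining group(s), add rules ... */

        /* Teardown mirrors creation in reverse order. */
        mlx5_destroy_flow_group(fg);
        mlx5_destroy_flow_table(ft);
        return 0;
}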
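fs.h above documents mlx5_add_flow_rule() as taking a single destination per rule, with the group implied by the match criteria. A sketch of installing and removing a DMAC-steering rule in the spirit of esw_fdb_set_vport_rule(); the MLX5_FLOW_CONTEXT_ACTION_FWD_DEST and MLX5_FLOW_DESTINATION_TYPE_VPORT constants are assumed from the mlx5_ifc.h of this era, everything else appears in the patch:

#include <linux/etherdevice.h>
#include <linux/slab.h>
#include <linux/mlx5/mlx5_ifc.h>
#include <linux/mlx5/fs.h>

static int example_fwd_to_vport(struct mlx5_flow_table *fdb,
                                const u8 mac[ETH_ALEN], u32 vport)
{
        struct mlx5_flow_destination dest;
        struct mlx5_flow_rule *rule;
        u32 *mc, *mv;
        u8 *dmac;
        int err = 0;

        mc = kzalloc(MLX5_ST_SZ_BYTES(fte_match_param), GFP_KERNEL);
        mv = kzalloc(MLX5_ST_SZ_BYTES(fte_match_param), GFP_KERNEL);
        if (!mc || !mv) {
                err = -ENOMEM;
                goto out;
        }

        /* Criteria: full 48-bit DMAC mask; value: the MAC to steer. */
        dmac = MLX5_ADDR_OF(fte_match_param, mc, outer_headers.dmac_47_16);
        eth_broadcast_addr(dmac);
        dmac = MLX5_ADDR_OF(fte_match_param, mv, outer_headers.dmac_47_16);
        ether_addr_copy(dmac, mac);

        dest.type = MLX5_FLOW_DESTINATION_TYPE_VPORT;   /* assumed constant */
        dest.vport_num = vport;

        rule = mlx5_add_flow_rule(fdb, MLX5_MATCH_OUTER_HEADERS, mc, mv,
                                  MLX5_FLOW_CONTEXT_ACTION_FWD_DEST, /* assumed */
                                  MLX5_FS_DEFAULT_FLOW_TAG, &dest);
        if (IS_ERR(rule)) {
                err = PTR_ERR(rule);
                goto out;
        }

        mlx5_del_flow_rule(rule);       /* one call removes the rule */
out:
        kfree(mc);
        kfree(mv);
        return err;
}

Note how the destination no longer rides along on deletion: with rules refcounted inside fs_core.c, mlx5_del_flow_rule() needs only the rule pointer, which is why the eswitch.c hunks above drop the vport argument.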
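The deleted flow_table.c managed flow-entry slots with a per-table bitmap (alloc_flow_index()/mlx5_free_flow_index()); fs_core.c now keeps its own accounting, but the pattern itself is a useful idiom. A standalone sketch of the same find_next_zero_bit() allocator, under illustrative names:

#include <linux/bitmap.h>
#include <linux/errno.h>
#include <linux/mutex.h>

struct example_fte_pool {
        struct mutex lock;      /* serializes index allocation */
        unsigned long *bitmap;  /* one bit per flow table entry */
        u32 start;              /* first index owned by the group */
        u32 size;               /* number of entries in the group */
};

static int example_alloc_fte_index(struct example_fte_pool *p, u32 *ix)
{
        int err = 0;

        mutex_lock(&p->lock);
        *ix = find_next_zero_bit(p->bitmap, p->start + p->size, p->start);
        if (*ix >= p->start + p->size)
                err = -ENOSPC;
        else
                __set_bit(*ix, p->bitmap);
        mutex_unlock(&p->lock);

        return err;
}

static void example_free_fte_index(struct example_fte_pool *p, u32 ix)
{
        __clear_bit(ix, p->bitmap);
}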