From 8b0a9d42301e45d501d751074a6f767fded680b1 Mon Sep 17 00:00:00 2001
From: "Michael S. Tsirkin" <mst@redhat.com>
Date: Mon, 29 Jun 2015 11:16:25 +0200
Subject: virtio_net: document VIRTIO_NET_CTRL_GUEST_OFFLOADS

Document VIRTIO_NET_CTRL_GUEST_OFFLOADS and the
relevant feature bits.

Will allow ethtool control of the offloads down the road.

Reported-by: Yan Vugenfirer <yan@daynix.com>
Signed-off-by: Michael S. Tsirkin <mst@redhat.com>
---
 include/uapi/linux/virtio_net.h | 16 ++++++++++++++++
 1 file changed, 16 insertions(+)

(limited to 'include')

diff --git a/include/uapi/linux/virtio_net.h b/include/uapi/linux/virtio_net.h
index 7bbee79ca293..ec32293a00db 100644
--- a/include/uapi/linux/virtio_net.h
+++ b/include/uapi/linux/virtio_net.h
@@ -34,6 +34,7 @@
 /* The feature bitmap for virtio net */
 #define VIRTIO_NET_F_CSUM	0	/* Host handles pkts w/ partial csum */
 #define VIRTIO_NET_F_GUEST_CSUM	1	/* Guest handles pkts w/ partial csum */
+#define VIRTIO_NET_F_CTRL_GUEST_OFFLOADS 2 /* Dynamic offload configuration. */
 #define VIRTIO_NET_F_MAC	5	/* Host has given MAC address. */
 #define VIRTIO_NET_F_GUEST_TSO4	7	/* Guest can handle TSOv4 in. */
 #define VIRTIO_NET_F_GUEST_TSO6	8	/* Guest can handle TSOv6 in. */
@@ -226,4 +227,19 @@ struct virtio_net_ctrl_mq {
  #define VIRTIO_NET_CTRL_MQ_VQ_PAIRS_MIN        1
  #define VIRTIO_NET_CTRL_MQ_VQ_PAIRS_MAX        0x8000
 
+/*
+ * Control network offloads
+ *
+ * Reconfigures the network offloads that Guest can handle.
+ *
+ * Available with the VIRTIO_NET_F_CTRL_GUEST_OFFLOADS feature bit.
+ *
+ * Command data format matches the feature bit mask exactly.
+ *
+ * See VIRTIO_NET_F_GUEST_* for the list of offloads
+ * that can be enabled/disabled.
+ */
+#define VIRTIO_NET_CTRL_GUEST_OFFLOADS   5
+#define VIRTIO_NET_CTRL_GUEST_OFFLOADS_SET        0
+
 #endif /* _LINUX_VIRTIO_NET_H */
-- 
cgit v1.2.3-71-gd317


From 9c3a473220dda073603a24e93d3015322bda950a Mon Sep 17 00:00:00 2001
From: Vinod Koul <vinod.koul@intel.com>
Date: Mon, 29 Jun 2015 17:36:45 +0100
Subject: ASoC: topology: fix typos in topology header

Signed-off-by: Vinod Koul <vinod.koul@intel.com>
Signed-off-by: Mark Brown <broonie@kernel.org>
---
 include/uapi/sound/asoc.h | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

(limited to 'include')

diff --git a/include/uapi/sound/asoc.h b/include/uapi/sound/asoc.h
index 12215205ab8d..785c5ca0994b 100644
--- a/include/uapi/sound/asoc.h
+++ b/include/uapi/sound/asoc.h
@@ -110,7 +110,7 @@
 
 /*
  * Block Header.
- * This header preceeds all object and object arrays below.
+ * This header precedes all object and object arrays below.
  */
 struct snd_soc_tplg_hdr {
 	__le32 magic;		/* magic number */
@@ -222,7 +222,7 @@ struct snd_soc_tplg_stream_config {
 /*
  * Manifest. List totals for each payload type. Not used in parsing, but will
  * be passed to the component driver before any other objects in order for any
- * global componnent resource allocations.
+ * global component resource allocations.
  *
  * File block representation for manifest :-
  * +-----------------------------------+----+
-- 
cgit v1.2.3-71-gd317


From d768f32aec8c0ebb8499ffca89cfed8f5f1a4432 Mon Sep 17 00:00:00 2001
From: Thomas Huth <thuth@redhat.com>
Date: Thu, 2 Jul 2015 09:21:22 +0200
Subject: virtio: Fix typecast of pointer in vring_init()

The virtio_ring.h header is used in userspace programs (ie. QEMU),
too. Here we can not assume that sizeof(pointer) is the same as
sizeof(long), e.g. when compiling for Windows, so the typecast in
vring_init() should be done with (uintptr_t) instead of (unsigned long).

Signed-off-by: Thomas Huth <thuth@redhat.com>
Signed-off-by: Michael S. Tsirkin <mst@redhat.com>
---
 include/uapi/linux/virtio_ring.h | 5 ++++-
 1 file changed, 4 insertions(+), 1 deletion(-)

(limited to 'include')

diff --git a/include/uapi/linux/virtio_ring.h b/include/uapi/linux/virtio_ring.h
index 915980ac68df..c07295969b7e 100644
--- a/include/uapi/linux/virtio_ring.h
+++ b/include/uapi/linux/virtio_ring.h
@@ -31,6 +31,9 @@
  * SUCH DAMAGE.
  *
  * Copyright Rusty Russell IBM Corporation 2007. */
+#ifndef __KERNEL__
+#include <stdint.h>
+#endif
 #include <linux/types.h>
 #include <linux/virtio_types.h>
 
@@ -143,7 +146,7 @@ static inline void vring_init(struct vring *vr, unsigned int num, void *p,
 	vr->num = num;
 	vr->desc = p;
 	vr->avail = p + num*sizeof(struct vring_desc);
-	vr->used = (void *)(((unsigned long)&vr->avail->ring[num] + sizeof(__virtio16)
+	vr->used = (void *)(((uintptr_t)&vr->avail->ring[num] + sizeof(__virtio16)
 		+ align-1) & ~(align - 1));
 }
 
-- 
cgit v1.2.3-71-gd317


From 3121bb023e2db4f00ed6678898c09e35ed4b5301 Mon Sep 17 00:00:00 2001
From: "Michael S. Tsirkin" <mst@redhat.com>
Date: Thu, 2 Jul 2015 10:56:49 +0200
Subject: virtio: define virtio_pci_cfg_cap in header.

We already have VIRTIO_PCI_CAP_PCI_CFG, let's define the structure that
goes with it.

Signed-off-by: Michael S. Tsirkin <mst@redhat.com>
---
 include/uapi/linux/virtio_pci.h | 6 ++++++
 1 file changed, 6 insertions(+)

(limited to 'include')

diff --git a/include/uapi/linux/virtio_pci.h b/include/uapi/linux/virtio_pci.h
index 75301468359f..90007a1abcab 100644
--- a/include/uapi/linux/virtio_pci.h
+++ b/include/uapi/linux/virtio_pci.h
@@ -157,6 +157,12 @@ struct virtio_pci_common_cfg {
 	__le32 queue_used_hi;		/* read-write */
 };
 
+/* Fields in VIRTIO_PCI_CAP_PCI_CFG: */
+struct virtio_pci_cfg_cap {
+	struct virtio_pci_cap cap;
+	__u8 pci_cfg_data[4]; /* Data for BAR access. */
+};
+
 /* Macro versions of offsets for the Old Timers! */
 #define VIRTIO_PCI_CAP_VNDR		0
 #define VIRTIO_PCI_CAP_NEXT		1
-- 
cgit v1.2.3-71-gd317


From 5f867db63473f32cce1b868e281ebd42a41f8fad Mon Sep 17 00:00:00 2001
From: Scott Wood <scottwood@freescale.com>
Date: Fri, 26 Jun 2015 19:43:58 -0500
Subject: mtd: nand: Fix NAND_USE_BOUNCE_BUFFER flag conflict

Commit 66507c7bc8895f0da6b ("mtd: nand: Add support to use nand_base
poi databuf as bounce buffer") added a flag NAND_USE_BOUNCE_BUFFER
using the same bit value as the existing NAND_BUSWIDTH_AUTO.

Cc: Kamal Dasu <kdasu.kdev@gmail.com>
Fixes: 66507c7bc8895f0da6b ("mtd: nand: Add support to use nand_base
	poi databuf as bounce buffer")
Signed-off-by: Scott Wood <scottwood@freescale.com>
Signed-off-by: Brian Norris <computersforpeace@gmail.com>
---
 include/linux/mtd/nand.h | 10 +++++-----
 1 file changed, 5 insertions(+), 5 deletions(-)

(limited to 'include')

diff --git a/include/linux/mtd/nand.h b/include/linux/mtd/nand.h
index f25e2bdd188c..272f42952f34 100644
--- a/include/linux/mtd/nand.h
+++ b/include/linux/mtd/nand.h
@@ -177,11 +177,6 @@ typedef enum {
 #define NAND_OWN_BUFFERS	0x00020000
 /* Chip may not exist, so silence any errors in scan */
 #define NAND_SCAN_SILENT_NODEV	0x00040000
-/*
- * This option could be defined by controller drivers to protect against
- * kmap'ed, vmalloc'ed highmem buffers being passed from upper layers
- */
-#define NAND_USE_BOUNCE_BUFFER	0x00080000
 /*
  * Autodetect nand buswidth with readid/onfi.
  * This suppose the driver will configure the hardware in 8 bits mode
@@ -189,6 +184,11 @@ typedef enum {
  * before calling nand_scan_tail.
  */
 #define NAND_BUSWIDTH_AUTO      0x00080000
+/*
+ * This option could be defined by controller drivers to protect against
+ * kmap'ed, vmalloc'ed highmem buffers being passed from upper layers
+ */
+#define NAND_USE_BOUNCE_BUFFER	0x00100000
 
 /* Options set by nand scan */
 /* Nand scan has allocated controller struct */
-- 
cgit v1.2.3-71-gd317


From c9ddbac9c89110f77cb0fa07e634aaf1194899aa Mon Sep 17 00:00:00 2001
From: "Michael S. Tsirkin" <mst@redhat.com>
Date: Tue, 14 Jul 2015 18:27:46 -0500
Subject: PCI: Restore PCI_MSIX_FLAGS_BIRMASK definition

09a2c73ddfc7 ("PCI: Remove unused PCI_MSIX_FLAGS_BIRMASK definition")
removed PCI_MSIX_FLAGS_BIRMASK from an exported header because it was
unused in the kernel.  But that breaks user programs that were using it
(QEMU in particular).

Restore the PCI_MSIX_FLAGS_BIRMASK definition.

[bhelgaas: changelog]
Signed-off-by: Michael S. Tsirkin <mst@redhat.com>
Signed-off-by: Bjorn Helgaas <bhelgaas@google.com>
CC: stable@vger.kernel.org	# v3.13+
---
 include/uapi/linux/pci_regs.h | 1 +
 1 file changed, 1 insertion(+)

(limited to 'include')

diff --git a/include/uapi/linux/pci_regs.h b/include/uapi/linux/pci_regs.h
index efe3443572ba..413417f3707b 100644
--- a/include/uapi/linux/pci_regs.h
+++ b/include/uapi/linux/pci_regs.h
@@ -319,6 +319,7 @@
 #define PCI_MSIX_PBA		8	/* Pending Bit Array offset */
 #define  PCI_MSIX_PBA_BIR	0x00000007 /* BAR index */
 #define  PCI_MSIX_PBA_OFFSET	0xfffffff8 /* Offset into specified BAR */
+#define PCI_MSIX_FLAGS_BIRMASK	PCI_MSIX_PBA_BIR /* deprecated */
 #define PCI_CAP_MSIX_SIZEOF	12	/* size of MSIX registers */
 
 /* MSI-X Table entry format */
-- 
cgit v1.2.3-71-gd317


From 4c62360d7562a20c996836d163259c87d9378120 Mon Sep 17 00:00:00 2001
From: "Luck, Tony" <tony.luck@intel.com>
Date: Tue, 30 Jun 2015 15:57:51 -0700
Subject: efi: Handle memory error structures produced based on old versions of
 standard

The memory error record structure includes as its first field a
bitmask of which subsequent fields are valid. The allows new fields
to be added to the structure while keeping compatibility with older
software that parses these records. This mechanism was used between
versions 2.2 and 2.3 to add four new fields, growing the size of the
structure from 73 bytes to 80. But Linux just added all the new
fields so this test:
	if (gdata->error_data_length >= sizeof(*mem_err))
		cper_print_mem(newpfx, mem_err);
	else
		goto err_section_too_small;
now make Linux complain about old format records being too short.

Add a definition for the old format of the structure and use that
for the minimum size check. Pass the actual size to cper_print_mem()
so it can sanity check the validation_bits field to ensure that if
a BIOS using the old format sets bits as if it were new, we won't
access fields beyond the end of the structure.

Signed-off-by: Tony Luck <tony.luck@intel.com>
Cc: <stable@vger.kernel.org>
Signed-off-by: Matt Fleming <matt.fleming@intel.com>
---
 drivers/firmware/efi/cper.c | 15 ++++++++++++---
 include/linux/cper.h        | 22 +++++++++++++++++++++-
 2 files changed, 33 insertions(+), 4 deletions(-)

(limited to 'include')

diff --git a/drivers/firmware/efi/cper.c b/drivers/firmware/efi/cper.c
index 4fd9961d552e..d42537425438 100644
--- a/drivers/firmware/efi/cper.c
+++ b/drivers/firmware/efi/cper.c
@@ -305,10 +305,17 @@ const char *cper_mem_err_unpack(struct trace_seq *p,
 	return ret;
 }
 
-static void cper_print_mem(const char *pfx, const struct cper_sec_mem_err *mem)
+static void cper_print_mem(const char *pfx, const struct cper_sec_mem_err *mem,
+	int len)
 {
 	struct cper_mem_err_compact cmem;
 
+	/* Don't trust UEFI 2.1/2.2 structure with bad validation bits */
+	if (len == sizeof(struct cper_sec_mem_err_old) &&
+	    (mem->validation_bits & ~(CPER_MEM_VALID_RANK_NUMBER - 1))) {
+		pr_err(FW_WARN "valid bits set for fields beyond structure\n");
+		return;
+	}
 	if (mem->validation_bits & CPER_MEM_VALID_ERROR_STATUS)
 		printk("%s""error_status: 0x%016llx\n", pfx, mem->error_status);
 	if (mem->validation_bits & CPER_MEM_VALID_PA)
@@ -405,8 +412,10 @@ static void cper_estatus_print_section(
 	} else if (!uuid_le_cmp(*sec_type, CPER_SEC_PLATFORM_MEM)) {
 		struct cper_sec_mem_err *mem_err = (void *)(gdata + 1);
 		printk("%s""section_type: memory error\n", newpfx);
-		if (gdata->error_data_length >= sizeof(*mem_err))
-			cper_print_mem(newpfx, mem_err);
+		if (gdata->error_data_length >=
+		    sizeof(struct cper_sec_mem_err_old))
+			cper_print_mem(newpfx, mem_err,
+				       gdata->error_data_length);
 		else
 			goto err_section_too_small;
 	} else if (!uuid_le_cmp(*sec_type, CPER_SEC_PCIE)) {
diff --git a/include/linux/cper.h b/include/linux/cper.h
index 76abba4b238e..dcacb1a72e26 100644
--- a/include/linux/cper.h
+++ b/include/linux/cper.h
@@ -340,7 +340,27 @@ struct cper_ia_proc_ctx {
 	__u64	mm_reg_addr;
 };
 
-/* Memory Error Section */
+/* Old Memory Error Section UEFI 2.1, 2.2 */
+struct cper_sec_mem_err_old {
+	__u64	validation_bits;
+	__u64	error_status;
+	__u64	physical_addr;
+	__u64	physical_addr_mask;
+	__u16	node;
+	__u16	card;
+	__u16	module;
+	__u16	bank;
+	__u16	device;
+	__u16	row;
+	__u16	column;
+	__u16	bit_pos;
+	__u64	requestor_id;
+	__u64	responder_id;
+	__u64	target_id;
+	__u8	error_type;
+};
+
+/* Memory Error Section UEFI >= 2.3 */
 struct cper_sec_mem_err {
 	__u64	validation_bits;
 	__u64	error_status;
-- 
cgit v1.2.3-71-gd317


From 71d126fd28de2d4d9b7b2088dbccd7ca62fad6e0 Mon Sep 17 00:00:00 2001
From: Arne Fitzenreiter <arne_f@ipfire.org>
Date: Wed, 15 Jul 2015 13:54:36 +0200
Subject: libata: add ATA_HORKAGE_NOTRIM

Some devices lose data on TRIM whether queued or not.  This patch adds
a horkage to disable TRIM.

tj: Collapsed unnecessary if() nesting.

Signed-off-by: Arne Fitzenreiter <arne_f@ipfire.org>
Signed-off-by: Tejun Heo <tj@kernel.org>
Cc: stable@vger.kernel.org
---
 drivers/ata/libata-scsi.c      | 3 ++-
 drivers/ata/libata-transport.c | 2 ++
 include/linux/libata.h         | 2 ++
 3 files changed, 6 insertions(+), 1 deletion(-)

(limited to 'include')

diff --git a/drivers/ata/libata-scsi.c b/drivers/ata/libata-scsi.c
index 3131adcc1f87..641a61a59e89 100644
--- a/drivers/ata/libata-scsi.c
+++ b/drivers/ata/libata-scsi.c
@@ -2568,7 +2568,8 @@ static unsigned int ata_scsiop_read_cap(struct ata_scsi_args *args, u8 *rbuf)
 		rbuf[14] = (lowest_aligned >> 8) & 0x3f;
 		rbuf[15] = lowest_aligned;
 
-		if (ata_id_has_trim(args->id)) {
+		if (ata_id_has_trim(args->id) &&
+		    !(dev->horkage & ATA_HORKAGE_NOTRIM)) {
 			rbuf[14] |= 0x80; /* LBPME */
 
 			if (ata_id_has_zero_after_trim(args->id) &&
diff --git a/drivers/ata/libata-transport.c b/drivers/ata/libata-transport.c
index d6c37bcd416d..e2d94972962d 100644
--- a/drivers/ata/libata-transport.c
+++ b/drivers/ata/libata-transport.c
@@ -569,6 +569,8 @@ show_ata_dev_trim(struct device *dev,
 
 	if (!ata_id_has_trim(ata_dev->id))
 		mode = "unsupported";
+	else if (ata_dev->horkage & ATA_HORKAGE_NOTRIM)
+		mode = "forced_unsupported";
 	else if (ata_dev->horkage & ATA_HORKAGE_NO_NCQ_TRIM)
 			mode = "forced_unqueued";
 	else if (ata_fpdma_dsm_supported(ata_dev))
diff --git a/include/linux/libata.h b/include/linux/libata.h
index 36ce37bcc963..5c8bac6225a6 100644
--- a/include/linux/libata.h
+++ b/include/linux/libata.h
@@ -431,6 +431,8 @@ enum {
 	ATA_HORKAGE_WD_BROKEN_LPM = (1 << 21),	/* some WDs have broken LPM */
 	ATA_HORKAGE_ZERO_AFTER_TRIM = (1 << 22),/* guarantees zero after trim */
 	ATA_HORKAGE_NO_NCQ_LOG	= (1 << 23),	/* don't use NCQ for log read */
+	ATA_HORKAGE_NOTRIM	= (1 << 24),	/* don't use TRIM */
+
 
 	 /* DMA mask for user DMA control: User visible values; DO NOT
 	    renumber */
-- 
cgit v1.2.3-71-gd317


From af34d637637eabaf49406eb35c948cd51ba262a6 Mon Sep 17 00:00:00 2001
From: David Milburn <dmilburn@redhat.com>
Date: Mon, 13 Jul 2015 11:48:23 -0500
Subject: libata: add ATA_HORKAGE_MAX_SEC_1024 to revert back to previous
 max_sectors limit

Since no longer limiting max_sectors to BLK_DEF_MAX_SECTORS (commit 34b48db66e08),
data corruption may occur on ST380013AS drive configured on 82801JI (ICH10 Family)
SATA controller. This patch will allow the driver to limit max_sectors as before

 # cat /sys/block/sdb/queue/max_sectors_kb
 512

I was able to double the max_sectors_kb value up to 16384 on linux-4.2.0-rc2
before seeing corruption, but seems safer to use previous limit. Without this
patch max_sectors_kb will be 32767.

tj: Minor comment update.

Reported-by: Jeff Moyer <jmoyer@redhat.com>
Signed-off-by: David Milburn <dmilburn@redhat.com>
Signed-off-by: Tejun Heo <tj@kernel.org>
Cc: stable@vger.kernel.org # v3.19 and later
Fixes: 34b48db66e08 ("block: remove artifical max_hw_sectors cap")
---
 drivers/ata/libata-core.c | 10 ++++++++++
 include/linux/ata.h       |  1 +
 include/linux/libata.h    |  2 +-
 3 files changed, 12 insertions(+), 1 deletion(-)

(limited to 'include')

diff --git a/drivers/ata/libata-core.c b/drivers/ata/libata-core.c
index ed2b218ea64d..68202a8a3a0b 100644
--- a/drivers/ata/libata-core.c
+++ b/drivers/ata/libata-core.c
@@ -2478,6 +2478,10 @@ int ata_dev_configure(struct ata_device *dev)
 		dev->max_sectors = min_t(unsigned int, ATA_MAX_SECTORS_128,
 					 dev->max_sectors);
 
+	if (dev->horkage & ATA_HORKAGE_MAX_SEC_1024)
+		dev->max_sectors = min_t(unsigned int, ATA_MAX_SECTORS_1024,
+					 dev->max_sectors);
+
 	if (dev->horkage & ATA_HORKAGE_MAX_SEC_LBA48)
 		dev->max_sectors = ATA_MAX_SECTORS_LBA48;
 
@@ -4146,6 +4150,12 @@ static const struct ata_blacklist_entry ata_device_blacklist [] = {
 	{ "Slimtype DVD A  DS8A8SH", NULL,	ATA_HORKAGE_MAX_SEC_LBA48 },
 	{ "Slimtype DVD A  DS8A9SH", NULL,	ATA_HORKAGE_MAX_SEC_LBA48 },
 
+	/*
+	 * Causes silent data corruption with higher max sects.
+	 * http://lkml.kernel.org/g/x49wpy40ysk.fsf@segfault.boston.devel.redhat.com
+	 */
+	{ "ST380013AS",		"3.20",		ATA_HORKAGE_MAX_SEC_1024 },
+
 	/* Devices we expect to fail diagnostics */
 
 	/* Devices where NCQ should be avoided */
diff --git a/include/linux/ata.h b/include/linux/ata.h
index fed36418dd1c..6c78956aa470 100644
--- a/include/linux/ata.h
+++ b/include/linux/ata.h
@@ -45,6 +45,7 @@ enum {
 	ATA_SECT_SIZE		= 512,
 	ATA_MAX_SECTORS_128	= 128,
 	ATA_MAX_SECTORS		= 256,
+	ATA_MAX_SECTORS_1024    = 1024,
 	ATA_MAX_SECTORS_LBA48	= 65535,/* TODO: 65536? */
 	ATA_MAX_SECTORS_TAPE	= 65535,
 
diff --git a/include/linux/libata.h b/include/linux/libata.h
index 5c8bac6225a6..c9cfbcdb8d14 100644
--- a/include/linux/libata.h
+++ b/include/linux/libata.h
@@ -432,7 +432,7 @@ enum {
 	ATA_HORKAGE_ZERO_AFTER_TRIM = (1 << 22),/* guarantees zero after trim */
 	ATA_HORKAGE_NO_NCQ_LOG	= (1 << 23),	/* don't use NCQ for log read */
 	ATA_HORKAGE_NOTRIM	= (1 << 24),	/* don't use TRIM */
-
+	ATA_HORKAGE_MAX_SEC_1024 = (1 << 25),	/* Limit max sects to 1024 */
 
 	 /* DMA mask for user DMA control: User visible values; DO NOT
 	    renumber */
-- 
cgit v1.2.3-71-gd317


From 03645a11a570d52e70631838cb786eb4253eb463 Mon Sep 17 00:00:00 2001
From: Eric Dumazet <edumazet@google.com>
Date: Tue, 14 Jul 2015 08:10:22 +0200
Subject: ipv6: lock socket in ip6_datagram_connect()

ip6_datagram_connect() is doing a lot of socket changes without
socket being locked.

This looks wrong, at least for udp_lib_rehash() which could corrupt
lists because of concurrent udp_sk(sk)->udp_portaddr_hash accesses.

Signed-off-by: Eric Dumazet <edumazet@google.com>
Acked-by: Herbert Xu <herbert@gondor.apana.org.au>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 include/net/ip.h    |  1 +
 net/ipv4/datagram.c | 16 ++++++++++++----
 net/ipv6/datagram.c | 20 +++++++++++++++-----
 3 files changed, 28 insertions(+), 9 deletions(-)

(limited to 'include')

diff --git a/include/net/ip.h b/include/net/ip.h
index 0750a186ea63..d5fe9f2ab699 100644
--- a/include/net/ip.h
+++ b/include/net/ip.h
@@ -161,6 +161,7 @@ static inline __u8 get_rtconn_flags(struct ipcm_cookie* ipc, struct sock* sk)
 }
 
 /* datagram.c */
+int __ip4_datagram_connect(struct sock *sk, struct sockaddr *uaddr, int addr_len);
 int ip4_datagram_connect(struct sock *sk, struct sockaddr *uaddr, int addr_len);
 
 void ip4_datagram_release_cb(struct sock *sk);
diff --git a/net/ipv4/datagram.c b/net/ipv4/datagram.c
index 90c0e8386116..574fad9cca05 100644
--- a/net/ipv4/datagram.c
+++ b/net/ipv4/datagram.c
@@ -20,7 +20,7 @@
 #include <net/route.h>
 #include <net/tcp_states.h>
 
-int ip4_datagram_connect(struct sock *sk, struct sockaddr *uaddr, int addr_len)
+int __ip4_datagram_connect(struct sock *sk, struct sockaddr *uaddr, int addr_len)
 {
 	struct inet_sock *inet = inet_sk(sk);
 	struct sockaddr_in *usin = (struct sockaddr_in *) uaddr;
@@ -39,8 +39,6 @@ int ip4_datagram_connect(struct sock *sk, struct sockaddr *uaddr, int addr_len)
 
 	sk_dst_reset(sk);
 
-	lock_sock(sk);
-
 	oif = sk->sk_bound_dev_if;
 	saddr = inet->inet_saddr;
 	if (ipv4_is_multicast(usin->sin_addr.s_addr)) {
@@ -82,9 +80,19 @@ int ip4_datagram_connect(struct sock *sk, struct sockaddr *uaddr, int addr_len)
 	sk_dst_set(sk, &rt->dst);
 	err = 0;
 out:
-	release_sock(sk);
 	return err;
 }
+EXPORT_SYMBOL(__ip4_datagram_connect);
+
+int ip4_datagram_connect(struct sock *sk, struct sockaddr *uaddr, int addr_len)
+{
+	int res;
+
+	lock_sock(sk);
+	res = __ip4_datagram_connect(sk, uaddr, addr_len);
+	release_sock(sk);
+	return res;
+}
 EXPORT_SYMBOL(ip4_datagram_connect);
 
 /* Because UDP xmit path can manipulate sk_dst_cache without holding
diff --git a/net/ipv6/datagram.c b/net/ipv6/datagram.c
index 62d908e64eeb..b10a88986a98 100644
--- a/net/ipv6/datagram.c
+++ b/net/ipv6/datagram.c
@@ -40,7 +40,7 @@ static bool ipv6_mapped_addr_any(const struct in6_addr *a)
 	return ipv6_addr_v4mapped(a) && (a->s6_addr32[3] == 0);
 }
 
-int ip6_datagram_connect(struct sock *sk, struct sockaddr *uaddr, int addr_len)
+static int __ip6_datagram_connect(struct sock *sk, struct sockaddr *uaddr, int addr_len)
 {
 	struct sockaddr_in6	*usin = (struct sockaddr_in6 *) uaddr;
 	struct inet_sock	*inet = inet_sk(sk);
@@ -56,7 +56,7 @@ int ip6_datagram_connect(struct sock *sk, struct sockaddr *uaddr, int addr_len)
 	if (usin->sin6_family == AF_INET) {
 		if (__ipv6_only_sock(sk))
 			return -EAFNOSUPPORT;
-		err = ip4_datagram_connect(sk, uaddr, addr_len);
+		err = __ip4_datagram_connect(sk, uaddr, addr_len);
 		goto ipv4_connected;
 	}
 
@@ -98,9 +98,9 @@ int ip6_datagram_connect(struct sock *sk, struct sockaddr *uaddr, int addr_len)
 		sin.sin_addr.s_addr = daddr->s6_addr32[3];
 		sin.sin_port = usin->sin6_port;
 
-		err = ip4_datagram_connect(sk,
-					   (struct sockaddr *) &sin,
-					   sizeof(sin));
+		err = __ip4_datagram_connect(sk,
+					     (struct sockaddr *) &sin,
+					     sizeof(sin));
 
 ipv4_connected:
 		if (err)
@@ -204,6 +204,16 @@ out:
 	fl6_sock_release(flowlabel);
 	return err;
 }
+
+int ip6_datagram_connect(struct sock *sk, struct sockaddr *uaddr, int addr_len)
+{
+	int res;
+
+	lock_sock(sk);
+	res = __ip6_datagram_connect(sk, uaddr, addr_len);
+	release_sock(sk);
+	return res;
+}
 EXPORT_SYMBOL_GPL(ip6_datagram_connect);
 
 int ip6_datagram_connect_v6_only(struct sock *sk, struct sockaddr *uaddr,
-- 
cgit v1.2.3-71-gd317


From 923b352f19d9ea971ae2536eab55f5fc9e95fedf Mon Sep 17 00:00:00 2001
From: Arik Nemtsov <arik@wizery.com>
Date: Wed, 8 Jul 2015 15:41:44 +0300
Subject: cfg80211: use RTNL locked reg_can_beacon for IR-relaxation

The RTNL is required to check for IR-relaxation conditions that allow
more channels to beacon. Export an RTNL locked version of reg_can_beacon
and use it where possible in AP/STA interface type flows, where
IR-relaxation may be applicable.

Fixes: 06f207fc5418 ("cfg80211: change GO_CONCURRENT to IR_CONCURRENT for STA")
Signed-off-by: Arik Nemtsov <arikx.nemtsov@intel.com>
Signed-off-by: Emmanuel Grumbach <emmanuel.grumbach@intel.com>
Signed-off-by: Johannes Berg <johannes.berg@intel.com>
---
 include/net/cfg80211.h | 17 +++++++++++++++++
 net/mac80211/tdls.c    |  6 +++---
 net/wireless/chan.c    | 45 ++++++++++++++++++++++++++++++++++-----------
 net/wireless/nl80211.c | 14 ++++++++------
 net/wireless/reg.c     |  2 +-
 net/wireless/trace.h   | 11 +++++++----
 6 files changed, 70 insertions(+), 25 deletions(-)

(limited to 'include')

diff --git a/include/net/cfg80211.h b/include/net/cfg80211.h
index a741678f24a2..883fe1e7c5a1 100644
--- a/include/net/cfg80211.h
+++ b/include/net/cfg80211.h
@@ -4868,6 +4868,23 @@ bool cfg80211_reg_can_beacon(struct wiphy *wiphy,
 			     struct cfg80211_chan_def *chandef,
 			     enum nl80211_iftype iftype);
 
+/**
+ * cfg80211_reg_can_beacon_relax - check if beaconing is allowed with relaxation
+ * @wiphy: the wiphy
+ * @chandef: the channel definition
+ * @iftype: interface type
+ *
+ * Return: %true if there is no secondary channel or the secondary channel(s)
+ * can be used for beaconing (i.e. is not a radar channel etc.). This version
+ * also checks if IR-relaxation conditions apply, to allow beaconing under
+ * more permissive conditions.
+ *
+ * Requires the RTNL to be held.
+ */
+bool cfg80211_reg_can_beacon_relax(struct wiphy *wiphy,
+				   struct cfg80211_chan_def *chandef,
+				   enum nl80211_iftype iftype);
+
 /*
  * cfg80211_ch_switch_notify - update wdev channel and notify userspace
  * @dev: the device which switched channels
diff --git a/net/mac80211/tdls.c b/net/mac80211/tdls.c
index ad31b2dab4f5..8db6e2994bbc 100644
--- a/net/mac80211/tdls.c
+++ b/net/mac80211/tdls.c
@@ -60,6 +60,7 @@ ieee80211_tdls_add_subband(struct ieee80211_sub_if_data *sdata,
 	struct ieee80211_channel *ch;
 	struct cfg80211_chan_def chandef;
 	int i, subband_start;
+	struct wiphy *wiphy = sdata->local->hw.wiphy;
 
 	for (i = start; i <= end; i += spacing) {
 		if (!ch_cnt)
@@ -70,9 +71,8 @@ ieee80211_tdls_add_subband(struct ieee80211_sub_if_data *sdata,
 			/* we will be active on the channel */
 			cfg80211_chandef_create(&chandef, ch,
 						NL80211_CHAN_NO_HT);
-			if (cfg80211_reg_can_beacon(sdata->local->hw.wiphy,
-						    &chandef,
-						    sdata->wdev.iftype)) {
+			if (cfg80211_reg_can_beacon_relax(wiphy, &chandef,
+							  sdata->wdev.iftype)) {
 				ch_cnt++;
 				/*
 				 * check if the next channel is also part of
diff --git a/net/wireless/chan.c b/net/wireless/chan.c
index 915b328b9ac5..59cabc9bce69 100644
--- a/net/wireless/chan.c
+++ b/net/wireless/chan.c
@@ -797,23 +797,18 @@ static bool cfg80211_ir_permissive_chan(struct wiphy *wiphy,
 	return false;
 }
 
-bool cfg80211_reg_can_beacon(struct wiphy *wiphy,
-			     struct cfg80211_chan_def *chandef,
-			     enum nl80211_iftype iftype)
+static bool _cfg80211_reg_can_beacon(struct wiphy *wiphy,
+				     struct cfg80211_chan_def *chandef,
+				     enum nl80211_iftype iftype,
+				     bool check_no_ir)
 {
 	bool res;
 	u32 prohibited_flags = IEEE80211_CHAN_DISABLED |
 			       IEEE80211_CHAN_RADAR;
 
-	trace_cfg80211_reg_can_beacon(wiphy, chandef, iftype);
+	trace_cfg80211_reg_can_beacon(wiphy, chandef, iftype, check_no_ir);
 
-	/*
-	 * Under certain conditions suggested by some regulatory bodies a
-	 * GO/STA can IR on channels marked with IEEE80211_NO_IR. Set this flag
-	 * only if such relaxations are not enabled and the conditions are not
-	 * met.
-	 */
-	if (!cfg80211_ir_permissive_chan(wiphy, iftype, chandef->chan))
+	if (check_no_ir)
 		prohibited_flags |= IEEE80211_CHAN_NO_IR;
 
 	if (cfg80211_chandef_dfs_required(wiphy, chandef, iftype) > 0 &&
@@ -827,8 +822,36 @@ bool cfg80211_reg_can_beacon(struct wiphy *wiphy,
 	trace_cfg80211_return_bool(res);
 	return res;
 }
+
+bool cfg80211_reg_can_beacon(struct wiphy *wiphy,
+			     struct cfg80211_chan_def *chandef,
+			     enum nl80211_iftype iftype)
+{
+	return _cfg80211_reg_can_beacon(wiphy, chandef, iftype, true);
+}
 EXPORT_SYMBOL(cfg80211_reg_can_beacon);
 
+bool cfg80211_reg_can_beacon_relax(struct wiphy *wiphy,
+				   struct cfg80211_chan_def *chandef,
+				   enum nl80211_iftype iftype)
+{
+	bool check_no_ir;
+
+	ASSERT_RTNL();
+
+	/*
+	 * Under certain conditions suggested by some regulatory bodies a
+	 * GO/STA can IR on channels marked with IEEE80211_NO_IR. Set this flag
+	 * only if such relaxations are not enabled and the conditions are not
+	 * met.
+	 */
+	check_no_ir = !cfg80211_ir_permissive_chan(wiphy, iftype,
+						   chandef->chan);
+
+	return _cfg80211_reg_can_beacon(wiphy, chandef, iftype, check_no_ir);
+}
+EXPORT_SYMBOL(cfg80211_reg_can_beacon_relax);
+
 int cfg80211_set_monitor_channel(struct cfg80211_registered_device *rdev,
 				 struct cfg80211_chan_def *chandef)
 {
diff --git a/net/wireless/nl80211.c b/net/wireless/nl80211.c
index c264effd00a6..76b41578a838 100644
--- a/net/wireless/nl80211.c
+++ b/net/wireless/nl80211.c
@@ -2003,7 +2003,8 @@ static int __nl80211_set_channel(struct cfg80211_registered_device *rdev,
 	switch (iftype) {
 	case NL80211_IFTYPE_AP:
 	case NL80211_IFTYPE_P2P_GO:
-		if (!cfg80211_reg_can_beacon(&rdev->wiphy, &chandef, iftype)) {
+		if (!cfg80211_reg_can_beacon_relax(&rdev->wiphy, &chandef,
+						   iftype)) {
 			result = -EINVAL;
 			break;
 		}
@@ -3403,8 +3404,8 @@ static int nl80211_start_ap(struct sk_buff *skb, struct genl_info *info)
 	} else if (!nl80211_get_ap_channel(rdev, &params))
 		return -EINVAL;
 
-	if (!cfg80211_reg_can_beacon(&rdev->wiphy, &params.chandef,
-				     wdev->iftype))
+	if (!cfg80211_reg_can_beacon_relax(&rdev->wiphy, &params.chandef,
+					   wdev->iftype))
 		return -EINVAL;
 
 	if (info->attrs[NL80211_ATTR_ACL_POLICY]) {
@@ -6492,8 +6493,8 @@ skip_beacons:
 	if (err)
 		return err;
 
-	if (!cfg80211_reg_can_beacon(&rdev->wiphy, &params.chandef,
-				     wdev->iftype))
+	if (!cfg80211_reg_can_beacon_relax(&rdev->wiphy, &params.chandef,
+					   wdev->iftype))
 		return -EINVAL;
 
 	err = cfg80211_chandef_dfs_required(wdev->wiphy,
@@ -10170,7 +10171,8 @@ static int nl80211_tdls_channel_switch(struct sk_buff *skb,
 		return -EINVAL;
 
 	/* we will be active on the TDLS link */
-	if (!cfg80211_reg_can_beacon(&rdev->wiphy, &chandef, wdev->iftype))
+	if (!cfg80211_reg_can_beacon_relax(&rdev->wiphy, &chandef,
+					   wdev->iftype))
 		return -EINVAL;
 
 	/* don't allow switching to DFS channels */
diff --git a/net/wireless/reg.c b/net/wireless/reg.c
index 29134c81e73c..aa2d75482017 100644
--- a/net/wireless/reg.c
+++ b/net/wireless/reg.c
@@ -1589,7 +1589,7 @@ static bool reg_wdev_chan_valid(struct wiphy *wiphy, struct wireless_dev *wdev)
 	case NL80211_IFTYPE_AP:
 	case NL80211_IFTYPE_P2P_GO:
 	case NL80211_IFTYPE_ADHOC:
-		return cfg80211_reg_can_beacon(wiphy, &chandef, iftype);
+		return cfg80211_reg_can_beacon_relax(wiphy, &chandef, iftype);
 	case NL80211_IFTYPE_STATION:
 	case NL80211_IFTYPE_P2P_CLIENT:
 		return cfg80211_chandef_usable(wiphy, &chandef,
diff --git a/net/wireless/trace.h b/net/wireless/trace.h
index af3617c9879e..a808279a432a 100644
--- a/net/wireless/trace.h
+++ b/net/wireless/trace.h
@@ -2358,20 +2358,23 @@ TRACE_EVENT(cfg80211_cqm_rssi_notify,
 
 TRACE_EVENT(cfg80211_reg_can_beacon,
 	TP_PROTO(struct wiphy *wiphy, struct cfg80211_chan_def *chandef,
-		 enum nl80211_iftype iftype),
-	TP_ARGS(wiphy, chandef, iftype),
+		 enum nl80211_iftype iftype, bool check_no_ir),
+	TP_ARGS(wiphy, chandef, iftype, check_no_ir),
 	TP_STRUCT__entry(
 		WIPHY_ENTRY
 		CHAN_DEF_ENTRY
 		__field(enum nl80211_iftype, iftype)
+		__field(bool, check_no_ir)
 	),
 	TP_fast_assign(
 		WIPHY_ASSIGN;
 		CHAN_DEF_ASSIGN(chandef);
 		__entry->iftype = iftype;
+		__entry->check_no_ir = check_no_ir;
 	),
-	TP_printk(WIPHY_PR_FMT ", " CHAN_DEF_PR_FMT ", iftype=%d",
-		  WIPHY_PR_ARG, CHAN_DEF_PR_ARG, __entry->iftype)
+	TP_printk(WIPHY_PR_FMT ", " CHAN_DEF_PR_FMT ", iftype=%d check_no_ir=%s",
+		  WIPHY_PR_ARG, CHAN_DEF_PR_ARG, __entry->iftype,
+		  BOOL_TO_STR(__entry->check_no_ir))
 );
 
 TRACE_EVENT(cfg80211_chandef_dfs_required,
-- 
cgit v1.2.3-71-gd317


From 0838aa7fcfcd875caa7bcc5dab0c3fd40444553d Mon Sep 17 00:00:00 2001
From: Pablo Neira Ayuso <pablo@netfilter.org>
Date: Mon, 13 Jul 2015 15:11:48 +0200
Subject: netfilter: fix netns dependencies with conntrack templates

Quoting Daniel Borkmann:

"When adding connection tracking template rules to a netns, f.e. to
configure netfilter zones, the kernel will endlessly busy-loop as soon
as we try to delete the given netns in case there's at least one
template present, which is problematic i.e. if there is such bravery that
the priviledged user inside the netns is assumed untrusted.

Minimal example:

  ip netns add foo
  ip netns exec foo iptables -t raw -A PREROUTING -d 1.2.3.4 -j CT --zone 1
  ip netns del foo

What happens is that when nf_ct_iterate_cleanup() is being called from
nf_conntrack_cleanup_net_list() for a provided netns, we always end up
with a net->ct.count > 0 and thus jump back to i_see_dead_people. We
don't get a soft-lockup as we still have a schedule() point, but the
serving CPU spins on 100% from that point onwards.

Since templates are normally allocated with nf_conntrack_alloc(), we
also bump net->ct.count. The issue why they are not yet nf_ct_put() is
because the per netns .exit() handler from x_tables (which would eventually
invoke xt_CT's xt_ct_tg_destroy() that drops reference on info->ct) is
called in the dependency chain at a *later* point in time than the per
netns .exit() handler for the connection tracker.

This is clearly a chicken'n'egg problem: after the connection tracker
.exit() handler, we've teared down all the connection tracking
infrastructure already, so rightfully, xt_ct_tg_destroy() cannot be
invoked at a later point in time during the netns cleanup, as that would
lead to a use-after-free. At the same time, we cannot make x_tables depend
on the connection tracker module, so that the xt_ct_tg_destroy() would
be invoked earlier in the cleanup chain."

Daniel confirms this has to do with the order in which modules are loaded or
having compiled nf_conntrack as modules while x_tables built-in. So we have no
guarantees regarding the order in which netns callbacks are executed.

Fix this by allocating the templates through kmalloc() from the respective
SYNPROXY and CT targets, so they don't depend on the conntrack kmem cache.
Then, release then via nf_ct_tmpl_free() from destroy_conntrack(). This branch
is marked as unlikely since conntrack templates are rarely allocated and only
from the configuration plane path.

Note that templates are not kept in any list to avoid further dependencies with
nf_conntrack anymore, thus, the tmpl larval list is removed.

Reported-by: Daniel Borkmann <daniel@iogearbox.net>
Signed-off-by: Pablo Neira Ayuso <pablo@netfilter.org>
Tested-by: Daniel Borkmann <daniel@iogearbox.net>
---
 include/net/netfilter/nf_conntrack.h |  2 +-
 include/net/netns/conntrack.h        |  1 -
 net/netfilter/nf_conntrack_core.c    | 67 +++++++++++++++++++++++-------------
 net/netfilter/nf_synproxy_core.c     |  7 ++--
 net/netfilter/xt_CT.c                |  8 ++---
 5 files changed, 51 insertions(+), 34 deletions(-)

(limited to 'include')

diff --git a/include/net/netfilter/nf_conntrack.h b/include/net/netfilter/nf_conntrack.h
index 095433b8a8b0..37cd3911d5c5 100644
--- a/include/net/netfilter/nf_conntrack.h
+++ b/include/net/netfilter/nf_conntrack.h
@@ -291,7 +291,7 @@ extern unsigned int nf_conntrack_max;
 extern unsigned int nf_conntrack_hash_rnd;
 void init_nf_conntrack_hash_rnd(void);
 
-void nf_conntrack_tmpl_insert(struct net *net, struct nf_conn *tmpl);
+struct nf_conn *nf_ct_tmpl_alloc(struct net *net, u16 zone, gfp_t flags);
 
 #define NF_CT_STAT_INC(net, count)	  __this_cpu_inc((net)->ct.stat->count)
 #define NF_CT_STAT_INC_ATOMIC(net, count) this_cpu_inc((net)->ct.stat->count)
diff --git a/include/net/netns/conntrack.h b/include/net/netns/conntrack.h
index 29d6a94db54d..723b61c82b3f 100644
--- a/include/net/netns/conntrack.h
+++ b/include/net/netns/conntrack.h
@@ -68,7 +68,6 @@ struct ct_pcpu {
 	spinlock_t		lock;
 	struct hlist_nulls_head unconfirmed;
 	struct hlist_nulls_head dying;
-	struct hlist_nulls_head tmpl;
 };
 
 struct netns_ct {
diff --git a/net/netfilter/nf_conntrack_core.c b/net/netfilter/nf_conntrack_core.c
index 13fad8668f83..651039ad1681 100644
--- a/net/netfilter/nf_conntrack_core.c
+++ b/net/netfilter/nf_conntrack_core.c
@@ -287,6 +287,46 @@ static void nf_ct_del_from_dying_or_unconfirmed_list(struct nf_conn *ct)
 	spin_unlock(&pcpu->lock);
 }
 
+/* Released via destroy_conntrack() */
+struct nf_conn *nf_ct_tmpl_alloc(struct net *net, u16 zone, gfp_t flags)
+{
+	struct nf_conn *tmpl;
+
+	tmpl = kzalloc(sizeof(struct nf_conn), GFP_KERNEL);
+	if (tmpl == NULL)
+		return NULL;
+
+	tmpl->status = IPS_TEMPLATE;
+	write_pnet(&tmpl->ct_net, net);
+
+#ifdef CONFIG_NF_CONNTRACK_ZONES
+	if (zone) {
+		struct nf_conntrack_zone *nf_ct_zone;
+
+		nf_ct_zone = nf_ct_ext_add(tmpl, NF_CT_EXT_ZONE, GFP_ATOMIC);
+		if (!nf_ct_zone)
+			goto out_free;
+		nf_ct_zone->id = zone;
+	}
+#endif
+	atomic_set(&tmpl->ct_general.use, 0);
+
+	return tmpl;
+#ifdef CONFIG_NF_CONNTRACK_ZONES
+out_free:
+	kfree(tmpl);
+	return NULL;
+#endif
+}
+EXPORT_SYMBOL_GPL(nf_ct_tmpl_alloc);
+
+static void nf_ct_tmpl_free(struct nf_conn *tmpl)
+{
+	nf_ct_ext_destroy(tmpl);
+	nf_ct_ext_free(tmpl);
+	kfree(tmpl);
+}
+
 static void
 destroy_conntrack(struct nf_conntrack *nfct)
 {
@@ -298,6 +338,10 @@ destroy_conntrack(struct nf_conntrack *nfct)
 	NF_CT_ASSERT(atomic_read(&nfct->use) == 0);
 	NF_CT_ASSERT(!timer_pending(&ct->timeout));
 
+	if (unlikely(nf_ct_is_template(ct))) {
+		nf_ct_tmpl_free(ct);
+		return;
+	}
 	rcu_read_lock();
 	l4proto = __nf_ct_l4proto_find(nf_ct_l3num(ct), nf_ct_protonum(ct));
 	if (l4proto && l4proto->destroy)
@@ -540,28 +584,6 @@ out:
 }
 EXPORT_SYMBOL_GPL(nf_conntrack_hash_check_insert);
 
-/* deletion from this larval template list happens via nf_ct_put() */
-void nf_conntrack_tmpl_insert(struct net *net, struct nf_conn *tmpl)
-{
-	struct ct_pcpu *pcpu;
-
-	__set_bit(IPS_TEMPLATE_BIT, &tmpl->status);
-	__set_bit(IPS_CONFIRMED_BIT, &tmpl->status);
-	nf_conntrack_get(&tmpl->ct_general);
-
-	/* add this conntrack to the (per cpu) tmpl list */
-	local_bh_disable();
-	tmpl->cpu = smp_processor_id();
-	pcpu = per_cpu_ptr(nf_ct_net(tmpl)->ct.pcpu_lists, tmpl->cpu);
-
-	spin_lock(&pcpu->lock);
-	/* Overload tuple linked list to put us in template list. */
-	hlist_nulls_add_head_rcu(&tmpl->tuplehash[IP_CT_DIR_ORIGINAL].hnnode,
-				 &pcpu->tmpl);
-	spin_unlock_bh(&pcpu->lock);
-}
-EXPORT_SYMBOL_GPL(nf_conntrack_tmpl_insert);
-
 /* Confirm a connection given skb; places it in hash table */
 int
 __nf_conntrack_confirm(struct sk_buff *skb)
@@ -1751,7 +1773,6 @@ int nf_conntrack_init_net(struct net *net)
 		spin_lock_init(&pcpu->lock);
 		INIT_HLIST_NULLS_HEAD(&pcpu->unconfirmed, UNCONFIRMED_NULLS_VAL);
 		INIT_HLIST_NULLS_HEAD(&pcpu->dying, DYING_NULLS_VAL);
-		INIT_HLIST_NULLS_HEAD(&pcpu->tmpl, TEMPLATE_NULLS_VAL);
 	}
 
 	net->ct.stat = alloc_percpu(struct ip_conntrack_stat);
diff --git a/net/netfilter/nf_synproxy_core.c b/net/netfilter/nf_synproxy_core.c
index 789feeae6c44..71f1e9fdfa18 100644
--- a/net/netfilter/nf_synproxy_core.c
+++ b/net/netfilter/nf_synproxy_core.c
@@ -349,12 +349,10 @@ static void __net_exit synproxy_proc_exit(struct net *net)
 static int __net_init synproxy_net_init(struct net *net)
 {
 	struct synproxy_net *snet = synproxy_pernet(net);
-	struct nf_conntrack_tuple t;
 	struct nf_conn *ct;
 	int err = -ENOMEM;
 
-	memset(&t, 0, sizeof(t));
-	ct = nf_conntrack_alloc(net, 0, &t, &t, GFP_KERNEL);
+	ct = nf_ct_tmpl_alloc(net, 0, GFP_KERNEL);
 	if (IS_ERR(ct)) {
 		err = PTR_ERR(ct);
 		goto err1;
@@ -365,7 +363,8 @@ static int __net_init synproxy_net_init(struct net *net)
 	if (!nfct_synproxy_ext_add(ct))
 		goto err2;
 
-	nf_conntrack_tmpl_insert(net, ct);
+	__set_bit(IPS_CONFIRMED_BIT, &ct->status);
+	nf_conntrack_get(&ct->ct_general);
 	snet->tmpl = ct;
 
 	snet->stats = alloc_percpu(struct synproxy_stats);
diff --git a/net/netfilter/xt_CT.c b/net/netfilter/xt_CT.c
index 75747aecdebe..c6630030c912 100644
--- a/net/netfilter/xt_CT.c
+++ b/net/netfilter/xt_CT.c
@@ -184,7 +184,6 @@ out:
 static int xt_ct_tg_check(const struct xt_tgchk_param *par,
 			  struct xt_ct_target_info_v1 *info)
 {
-	struct nf_conntrack_tuple t;
 	struct nf_conn *ct;
 	int ret = -EOPNOTSUPP;
 
@@ -202,8 +201,7 @@ static int xt_ct_tg_check(const struct xt_tgchk_param *par,
 	if (ret < 0)
 		goto err1;
 
-	memset(&t, 0, sizeof(t));
-	ct = nf_conntrack_alloc(par->net, info->zone, &t, &t, GFP_KERNEL);
+	ct = nf_ct_tmpl_alloc(par->net, info->zone, GFP_KERNEL);
 	ret = PTR_ERR(ct);
 	if (IS_ERR(ct))
 		goto err2;
@@ -227,8 +225,8 @@ static int xt_ct_tg_check(const struct xt_tgchk_param *par,
 		if (ret < 0)
 			goto err3;
 	}
-
-	nf_conntrack_tmpl_insert(par->net, ct);
+	__set_bit(IPS_CONFIRMED_BIT, &ct->status);
+	nf_conntrack_get(&ct->ct_general);
 out:
 	info->ct = ct;
 	return 0;
-- 
cgit v1.2.3-71-gd317


From 648a9bc5308d952f2c80772301b339f73026f013 Mon Sep 17 00:00:00 2001
From: Chris Wilson <chris@chris-wilson.co.uk>
Date: Thu, 16 Jul 2015 12:37:56 +0100
Subject: drm/i915: Use two 32bit reads for select 64bit REG_READ ioctls
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

Since the hardware sometimes mysteriously totally flummoxes the 64bit
read of a 64bit register when read using a single instruction, split the
read into two instructions. Since the read here is of automatically
incrementing timestamp counters, we also have to be very careful in
order to make sure that it does not increment between the two
instructions.

However, since userspace tried to workaround this issue and so enshrined
this ABI for a broken hardware read and in the process neglected that
the read only fails in some environments, we have to introduce a new
uABI flag for userspace to request the 2x32 bit accurate read of the
timestamp.

v2: Fix alignment check and include details of the workaround for
userspace.

Reported-by: Karol Herbst <freedesktop@karolherbst.de>
Bugzilla: https://bugs.freedesktop.org/show_bug.cgi?id=91317
Testcase: igt/gem_reg_read
Signed-off-by: Chris Wilson <chris@chris-wilson.co.uk>
Cc: Michał Winiarski <michal.winiarski@intel.com>
Cc: stable@vger.kernel.org
Tested-by: Michał Winiarski <michal.winiarski@intel.com>
Signed-off-by: Daniel Vetter <daniel.vetter@ffwll.ch>
---
 drivers/gpu/drm/i915/intel_uncore.c | 26 +++++++++++++++++++-------
 include/uapi/drm/i915_drm.h         |  8 ++++++++
 2 files changed, 27 insertions(+), 7 deletions(-)

(limited to 'include')

diff --git a/drivers/gpu/drm/i915/intel_uncore.c b/drivers/gpu/drm/i915/intel_uncore.c
index a6d8a3ee7750..260389acfb77 100644
--- a/drivers/gpu/drm/i915/intel_uncore.c
+++ b/drivers/gpu/drm/i915/intel_uncore.c
@@ -1274,10 +1274,12 @@ int i915_reg_read_ioctl(struct drm_device *dev,
 	struct drm_i915_private *dev_priv = dev->dev_private;
 	struct drm_i915_reg_read *reg = data;
 	struct register_whitelist const *entry = whitelist;
+	unsigned size;
+	u64 offset;
 	int i, ret = 0;
 
 	for (i = 0; i < ARRAY_SIZE(whitelist); i++, entry++) {
-		if (entry->offset == reg->offset &&
+		if (entry->offset == (reg->offset & -entry->size) &&
 		    (1 << INTEL_INFO(dev)->gen & entry->gen_bitmask))
 			break;
 	}
@@ -1285,23 +1287,33 @@ int i915_reg_read_ioctl(struct drm_device *dev,
 	if (i == ARRAY_SIZE(whitelist))
 		return -EINVAL;
 
+	/* We use the low bits to encode extra flags as the register should
+	 * be naturally aligned (and those that are not so aligned merely
+	 * limit the available flags for that register).
+	 */
+	offset = entry->offset;
+	size = entry->size;
+	size |= reg->offset ^ offset;
+
 	intel_runtime_pm_get(dev_priv);
 
-	switch (entry->size) {
+	switch (size) {
+	case 8 | 1:
+		reg->val = I915_READ64_2x32(offset, offset+4);
+		break;
 	case 8:
-		reg->val = I915_READ64(reg->offset);
+		reg->val = I915_READ64(offset);
 		break;
 	case 4:
-		reg->val = I915_READ(reg->offset);
+		reg->val = I915_READ(offset);
 		break;
 	case 2:
-		reg->val = I915_READ16(reg->offset);
+		reg->val = I915_READ16(offset);
 		break;
 	case 1:
-		reg->val = I915_READ8(reg->offset);
+		reg->val = I915_READ8(offset);
 		break;
 	default:
-		MISSING_CASE(entry->size);
 		ret = -EINVAL;
 		goto out;
 	}
diff --git a/include/uapi/drm/i915_drm.h b/include/uapi/drm/i915_drm.h
index 6e1a2ed116cb..db809b722985 100644
--- a/include/uapi/drm/i915_drm.h
+++ b/include/uapi/drm/i915_drm.h
@@ -1070,6 +1070,14 @@ struct drm_i915_reg_read {
 	__u64 offset;
 	__u64 val; /* Return value */
 };
+/* Known registers:
+ *
+ * Render engine timestamp - 0x2358 + 64bit - gen7+
+ * - Note this register returns an invalid value if using the default
+ *   single instruction 8byte read, in order to workaround that use
+ *   offset (0x2538 | 1) instead.
+ *
+ */
 
 struct drm_i915_reset_stats {
 	__u32 ctx_id;
-- 
cgit v1.2.3-71-gd317


From cd812599796f500b042f5464b6665755eca21137 Mon Sep 17 00:00:00 2001
From: Trond Myklebust <trond.myklebust@primarydata.com>
Date: Sun, 5 Jul 2015 11:12:07 -0400
Subject: NFS: Remove the "NFS_CAP_CHANGE_ATTR" capability

Setting the change attribute has been mandatory for all NFS versions, since
commit 3a1556e8662c ("NFSv2/v3: Simulate the change attribute"). We should
therefore not have anything be conditional on it being set/unset.

Signed-off-by: Trond Myklebust <trond.myklebust@primarydata.com>
---
 fs/nfs/client.c           | 2 +-
 fs/nfs/inode.c            | 4 ++--
 fs/nfs/nfs4proc.c         | 3 ---
 include/linux/nfs_fs_sb.h | 2 +-
 4 files changed, 4 insertions(+), 7 deletions(-)

(limited to 'include')

diff --git a/fs/nfs/client.c b/fs/nfs/client.c
index ecebb406cc1a..4a90c9bb3135 100644
--- a/fs/nfs/client.c
+++ b/fs/nfs/client.c
@@ -775,7 +775,7 @@ static int nfs_init_server(struct nfs_server *server,
 	server->options = data->options;
 	server->caps |= NFS_CAP_HARDLINKS|NFS_CAP_SYMLINKS|NFS_CAP_FILEID|
 		NFS_CAP_MODE|NFS_CAP_NLINK|NFS_CAP_OWNER|NFS_CAP_OWNER_GROUP|
-		NFS_CAP_ATIME|NFS_CAP_CTIME|NFS_CAP_MTIME|NFS_CAP_CHANGE_ATTR;
+		NFS_CAP_ATIME|NFS_CAP_CTIME|NFS_CAP_MTIME;
 
 	if (data->rsize)
 		server->rsize = nfs_block_size(data->rsize, NULL);
diff --git a/fs/nfs/inode.c b/fs/nfs/inode.c
index 426e4f8207ef..0adc7d245b3d 100644
--- a/fs/nfs/inode.c
+++ b/fs/nfs/inode.c
@@ -442,7 +442,7 @@ nfs_fhget(struct super_block *sb, struct nfs_fh *fh, struct nfs_fattr *fattr, st
 			nfs_set_cache_invalid(inode, NFS_INO_INVALID_ATTR);
 		if (fattr->valid & NFS_ATTR_FATTR_CHANGE)
 			inode->i_version = fattr->change_attr;
-		else if (nfs_server_capable(inode, NFS_CAP_CHANGE_ATTR))
+		else
 			nfs_set_cache_invalid(inode, NFS_INO_INVALID_ATTR
 				| NFS_INO_REVAL_PAGECACHE);
 		if (fattr->valid & NFS_ATTR_FATTR_SIZE)
@@ -1692,7 +1692,7 @@ static int nfs_update_inode(struct inode *inode, struct nfs_fattr *fattr)
 				nfs_force_lookup_revalidate(inode);
 			inode->i_version = fattr->change_attr;
 		}
-	} else if (server->caps & NFS_CAP_CHANGE_ATTR)
+	} else
 		nfsi->cache_validity |= save_cache_validity;
 
 	if (fattr->valid & NFS_ATTR_FATTR_MTIME) {
diff --git a/fs/nfs/nfs4proc.c b/fs/nfs/nfs4proc.c
index 9264994ec9d3..c85ffe67b5f3 100644
--- a/fs/nfs/nfs4proc.c
+++ b/fs/nfs/nfs4proc.c
@@ -8591,7 +8591,6 @@ static const struct nfs4_minor_version_ops nfs_v4_0_minor_ops = {
 	.minor_version = 0,
 	.init_caps = NFS_CAP_READDIRPLUS
 		| NFS_CAP_ATOMIC_OPEN
-		| NFS_CAP_CHANGE_ATTR
 		| NFS_CAP_POSIX_LOCK,
 	.init_client = nfs40_init_client,
 	.shutdown_client = nfs40_shutdown_client,
@@ -8617,7 +8616,6 @@ static const struct nfs4_minor_version_ops nfs_v4_1_minor_ops = {
 	.minor_version = 1,
 	.init_caps = NFS_CAP_READDIRPLUS
 		| NFS_CAP_ATOMIC_OPEN
-		| NFS_CAP_CHANGE_ATTR
 		| NFS_CAP_POSIX_LOCK
 		| NFS_CAP_STATEID_NFSV41
 		| NFS_CAP_ATOMIC_OPEN_V1,
@@ -8640,7 +8638,6 @@ static const struct nfs4_minor_version_ops nfs_v4_2_minor_ops = {
 	.minor_version = 2,
 	.init_caps = NFS_CAP_READDIRPLUS
 		| NFS_CAP_ATOMIC_OPEN
-		| NFS_CAP_CHANGE_ATTR
 		| NFS_CAP_POSIX_LOCK
 		| NFS_CAP_STATEID_NFSV41
 		| NFS_CAP_ATOMIC_OPEN_V1
diff --git a/include/linux/nfs_fs_sb.h b/include/linux/nfs_fs_sb.h
index a2ea1491d3df..20bc8e51b161 100644
--- a/include/linux/nfs_fs_sb.h
+++ b/include/linux/nfs_fs_sb.h
@@ -220,7 +220,7 @@ struct nfs_server {
 #define NFS_CAP_SYMLINKS	(1U << 2)
 #define NFS_CAP_ACLS		(1U << 3)
 #define NFS_CAP_ATOMIC_OPEN	(1U << 4)
-#define NFS_CAP_CHANGE_ATTR	(1U << 5)
+/* #define NFS_CAP_CHANGE_ATTR	(1U << 5) */
 #define NFS_CAP_FILEID		(1U << 6)
 #define NFS_CAP_MODE		(1U << 7)
 #define NFS_CAP_NLINK		(1U << 8)
-- 
cgit v1.2.3-71-gd317


From 115c48d7a5351abeadd0c8a3dc87eca3d66a6475 Mon Sep 17 00:00:00 2001
From: Trond Myklebust <trond.myklebust@primarydata.com>
Date: Sun, 5 Jul 2015 12:36:34 -0400
Subject: NFS: nfs_mark_for_revalidate should always set
 NFS_INO_REVAL_PAGECACHE

I'm not aware of any existing bugs around this, but the expectation is
that nfs_mark_for_revalidate() should always force a revalidation of
the cached metadata.

Signed-off-by: Trond Myklebust <trond.myklebust@primarydata.com>
---
 include/linux/nfs_fs.h | 7 +++++--
 1 file changed, 5 insertions(+), 2 deletions(-)

(limited to 'include')

diff --git a/include/linux/nfs_fs.h b/include/linux/nfs_fs.h
index f91b5ade30c9..874b77228fb9 100644
--- a/include/linux/nfs_fs.h
+++ b/include/linux/nfs_fs.h
@@ -292,9 +292,12 @@ static inline void nfs_mark_for_revalidate(struct inode *inode)
 	struct nfs_inode *nfsi = NFS_I(inode);
 
 	spin_lock(&inode->i_lock);
-	nfsi->cache_validity |= NFS_INO_INVALID_ATTR|NFS_INO_INVALID_ACCESS;
+	nfsi->cache_validity |= NFS_INO_INVALID_ATTR |
+				NFS_INO_REVAL_PAGECACHE |
+				NFS_INO_INVALID_ACCESS |
+				NFS_INO_INVALID_ACL;
 	if (S_ISDIR(inode->i_mode))
-		nfsi->cache_validity |= NFS_INO_REVAL_PAGECACHE|NFS_INO_INVALID_DATA;
+		nfsi->cache_validity |= NFS_INO_INVALID_DATA;
 	spin_unlock(&inode->i_lock);
 }
 
-- 
cgit v1.2.3-71-gd317


From fa92754e9c47cf3e5607c0865f4cf59d090cda37 Mon Sep 17 00:00:00 2001
From: Leo Liu <leo.liu@amd.com>
Date: Mon, 13 Jul 2015 12:46:23 -0400
Subject: drm/amdgpu: add VCE harvesting instance query
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

Signed-off-by: Leo Liu <leo.liu@amd.com>
Reviewed-by: Alex Deucher <alexander.deucher@amd.com>
Reviewed-by: Christian König <christian.koenig@amd.com>
---
 drivers/gpu/drm/amd/amdgpu/amdgpu_kms.c | 1 +
 include/uapi/drm/amdgpu_drm.h           | 2 ++
 2 files changed, 3 insertions(+)

(limited to 'include')

diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_kms.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_kms.c
index 5533434c7a8f..31ad444c6386 100644
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu_kms.c
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_kms.c
@@ -459,6 +459,7 @@ static int amdgpu_info_ioctl(struct drm_device *dev, void *data, struct drm_file
 		memcpy(&dev_info.cu_bitmap[0], &cu_info.bitmap[0], sizeof(cu_info.bitmap));
 		dev_info.vram_type = adev->mc.vram_type;
 		dev_info.vram_bit_width = adev->mc.vram_width;
+		dev_info.vce_harvest_config = adev->vce.harvest_config;
 
 		return copy_to_user(out, &dev_info,
 				    min((size_t)size, sizeof(dev_info))) ? -EFAULT : 0;
diff --git a/include/uapi/drm/amdgpu_drm.h b/include/uapi/drm/amdgpu_drm.h
index b6fce900a833..d708a53b8fb1 100644
--- a/include/uapi/drm/amdgpu_drm.h
+++ b/include/uapi/drm/amdgpu_drm.h
@@ -614,6 +614,8 @@ struct drm_amdgpu_info_device {
 	uint32_t vram_type;
 	/** video memory bit width*/
 	uint32_t vram_bit_width;
+	/* vce harvesting instance */
+	uint32_t vce_harvest_config;
 };
 
 struct drm_amdgpu_info_hw_ip {
-- 
cgit v1.2.3-71-gd317


From a3bd4f989f532694337dd30538b635d5213ab86a Mon Sep 17 00:00:00 2001
From: Dong Aisheng <aisheng.dong@freescale.com>
Date: Wed, 22 Jul 2015 20:53:09 +0800
Subject: mmc: sdhci-esdhc-imx: clear f_max in boarddata

After commit 8d86e4fcccf6 ("mmc: sdhci-esdhc-imx: Call mmc_of_parse()"),
it's not used anymore.

Signed-off-by: Dong Aisheng <aisheng.dong@freescale.com>
Reviewed-by: Johan Derycke <johan.derycke@barco.com>
Signed-off-by: Ulf Hansson <ulf.hansson@linaro.org>
---
 drivers/mmc/host/sdhci-esdhc-imx.c          | 7 +------
 include/linux/platform_data/mmc-esdhc-imx.h | 1 -
 2 files changed, 1 insertion(+), 7 deletions(-)

(limited to 'include')

diff --git a/drivers/mmc/host/sdhci-esdhc-imx.c b/drivers/mmc/host/sdhci-esdhc-imx.c
index 1b0e61847e73..c6b9f6492e1a 100644
--- a/drivers/mmc/host/sdhci-esdhc-imx.c
+++ b/drivers/mmc/host/sdhci-esdhc-imx.c
@@ -581,13 +581,8 @@ static void esdhc_writeb_le(struct sdhci_host *host, u8 val, int reg)
 static unsigned int esdhc_pltfm_get_max_clock(struct sdhci_host *host)
 {
 	struct sdhci_pltfm_host *pltfm_host = sdhci_priv(host);
-	struct pltfm_imx_data *imx_data = pltfm_host->priv;
-	struct esdhc_platform_data *boarddata = &imx_data->boarddata;
 
-	if (boarddata->f_max && (boarddata->f_max < pltfm_host->clock))
-		return boarddata->f_max;
-	else
-		return pltfm_host->clock;
+	return pltfm_host->clock;
 }
 
 static unsigned int esdhc_pltfm_get_min_clock(struct sdhci_host *host)
diff --git a/include/linux/platform_data/mmc-esdhc-imx.h b/include/linux/platform_data/mmc-esdhc-imx.h
index 75f70f6ac137..e1571efa3f2b 100644
--- a/include/linux/platform_data/mmc-esdhc-imx.h
+++ b/include/linux/platform_data/mmc-esdhc-imx.h
@@ -43,7 +43,6 @@ struct esdhc_platform_data {
 	enum wp_types wp_type;
 	enum cd_types cd_type;
 	int max_bus_width;
-	unsigned int f_max;
 	bool support_vsel;
 	unsigned int delay_line;
 };
-- 
cgit v1.2.3-71-gd317


From e3eea1404f5ff7a2ceb7b5e7ba412a6fd94f2935 Mon Sep 17 00:00:00 2001
From: "Steven Rostedt (Red Hat)" <rostedt@goodmis.org>
Date: Fri, 24 Jul 2015 10:38:12 -0400
Subject: ftrace: Fix breakage of set_ftrace_pid

Commit 4104d326b670 ("ftrace: Remove global function list and call function
directly") simplified the ftrace code by removing the global_ops list with a
new design. But this cleanup also broke the filtering of PIDs that are added
to the set_ftrace_pid file.

Add back the proper hooks to have pid filtering working once again.

Cc: stable@vger.kernel.org # 3.16+
Reported-by: Matt Fleming <matt@console-pimps.org>
Reported-by: Richard Weinberger <richard.weinberger@gmail.com>
Tested-by: Matt Fleming <matt@console-pimps.org>
Signed-off-by: Steven Rostedt <rostedt@goodmis.org>
---
 include/linux/ftrace.h |  3 +++
 kernel/trace/ftrace.c  | 52 +++++++++++++++++++++++++++++++++-----------------
 2 files changed, 37 insertions(+), 18 deletions(-)

(limited to 'include')

diff --git a/include/linux/ftrace.h b/include/linux/ftrace.h
index 1da602982cf9..6cd8c0ee4b6f 100644
--- a/include/linux/ftrace.h
+++ b/include/linux/ftrace.h
@@ -116,6 +116,7 @@ ftrace_func_t ftrace_ops_get_func(struct ftrace_ops *ops);
  *            SAVE_REGS. If another ops with this flag set is already registered
  *            for any of the functions that this ops will be registered for, then
  *            this ops will fail to register or set_filter_ip.
+ * PID     - Is affected by set_ftrace_pid (allows filtering on those pids)
  */
 enum {
 	FTRACE_OPS_FL_ENABLED			= 1 << 0,
@@ -132,6 +133,7 @@ enum {
 	FTRACE_OPS_FL_MODIFYING			= 1 << 11,
 	FTRACE_OPS_FL_ALLOC_TRAMP		= 1 << 12,
 	FTRACE_OPS_FL_IPMODIFY			= 1 << 13,
+	FTRACE_OPS_FL_PID			= 1 << 14,
 };
 
 #ifdef CONFIG_DYNAMIC_FTRACE
@@ -159,6 +161,7 @@ struct ftrace_ops {
 	struct ftrace_ops		*next;
 	unsigned long			flags;
 	void				*private;
+	ftrace_func_t			saved_func;
 	int __percpu			*disabled;
 #ifdef CONFIG_DYNAMIC_FTRACE
 	int				nr_trampolines;
diff --git a/kernel/trace/ftrace.c b/kernel/trace/ftrace.c
index 02bece4a99ea..eb11011b5292 100644
--- a/kernel/trace/ftrace.c
+++ b/kernel/trace/ftrace.c
@@ -98,6 +98,13 @@ struct ftrace_pid {
 	struct pid *pid;
 };
 
+static bool ftrace_pids_enabled(void)
+{
+	return !list_empty(&ftrace_pids);
+}
+
+static void ftrace_update_trampoline(struct ftrace_ops *ops);
+
 /*
  * ftrace_disabled is set when an anomaly is discovered.
  * ftrace_disabled is much stronger than ftrace_enabled.
@@ -109,7 +116,6 @@ static DEFINE_MUTEX(ftrace_lock);
 static struct ftrace_ops *ftrace_control_list __read_mostly = &ftrace_list_end;
 static struct ftrace_ops *ftrace_ops_list __read_mostly = &ftrace_list_end;
 ftrace_func_t ftrace_trace_function __read_mostly = ftrace_stub;
-ftrace_func_t ftrace_pid_function __read_mostly = ftrace_stub;
 static struct ftrace_ops global_ops;
 static struct ftrace_ops control_ops;
 
@@ -183,14 +189,7 @@ static void ftrace_pid_func(unsigned long ip, unsigned long parent_ip,
 	if (!test_tsk_trace_trace(current))
 		return;
 
-	ftrace_pid_function(ip, parent_ip, op, regs);
-}
-
-static void set_ftrace_pid_function(ftrace_func_t func)
-{
-	/* do not set ftrace_pid_function to itself! */
-	if (func != ftrace_pid_func)
-		ftrace_pid_function = func;
+	op->saved_func(ip, parent_ip, op, regs);
 }
 
 /**
@@ -202,7 +201,6 @@ static void set_ftrace_pid_function(ftrace_func_t func)
 void clear_ftrace_function(void)
 {
 	ftrace_trace_function = ftrace_stub;
-	ftrace_pid_function = ftrace_stub;
 }
 
 static void control_ops_disable_all(struct ftrace_ops *ops)
@@ -436,6 +434,12 @@ static int __register_ftrace_function(struct ftrace_ops *ops)
 	} else
 		add_ftrace_ops(&ftrace_ops_list, ops);
 
+	/* Always save the function, and reset at unregistering */
+	ops->saved_func = ops->func;
+
+	if (ops->flags & FTRACE_OPS_FL_PID && ftrace_pids_enabled())
+		ops->func = ftrace_pid_func;
+
 	ftrace_update_trampoline(ops);
 
 	if (ftrace_enabled)
@@ -463,15 +467,28 @@ static int __unregister_ftrace_function(struct ftrace_ops *ops)
 	if (ftrace_enabled)
 		update_ftrace_function();
 
+	ops->func = ops->saved_func;
+
 	return 0;
 }
 
 static void ftrace_update_pid_func(void)
 {
+	bool enabled = ftrace_pids_enabled();
+	struct ftrace_ops *op;
+
 	/* Only do something if we are tracing something */
 	if (ftrace_trace_function == ftrace_stub)
 		return;
 
+	do_for_each_ftrace_op(op, ftrace_ops_list) {
+		if (op->flags & FTRACE_OPS_FL_PID) {
+			op->func = enabled ? ftrace_pid_func :
+				op->saved_func;
+			ftrace_update_trampoline(op);
+		}
+	} while_for_each_ftrace_op(op);
+
 	update_ftrace_function();
 }
 
@@ -1133,7 +1150,8 @@ static struct ftrace_ops global_ops = {
 	.local_hash.filter_hash		= EMPTY_HASH,
 	INIT_OPS_HASH(global_ops)
 	.flags				= FTRACE_OPS_FL_RECURSION_SAFE |
-					  FTRACE_OPS_FL_INITIALIZED,
+					  FTRACE_OPS_FL_INITIALIZED |
+					  FTRACE_OPS_FL_PID,
 };
 
 /*
@@ -5023,7 +5041,9 @@ static void ftrace_update_trampoline(struct ftrace_ops *ops)
 
 static struct ftrace_ops global_ops = {
 	.func			= ftrace_stub,
-	.flags			= FTRACE_OPS_FL_RECURSION_SAFE | FTRACE_OPS_FL_INITIALIZED,
+	.flags			= FTRACE_OPS_FL_RECURSION_SAFE |
+				  FTRACE_OPS_FL_INITIALIZED |
+				  FTRACE_OPS_FL_PID,
 };
 
 static int __init ftrace_nodyn_init(void)
@@ -5080,11 +5100,6 @@ void ftrace_init_array_ops(struct trace_array *tr, ftrace_func_t func)
 		if (WARN_ON(tr->ops->func != ftrace_stub))
 			printk("ftrace ops had %pS for function\n",
 			       tr->ops->func);
-		/* Only the top level instance does pid tracing */
-		if (!list_empty(&ftrace_pids)) {
-			set_ftrace_pid_function(func);
-			func = ftrace_pid_func;
-		}
 	}
 	tr->ops->func = func;
 	tr->ops->private = tr;
@@ -5371,7 +5386,7 @@ static void *fpid_start(struct seq_file *m, loff_t *pos)
 {
 	mutex_lock(&ftrace_lock);
 
-	if (list_empty(&ftrace_pids) && (!*pos))
+	if (!ftrace_pids_enabled() && (!*pos))
 		return (void *) 1;
 
 	return seq_list_start(&ftrace_pids, *pos);
@@ -5610,6 +5625,7 @@ static struct ftrace_ops graph_ops = {
 	.func			= ftrace_stub,
 	.flags			= FTRACE_OPS_FL_RECURSION_SAFE |
 				   FTRACE_OPS_FL_INITIALIZED |
+				   FTRACE_OPS_FL_PID |
 				   FTRACE_OPS_FL_STUB,
 #ifdef FTRACE_GRAPH_TRAMP_ADDR
 	.trampoline		= FTRACE_GRAPH_TRAMP_ADDR,
-- 
cgit v1.2.3-71-gd317


From e54198657b65625085834847ab6271087323ffea Mon Sep 17 00:00:00 2001
From: Nicholas Bellinger <nab@linux-iscsi.org>
Date: Wed, 22 Jul 2015 23:14:19 -0700
Subject: iscsi-target: Fix iscsit_start_kthreads failure OOPs

This patch fixes a regression introduced with the following commit
in v4.0-rc1 code, where a iscsit_start_kthreads() failure triggers
a NULL pointer dereference OOPs:

    commit 88dcd2dab5c23b1c9cfc396246d8f476c872f0ca
    Author: Nicholas Bellinger <nab@linux-iscsi.org>
    Date:   Thu Feb 26 22:19:15 2015 -0800

        iscsi-target: Convert iscsi_thread_set usage to kthread.h

To address this bug, move iscsit_start_kthreads() immediately
preceeding the transmit of last login response, before signaling
a successful transition into full-feature-phase within existing
iscsi_target_do_tx_login_io() logic.

This ensures that no target-side resource allocation failures can
occur after the final login response has been successfully sent.

Also, it adds a iscsi_conn->rx_login_comp to allow the RX thread
to sleep to prevent other socket related failures until the final
iscsi_post_login_handler() call is able to complete.

Cc: Sagi Grimberg <sagig@mellanox.com>
Cc: <stable@vger.kernel.org> # v3.10+
Signed-off-by: Nicholas Bellinger <nab@linux-iscsi.org>
---
 drivers/target/iscsi/iscsi_target.c       | 18 ++++++++++---
 drivers/target/iscsi/iscsi_target_login.c | 45 ++++++++++++-------------------
 drivers/target/iscsi/iscsi_target_login.h |  3 ++-
 drivers/target/iscsi/iscsi_target_nego.c  | 34 ++++++++++++++++++++++-
 include/target/iscsi/iscsi_target_core.h  |  1 +
 5 files changed, 68 insertions(+), 33 deletions(-)

(limited to 'include')

diff --git a/drivers/target/iscsi/iscsi_target.c b/drivers/target/iscsi/iscsi_target.c
index afab32376126..202a42858f25 100644
--- a/drivers/target/iscsi/iscsi_target.c
+++ b/drivers/target/iscsi/iscsi_target.c
@@ -3998,7 +3998,13 @@ get_immediate:
 	}
 
 transport_err:
-	iscsit_take_action_for_connection_exit(conn);
+	/*
+	 * Avoid the normal connection failure code-path if this connection
+	 * is still within LOGIN mode, and iscsi_np process context is
+	 * responsible for cleaning up the early connection failure.
+	 */
+	if (conn->conn_state != TARG_CONN_STATE_IN_LOGIN)
+		iscsit_take_action_for_connection_exit(conn);
 out:
 	return 0;
 }
@@ -4082,7 +4088,7 @@ reject:
 
 int iscsi_target_rx_thread(void *arg)
 {
-	int ret;
+	int ret, rc;
 	u8 buffer[ISCSI_HDR_LEN], opcode;
 	u32 checksum = 0, digest = 0;
 	struct iscsi_conn *conn = arg;
@@ -4092,10 +4098,16 @@ int iscsi_target_rx_thread(void *arg)
 	 * connection recovery / failure event can be triggered externally.
 	 */
 	allow_signal(SIGINT);
+	/*
+	 * Wait for iscsi_post_login_handler() to complete before allowing
+	 * incoming iscsi/tcp socket I/O, and/or failing the connection.
+	 */
+	rc = wait_for_completion_interruptible(&conn->rx_login_comp);
+	if (rc < 0)
+		return 0;
 
 	if (conn->conn_transport->transport_type == ISCSI_INFINIBAND) {
 		struct completion comp;
-		int rc;
 
 		init_completion(&comp);
 		rc = wait_for_completion_interruptible(&comp);
diff --git a/drivers/target/iscsi/iscsi_target_login.c b/drivers/target/iscsi/iscsi_target_login.c
index 3d0fe4ff5590..7e8f65e5448f 100644
--- a/drivers/target/iscsi/iscsi_target_login.c
+++ b/drivers/target/iscsi/iscsi_target_login.c
@@ -82,6 +82,7 @@ static struct iscsi_login *iscsi_login_init_conn(struct iscsi_conn *conn)
 	init_completion(&conn->conn_logout_comp);
 	init_completion(&conn->rx_half_close_comp);
 	init_completion(&conn->tx_half_close_comp);
+	init_completion(&conn->rx_login_comp);
 	spin_lock_init(&conn->cmd_lock);
 	spin_lock_init(&conn->conn_usage_lock);
 	spin_lock_init(&conn->immed_queue_lock);
@@ -644,7 +645,7 @@ static void iscsi_post_login_start_timers(struct iscsi_conn *conn)
 		iscsit_start_nopin_timer(conn);
 }
 
-static int iscsit_start_kthreads(struct iscsi_conn *conn)
+int iscsit_start_kthreads(struct iscsi_conn *conn)
 {
 	int ret = 0;
 
@@ -679,6 +680,7 @@ static int iscsit_start_kthreads(struct iscsi_conn *conn)
 
 	return 0;
 out_tx:
+	send_sig(SIGINT, conn->tx_thread, 1);
 	kthread_stop(conn->tx_thread);
 	conn->tx_thread_active = false;
 out_bitmap:
@@ -689,7 +691,7 @@ out_bitmap:
 	return ret;
 }
 
-int iscsi_post_login_handler(
+void iscsi_post_login_handler(
 	struct iscsi_np *np,
 	struct iscsi_conn *conn,
 	u8 zero_tsih)
@@ -699,7 +701,6 @@ int iscsi_post_login_handler(
 	struct se_session *se_sess = sess->se_sess;
 	struct iscsi_portal_group *tpg = sess->tpg;
 	struct se_portal_group *se_tpg = &tpg->tpg_se_tpg;
-	int rc;
 
 	iscsit_inc_conn_usage_count(conn);
 
@@ -739,10 +740,6 @@ int iscsi_post_login_handler(
 			sess->sess_ops->InitiatorName);
 		spin_unlock_bh(&sess->conn_lock);
 
-		rc = iscsit_start_kthreads(conn);
-		if (rc)
-			return rc;
-
 		iscsi_post_login_start_timers(conn);
 		/*
 		 * Determine CPU mask to ensure connection's RX and TX kthreads
@@ -751,15 +748,20 @@ int iscsi_post_login_handler(
 		iscsit_thread_get_cpumask(conn);
 		conn->conn_rx_reset_cpumask = 1;
 		conn->conn_tx_reset_cpumask = 1;
-
+		/*
+		 * Wakeup the sleeping iscsi_target_rx_thread() now that
+		 * iscsi_conn is in TARG_CONN_STATE_LOGGED_IN state.
+		 */
+		complete(&conn->rx_login_comp);
 		iscsit_dec_conn_usage_count(conn);
+
 		if (stop_timer) {
 			spin_lock_bh(&se_tpg->session_lock);
 			iscsit_stop_time2retain_timer(sess);
 			spin_unlock_bh(&se_tpg->session_lock);
 		}
 		iscsit_dec_session_usage_count(sess);
-		return 0;
+		return;
 	}
 
 	iscsi_set_session_parameters(sess->sess_ops, conn->param_list, 1);
@@ -800,10 +802,6 @@ int iscsi_post_login_handler(
 		" iSCSI Target Portal Group: %hu\n", tpg->nsessions, tpg->tpgt);
 	spin_unlock_bh(&se_tpg->session_lock);
 
-	rc = iscsit_start_kthreads(conn);
-	if (rc)
-		return rc;
-
 	iscsi_post_login_start_timers(conn);
 	/*
 	 * Determine CPU mask to ensure connection's RX and TX kthreads
@@ -812,10 +810,12 @@ int iscsi_post_login_handler(
 	iscsit_thread_get_cpumask(conn);
 	conn->conn_rx_reset_cpumask = 1;
 	conn->conn_tx_reset_cpumask = 1;
-
+	/*
+	 * Wakeup the sleeping iscsi_target_rx_thread() now that
+	 * iscsi_conn is in TARG_CONN_STATE_LOGGED_IN state.
+	 */
+	complete(&conn->rx_login_comp);
 	iscsit_dec_conn_usage_count(conn);
-
-	return 0;
 }
 
 static void iscsi_handle_login_thread_timeout(unsigned long data)
@@ -1380,23 +1380,12 @@ static int __iscsi_target_login_thread(struct iscsi_np *np)
 	if (ret < 0)
 		goto new_sess_out;
 
-	if (!conn->sess) {
-		pr_err("struct iscsi_conn session pointer is NULL!\n");
-		goto new_sess_out;
-	}
-
 	iscsi_stop_login_thread_timer(np);
 
-	if (signal_pending(current))
-		goto new_sess_out;
-
 	if (ret == 1) {
 		tpg_np = conn->tpg_np;
 
-		ret = iscsi_post_login_handler(np, conn, zero_tsih);
-		if (ret < 0)
-			goto new_sess_out;
-
+		iscsi_post_login_handler(np, conn, zero_tsih);
 		iscsit_deaccess_np(np, tpg, tpg_np);
 	}
 
diff --git a/drivers/target/iscsi/iscsi_target_login.h b/drivers/target/iscsi/iscsi_target_login.h
index 1c7358081533..57aa0d0fd820 100644
--- a/drivers/target/iscsi/iscsi_target_login.h
+++ b/drivers/target/iscsi/iscsi_target_login.h
@@ -12,7 +12,8 @@ extern int iscsit_accept_np(struct iscsi_np *, struct iscsi_conn *);
 extern int iscsit_get_login_rx(struct iscsi_conn *, struct iscsi_login *);
 extern int iscsit_put_login_tx(struct iscsi_conn *, struct iscsi_login *, u32);
 extern void iscsit_free_conn(struct iscsi_np *, struct iscsi_conn *);
-extern int iscsi_post_login_handler(struct iscsi_np *, struct iscsi_conn *, u8);
+extern int iscsit_start_kthreads(struct iscsi_conn *);
+extern void iscsi_post_login_handler(struct iscsi_np *, struct iscsi_conn *, u8);
 extern void iscsi_target_login_sess_out(struct iscsi_conn *, struct iscsi_np *,
 				bool, bool);
 extern int iscsi_target_login_thread(void *);
diff --git a/drivers/target/iscsi/iscsi_target_nego.c b/drivers/target/iscsi/iscsi_target_nego.c
index 8c02fa34716f..f9cde9141836 100644
--- a/drivers/target/iscsi/iscsi_target_nego.c
+++ b/drivers/target/iscsi/iscsi_target_nego.c
@@ -17,6 +17,7 @@
  ******************************************************************************/
 
 #include <linux/ctype.h>
+#include <linux/kthread.h>
 #include <scsi/iscsi_proto.h>
 #include <target/target_core_base.h>
 #include <target/target_core_fabric.h>
@@ -361,10 +362,24 @@ static int iscsi_target_do_tx_login_io(struct iscsi_conn *conn, struct iscsi_log
 		ntohl(login_rsp->statsn), login->rsp_length);
 
 	padding = ((-login->rsp_length) & 3);
+	/*
+	 * Before sending the last login response containing the transition
+	 * bit for full-feature-phase, go ahead and start up TX/RX threads
+	 * now to avoid potential resource allocation failures after the
+	 * final login response has been sent.
+	 */
+	if (login->login_complete) {
+		int rc = iscsit_start_kthreads(conn);
+		if (rc) {
+			iscsit_tx_login_rsp(conn, ISCSI_STATUS_CLS_TARGET_ERR,
+					    ISCSI_LOGIN_STATUS_NO_RESOURCES);
+			return -1;
+		}
+	}
 
 	if (conn->conn_transport->iscsit_put_login_tx(conn, login,
 					login->rsp_length + padding) < 0)
-		return -1;
+		goto err;
 
 	login->rsp_length		= 0;
 	mutex_lock(&sess->cmdsn_mutex);
@@ -373,6 +388,23 @@ static int iscsi_target_do_tx_login_io(struct iscsi_conn *conn, struct iscsi_log
 	mutex_unlock(&sess->cmdsn_mutex);
 
 	return 0;
+
+err:
+	if (login->login_complete) {
+		if (conn->rx_thread && conn->rx_thread_active) {
+			send_sig(SIGINT, conn->rx_thread, 1);
+			kthread_stop(conn->rx_thread);
+		}
+		if (conn->tx_thread && conn->tx_thread_active) {
+			send_sig(SIGINT, conn->tx_thread, 1);
+			kthread_stop(conn->tx_thread);
+		}
+		spin_lock(&iscsit_global->ts_bitmap_lock);
+		bitmap_release_region(iscsit_global->ts_bitmap, conn->bitmap_id,
+				      get_order(1));
+		spin_unlock(&iscsit_global->ts_bitmap_lock);
+	}
+	return -1;
 }
 
 static void iscsi_target_sk_data_ready(struct sock *sk)
diff --git a/include/target/iscsi/iscsi_target_core.h b/include/target/iscsi/iscsi_target_core.h
index 34117b8b72e4..0aedbb2c10e0 100644
--- a/include/target/iscsi/iscsi_target_core.h
+++ b/include/target/iscsi/iscsi_target_core.h
@@ -595,6 +595,7 @@ struct iscsi_conn {
 	int			bitmap_id;
 	int			rx_thread_active;
 	struct task_struct	*rx_thread;
+	struct completion	rx_login_comp;
 	int			tx_thread_active;
 	struct task_struct	*tx_thread;
 	/* list_head for session connection list */
-- 
cgit v1.2.3-71-gd317


From 2392debc2be721a7d5b907cbcbc0ebb858dead01 Mon Sep 17 00:00:00 2001
From: Julian Anastasov <ja@ssi.bg>
Date: Wed, 22 Jul 2015 10:43:23 +0300
Subject: ipv4: consider TOS in fib_select_default

fib_select_default considers alternative routes only when
res->fi is for the first alias in res->fa_head. In the
common case this can happen only when the initial lookup
matches the first alias with highest TOS value. This
prevents the alternative routes to require specific TOS.

This patch solves the problem as follows:

- routes that require specific TOS should be returned by
fib_select_default only when TOS matches, as already done
in fib_table_lookup. This rule implies that depending on the
TOS we can have many different lists of alternative gateways
and we have to keep the last used gateway (fa_default) in first
alias for the TOS instead of using single tb_default value.

- as the aliases are ordered by many keys (TOS desc,
fib_priority asc), we restrict the possible results to
routes with matching TOS and lowest metric (fib_priority)
and routes that match any TOS, again with lowest metric.

For example, packet with TOS 8 can not use gw3 (not lowest
metric), gw4 (different TOS) and gw6 (not lowest metric),
all other gateways can be used:

tos 8 via gw1 metric 2 <--- res->fa_head and res->fi
tos 8 via gw2 metric 2
tos 8 via gw3 metric 3
tos 4 via gw4
tos 0 via gw5
tos 0 via gw6 metric 1

Reported-by: Hagen Paul Pfeifer <hagen@jauu.net>
Signed-off-by: Julian Anastasov <ja@ssi.bg>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 include/net/ip_fib.h     |  3 +--
 net/ipv4/fib_lookup.h    |  1 +
 net/ipv4/fib_semantics.c | 36 +++++++++++++++++++++++++-----------
 net/ipv4/fib_trie.c      |  3 ++-
 net/ipv4/route.c         |  2 +-
 5 files changed, 30 insertions(+), 15 deletions(-)

(limited to 'include')

diff --git a/include/net/ip_fib.h b/include/net/ip_fib.h
index 49c142bdf01e..5fa643b4e891 100644
--- a/include/net/ip_fib.h
+++ b/include/net/ip_fib.h
@@ -183,7 +183,6 @@ __be32 fib_info_update_nh_saddr(struct net *net, struct fib_nh *nh);
 struct fib_table {
 	struct hlist_node	tb_hlist;
 	u32			tb_id;
-	int			tb_default;
 	int			tb_num_default;
 	struct rcu_head		rcu;
 	unsigned long 		*tb_data;
@@ -290,7 +289,7 @@ __be32 fib_compute_spec_dst(struct sk_buff *skb);
 int fib_validate_source(struct sk_buff *skb, __be32 src, __be32 dst,
 			u8 tos, int oif, struct net_device *dev,
 			struct in_device *idev, u32 *itag);
-void fib_select_default(struct fib_result *res);
+void fib_select_default(const struct flowi4 *flp, struct fib_result *res);
 #ifdef CONFIG_IP_ROUTE_CLASSID
 static inline int fib_num_tclassid_users(struct net *net)
 {
diff --git a/net/ipv4/fib_lookup.h b/net/ipv4/fib_lookup.h
index c6211ed60b03..9c02920725db 100644
--- a/net/ipv4/fib_lookup.h
+++ b/net/ipv4/fib_lookup.h
@@ -13,6 +13,7 @@ struct fib_alias {
 	u8			fa_state;
 	u8			fa_slen;
 	u32			tb_id;
+	s16			fa_default;
 	struct rcu_head		rcu;
 };
 
diff --git a/net/ipv4/fib_semantics.c b/net/ipv4/fib_semantics.c
index e1079583b8b7..3a06586b170c 100644
--- a/net/ipv4/fib_semantics.c
+++ b/net/ipv4/fib_semantics.c
@@ -1202,28 +1202,40 @@ int fib_sync_down_dev(struct net_device *dev, unsigned long event)
 }
 
 /* Must be invoked inside of an RCU protected region.  */
-void fib_select_default(struct fib_result *res)
+void fib_select_default(const struct flowi4 *flp, struct fib_result *res)
 {
 	struct fib_info *fi = NULL, *last_resort = NULL;
 	struct hlist_head *fa_head = res->fa_head;
 	struct fib_table *tb = res->table;
 	u8 slen = 32 - res->prefixlen;
 	int order = -1, last_idx = -1;
-	struct fib_alias *fa;
+	struct fib_alias *fa, *fa1 = NULL;
+	u32 last_prio = res->fi->fib_priority;
+	u8 last_tos = 0;
 
 	hlist_for_each_entry_rcu(fa, fa_head, fa_list) {
 		struct fib_info *next_fi = fa->fa_info;
 
 		if (fa->fa_slen != slen)
 			continue;
+		if (fa->fa_tos && fa->fa_tos != flp->flowi4_tos)
+			continue;
 		if (fa->tb_id != tb->tb_id)
 			continue;
+		if (next_fi->fib_priority > last_prio &&
+		    fa->fa_tos == last_tos) {
+			if (last_tos)
+				continue;
+			break;
+		}
+		if (next_fi->fib_flags & RTNH_F_DEAD)
+			continue;
+		last_tos = fa->fa_tos;
+		last_prio = next_fi->fib_priority;
+
 		if (next_fi->fib_scope != res->scope ||
 		    fa->fa_type != RTN_UNICAST)
 			continue;
-
-		if (next_fi->fib_priority > res->fi->fib_priority)
-			break;
 		if (!next_fi->fib_nh[0].nh_gw ||
 		    next_fi->fib_nh[0].nh_scope != RT_SCOPE_LINK)
 			continue;
@@ -1233,10 +1245,11 @@ void fib_select_default(struct fib_result *res)
 		if (!fi) {
 			if (next_fi != res->fi)
 				break;
+			fa1 = fa;
 		} else if (!fib_detect_death(fi, order, &last_resort,
-					     &last_idx, tb->tb_default)) {
+					     &last_idx, fa1->fa_default)) {
 			fib_result_assign(res, fi);
-			tb->tb_default = order;
+			fa1->fa_default = order;
 			goto out;
 		}
 		fi = next_fi;
@@ -1244,20 +1257,21 @@ void fib_select_default(struct fib_result *res)
 	}
 
 	if (order <= 0 || !fi) {
-		tb->tb_default = -1;
+		if (fa1)
+			fa1->fa_default = -1;
 		goto out;
 	}
 
 	if (!fib_detect_death(fi, order, &last_resort, &last_idx,
-				tb->tb_default)) {
+			      fa1->fa_default)) {
 		fib_result_assign(res, fi);
-		tb->tb_default = order;
+		fa1->fa_default = order;
 		goto out;
 	}
 
 	if (last_idx >= 0)
 		fib_result_assign(res, last_resort);
-	tb->tb_default = last_idx;
+	fa1->fa_default = last_idx;
 out:
 	return;
 }
diff --git a/net/ipv4/fib_trie.c b/net/ipv4/fib_trie.c
index 15d32612e3c6..81797e065b21 100644
--- a/net/ipv4/fib_trie.c
+++ b/net/ipv4/fib_trie.c
@@ -1171,6 +1171,7 @@ int fib_table_insert(struct fib_table *tb, struct fib_config *cfg)
 			new_fa->fa_state = state & ~FA_S_ACCESSED;
 			new_fa->fa_slen = fa->fa_slen;
 			new_fa->tb_id = tb->tb_id;
+			new_fa->fa_default = -1;
 
 			err = switchdev_fib_ipv4_add(key, plen, fi,
 						     new_fa->fa_tos,
@@ -1222,6 +1223,7 @@ int fib_table_insert(struct fib_table *tb, struct fib_config *cfg)
 	new_fa->fa_state = 0;
 	new_fa->fa_slen = slen;
 	new_fa->tb_id = tb->tb_id;
+	new_fa->fa_default = -1;
 
 	/* (Optionally) offload fib entry to switch hardware. */
 	err = switchdev_fib_ipv4_add(key, plen, fi, tos, cfg->fc_type,
@@ -1990,7 +1992,6 @@ struct fib_table *fib_trie_table(u32 id, struct fib_table *alias)
 		return NULL;
 
 	tb->tb_id = id;
-	tb->tb_default = -1;
 	tb->tb_num_default = 0;
 	tb->tb_data = (alias ? alias->__data : tb->__data);
 
diff --git a/net/ipv4/route.c b/net/ipv4/route.c
index d0362a2de3d3..e681b852ced1 100644
--- a/net/ipv4/route.c
+++ b/net/ipv4/route.c
@@ -2176,7 +2176,7 @@ struct rtable *__ip_route_output_key(struct net *net, struct flowi4 *fl4)
 	if (!res.prefixlen &&
 	    res.table->tb_num_default > 1 &&
 	    res.type == RTN_UNICAST && !fl4->flowi4_oif)
-		fib_select_default(&res);
+		fib_select_default(fl4, &res);
 
 	if (!fl4->saddr)
 		fl4->saddr = FIB_RES_PREFSRC(net, res);
-- 
cgit v1.2.3-71-gd317


From d1fe19444d82e399e38c1594c71b850eca8e9de0 Mon Sep 17 00:00:00 2001
From: Florian Westphal <fw@strlen.de>
Date: Thu, 23 Jul 2015 12:05:37 +0200
Subject: inet: frag: don't re-use chainlist for evictor

commit 65ba1f1ec0eff ("inet: frags: fix a race between inet_evict_bucket
and inet_frag_kill") describes the bug, but the fix doesn't work reliably.

Problem is that ->flags member can be set on other cpu without chainlock
being held by that task, i.e. the RMW-Cycle can clear INET_FRAG_EVICTED
bit after we put the element on the evictor private list.

We can crash when walking the 'private' evictor list since an element can
be deleted from list underneath the evictor.

Join work with Nikolay Alexandrov.

Fixes: b13d3cbfb8e8 ("inet: frag: move eviction of queues to work queue")
Reported-by: Johan Schuijt <johan@transip.nl>
Tested-by: Frank Schreuder <fschreuder@transip.nl>
Signed-off-by: Nikolay Alexandrov <nikolay@cumulusnetworks.com>
Signed-off-by: Florian Westphal <fw@strlen.de>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 include/net/inet_frag.h  | 2 ++
 net/ipv4/inet_fragment.c | 8 +++-----
 2 files changed, 5 insertions(+), 5 deletions(-)

(limited to 'include')

diff --git a/include/net/inet_frag.h b/include/net/inet_frag.h
index e1300b3dd597..56a3a5685f76 100644
--- a/include/net/inet_frag.h
+++ b/include/net/inet_frag.h
@@ -45,6 +45,7 @@ enum {
  * @flags: fragment queue flags
  * @max_size: maximum received fragment size
  * @net: namespace that this frag belongs to
+ * @list_evictor: list of queues to forcefully evict (e.g. due to low memory)
  */
 struct inet_frag_queue {
 	spinlock_t		lock;
@@ -59,6 +60,7 @@ struct inet_frag_queue {
 	__u8			flags;
 	u16			max_size;
 	struct netns_frags	*net;
+	struct hlist_node	list_evictor;
 };
 
 #define INETFRAGS_HASHSZ	1024
diff --git a/net/ipv4/inet_fragment.c b/net/ipv4/inet_fragment.c
index 5e346a082e5f..172234864fec 100644
--- a/net/ipv4/inet_fragment.c
+++ b/net/ipv4/inet_fragment.c
@@ -151,14 +151,13 @@ evict_again:
 		}
 
 		fq->flags |= INET_FRAG_EVICTED;
-		hlist_del(&fq->list);
-		hlist_add_head(&fq->list, &expired);
+		hlist_add_head(&fq->list_evictor, &expired);
 		++evicted;
 	}
 
 	spin_unlock(&hb->chain_lock);
 
-	hlist_for_each_entry_safe(fq, n, &expired, list)
+	hlist_for_each_entry_safe(fq, n, &expired, list_evictor)
 		f->frag_expire((unsigned long) fq);
 
 	return evicted;
@@ -284,8 +283,7 @@ static inline void fq_unlink(struct inet_frag_queue *fq, struct inet_frags *f)
 	struct inet_frag_bucket *hb;
 
 	hb = get_frag_bucket_locked(fq, f);
-	if (!(fq->flags & INET_FRAG_EVICTED))
-		hlist_del(&fq->list);
+	hlist_del(&fq->list);
 	spin_unlock(&hb->chain_lock);
 }
 
-- 
cgit v1.2.3-71-gd317


From 0e60d245a0be7fdbb723607f1d6621007916b252 Mon Sep 17 00:00:00 2001
From: Florian Westphal <fw@strlen.de>
Date: Thu, 23 Jul 2015 12:05:38 +0200
Subject: inet: frag: change *_frag_mem_limit functions to take netns_frags as
 argument

Followup patch will call it after inet_frag_queue was freed, so q->net
doesn't work anymore (but netf = q->net; free(q); mem_limit(netf) would).

Tested-by: Frank Schreuder <fschreuder@transip.nl>
Signed-off-by: Florian Westphal <fw@strlen.de>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 include/net/inet_frag.h                 |  8 ++++----
 net/ieee802154/6lowpan/reassembly.c     |  6 +++---
 net/ipv4/inet_fragment.c                |  4 ++--
 net/ipv4/ip_fragment.c                  | 10 +++++-----
 net/ipv6/netfilter/nf_conntrack_reasm.c |  6 +++---
 net/ipv6/reassembly.c                   |  6 +++---
 6 files changed, 20 insertions(+), 20 deletions(-)

(limited to 'include')

diff --git a/include/net/inet_frag.h b/include/net/inet_frag.h
index 56a3a5685f76..e71ca17024f2 100644
--- a/include/net/inet_frag.h
+++ b/include/net/inet_frag.h
@@ -141,14 +141,14 @@ static inline int frag_mem_limit(struct netns_frags *nf)
 	return percpu_counter_read(&nf->mem);
 }
 
-static inline void sub_frag_mem_limit(struct inet_frag_queue *q, int i)
+static inline void sub_frag_mem_limit(struct netns_frags *nf, int i)
 {
-	__percpu_counter_add(&q->net->mem, -i, frag_percpu_counter_batch);
+	__percpu_counter_add(&nf->mem, -i, frag_percpu_counter_batch);
 }
 
-static inline void add_frag_mem_limit(struct inet_frag_queue *q, int i)
+static inline void add_frag_mem_limit(struct netns_frags *nf, int i)
 {
-	__percpu_counter_add(&q->net->mem, i, frag_percpu_counter_batch);
+	__percpu_counter_add(&nf->mem, i, frag_percpu_counter_batch);
 }
 
 static inline void init_frag_mem_limit(struct netns_frags *nf)
diff --git a/net/ieee802154/6lowpan/reassembly.c b/net/ieee802154/6lowpan/reassembly.c
index f46e4d1306f2..214d44aef35b 100644
--- a/net/ieee802154/6lowpan/reassembly.c
+++ b/net/ieee802154/6lowpan/reassembly.c
@@ -207,7 +207,7 @@ found:
 	} else {
 		fq->q.meat += skb->len;
 	}
-	add_frag_mem_limit(&fq->q, skb->truesize);
+	add_frag_mem_limit(fq->q.net, skb->truesize);
 
 	if (fq->q.flags == (INET_FRAG_FIRST_IN | INET_FRAG_LAST_IN) &&
 	    fq->q.meat == fq->q.len) {
@@ -287,7 +287,7 @@ static int lowpan_frag_reasm(struct lowpan_frag_queue *fq, struct sk_buff *prev,
 		clone->data_len = clone->len;
 		head->data_len -= clone->len;
 		head->len -= clone->len;
-		add_frag_mem_limit(&fq->q, clone->truesize);
+		add_frag_mem_limit(fq->q.net, clone->truesize);
 	}
 
 	WARN_ON(head == NULL);
@@ -310,7 +310,7 @@ static int lowpan_frag_reasm(struct lowpan_frag_queue *fq, struct sk_buff *prev,
 		}
 		fp = next;
 	}
-	sub_frag_mem_limit(&fq->q, sum_truesize);
+	sub_frag_mem_limit(fq->q.net, sum_truesize);
 
 	head->next = NULL;
 	head->dev = dev;
diff --git a/net/ipv4/inet_fragment.c b/net/ipv4/inet_fragment.c
index 172234864fec..4473232e4e88 100644
--- a/net/ipv4/inet_fragment.c
+++ b/net/ipv4/inet_fragment.c
@@ -328,7 +328,7 @@ void inet_frag_destroy(struct inet_frag_queue *q, struct inet_frags *f)
 		fp = xp;
 	}
 	sum = sum_truesize + f->qsize;
-	sub_frag_mem_limit(q, sum);
+	sub_frag_mem_limit(q->net, sum);
 
 	if (f->destructor)
 		f->destructor(q);
@@ -388,7 +388,7 @@ static struct inet_frag_queue *inet_frag_alloc(struct netns_frags *nf,
 
 	q->net = nf;
 	f->constructor(q, arg);
-	add_frag_mem_limit(q, f->qsize);
+	add_frag_mem_limit(nf, f->qsize);
 
 	setup_timer(&q->timer, f->frag_expire, (unsigned long)q);
 	spin_lock_init(&q->lock);
diff --git a/net/ipv4/ip_fragment.c b/net/ipv4/ip_fragment.c
index 31f71b15cfba..b4a77d021c0d 100644
--- a/net/ipv4/ip_fragment.c
+++ b/net/ipv4/ip_fragment.c
@@ -309,7 +309,7 @@ static int ip_frag_reinit(struct ipq *qp)
 		kfree_skb(fp);
 		fp = xp;
 	} while (fp);
-	sub_frag_mem_limit(&qp->q, sum_truesize);
+	sub_frag_mem_limit(qp->q.net, sum_truesize);
 
 	qp->q.flags = 0;
 	qp->q.len = 0;
@@ -455,7 +455,7 @@ found:
 				qp->q.fragments = next;
 
 			qp->q.meat -= free_it->len;
-			sub_frag_mem_limit(&qp->q, free_it->truesize);
+			sub_frag_mem_limit(qp->q.net, free_it->truesize);
 			kfree_skb(free_it);
 		}
 	}
@@ -479,7 +479,7 @@ found:
 	qp->q.stamp = skb->tstamp;
 	qp->q.meat += skb->len;
 	qp->ecn |= ecn;
-	add_frag_mem_limit(&qp->q, skb->truesize);
+	add_frag_mem_limit(qp->q.net, skb->truesize);
 	if (offset == 0)
 		qp->q.flags |= INET_FRAG_FIRST_IN;
 
@@ -587,7 +587,7 @@ static int ip_frag_reasm(struct ipq *qp, struct sk_buff *prev,
 		head->len -= clone->len;
 		clone->csum = 0;
 		clone->ip_summed = head->ip_summed;
-		add_frag_mem_limit(&qp->q, clone->truesize);
+		add_frag_mem_limit(qp->q.net, clone->truesize);
 	}
 
 	skb_push(head, head->data - skb_network_header(head));
@@ -615,7 +615,7 @@ static int ip_frag_reasm(struct ipq *qp, struct sk_buff *prev,
 		}
 		fp = next;
 	}
-	sub_frag_mem_limit(&qp->q, sum_truesize);
+	sub_frag_mem_limit(qp->q.net, sum_truesize);
 
 	head->next = NULL;
 	head->dev = dev;
diff --git a/net/ipv6/netfilter/nf_conntrack_reasm.c b/net/ipv6/netfilter/nf_conntrack_reasm.c
index 6f187c8d8a1b..6d02498172c1 100644
--- a/net/ipv6/netfilter/nf_conntrack_reasm.c
+++ b/net/ipv6/netfilter/nf_conntrack_reasm.c
@@ -348,7 +348,7 @@ found:
 	fq->ecn |= ecn;
 	if (payload_len > fq->q.max_size)
 		fq->q.max_size = payload_len;
-	add_frag_mem_limit(&fq->q, skb->truesize);
+	add_frag_mem_limit(fq->q.net, skb->truesize);
 
 	/* The first fragment.
 	 * nhoffset is obtained from the first fragment, of course.
@@ -430,7 +430,7 @@ nf_ct_frag6_reasm(struct frag_queue *fq, struct net_device *dev)
 		clone->ip_summed = head->ip_summed;
 
 		NFCT_FRAG6_CB(clone)->orig = NULL;
-		add_frag_mem_limit(&fq->q, clone->truesize);
+		add_frag_mem_limit(fq->q.net, clone->truesize);
 	}
 
 	/* We have to remove fragment header from datagram and to relocate
@@ -454,7 +454,7 @@ nf_ct_frag6_reasm(struct frag_queue *fq, struct net_device *dev)
 			head->csum = csum_add(head->csum, fp->csum);
 		head->truesize += fp->truesize;
 	}
-	sub_frag_mem_limit(&fq->q, head->truesize);
+	sub_frag_mem_limit(fq->q.net, head->truesize);
 
 	head->ignore_df = 1;
 	head->next = NULL;
diff --git a/net/ipv6/reassembly.c b/net/ipv6/reassembly.c
index 8ffa2c8cce77..5c3bbca6a150 100644
--- a/net/ipv6/reassembly.c
+++ b/net/ipv6/reassembly.c
@@ -330,7 +330,7 @@ found:
 	fq->q.stamp = skb->tstamp;
 	fq->q.meat += skb->len;
 	fq->ecn |= ecn;
-	add_frag_mem_limit(&fq->q, skb->truesize);
+	add_frag_mem_limit(fq->q.net, skb->truesize);
 
 	/* The first fragment.
 	 * nhoffset is obtained from the first fragment, of course.
@@ -443,7 +443,7 @@ static int ip6_frag_reasm(struct frag_queue *fq, struct sk_buff *prev,
 		head->len -= clone->len;
 		clone->csum = 0;
 		clone->ip_summed = head->ip_summed;
-		add_frag_mem_limit(&fq->q, clone->truesize);
+		add_frag_mem_limit(fq->q.net, clone->truesize);
 	}
 
 	/* We have to remove fragment header from datagram and to relocate
@@ -481,7 +481,7 @@ static int ip6_frag_reasm(struct frag_queue *fq, struct sk_buff *prev,
 		}
 		fp = next;
 	}
-	sub_frag_mem_limit(&fq->q, sum_truesize);
+	sub_frag_mem_limit(fq->q.net, sum_truesize);
 
 	head->next = NULL;
 	head->dev = dev;
-- 
cgit v1.2.3-71-gd317


From caaecdd3d3f8ec0ea9906c54b1dd8ec8316d26b9 Mon Sep 17 00:00:00 2001
From: Nikolay Aleksandrov <nikolay@cumulusnetworks.com>
Date: Thu, 23 Jul 2015 12:05:40 +0200
Subject: inet: frags: remove INET_FRAG_EVICTED and use list_evictor for the
 test

We can simply remove the INET_FRAG_EVICTED flag to avoid all the flags
race conditions with the evictor and use a participation test for the
evictor list, when we're at that point (after inet_frag_kill) in the
timer there're 2 possible cases:

1. The evictor added the entry to its evictor list while the timer was
waiting for the chainlock
or
2. The timer unchained the entry and the evictor won't see it

In both cases we should be able to see list_evictor correctly due
to the sync on the chainlock.

Joint work with Florian Westphal.

Tested-by: Frank Schreuder <fschreuder@transip.nl>
Signed-off-by: Nikolay Aleksandrov <nikolay@cumulusnetworks.com>
Signed-off-by: Florian Westphal <fw@strlen.de>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 include/net/inet_frag.h  | 7 +++++--
 net/ipv4/inet_fragment.c | 1 -
 net/ipv4/ip_fragment.c   | 2 +-
 net/ipv6/reassembly.c    | 2 +-
 4 files changed, 7 insertions(+), 5 deletions(-)

(limited to 'include')

diff --git a/include/net/inet_frag.h b/include/net/inet_frag.h
index e71ca17024f2..53eead2da743 100644
--- a/include/net/inet_frag.h
+++ b/include/net/inet_frag.h
@@ -21,13 +21,11 @@ struct netns_frags {
  * @INET_FRAG_FIRST_IN: first fragment has arrived
  * @INET_FRAG_LAST_IN: final fragment has arrived
  * @INET_FRAG_COMPLETE: frag queue has been processed and is due for destruction
- * @INET_FRAG_EVICTED: frag queue is being evicted
  */
 enum {
 	INET_FRAG_FIRST_IN	= BIT(0),
 	INET_FRAG_LAST_IN	= BIT(1),
 	INET_FRAG_COMPLETE	= BIT(2),
-	INET_FRAG_EVICTED	= BIT(3)
 };
 
 /**
@@ -127,6 +125,11 @@ static inline void inet_frag_put(struct inet_frag_queue *q, struct inet_frags *f
 		inet_frag_destroy(q, f);
 }
 
+static inline bool inet_frag_evicting(struct inet_frag_queue *q)
+{
+	return !hlist_unhashed(&q->list_evictor);
+}
+
 /* Memory Tracking Functions. */
 
 /* The default percpu_counter batch size is not big enough to scale to
diff --git a/net/ipv4/inet_fragment.c b/net/ipv4/inet_fragment.c
index a00ca4c00c35..d0a7c0319e3d 100644
--- a/net/ipv4/inet_fragment.c
+++ b/net/ipv4/inet_fragment.c
@@ -140,7 +140,6 @@ inet_evict_bucket(struct inet_frags *f, struct inet_frag_bucket *hb)
 		if (!del_timer(&fq->timer))
 			continue;
 
-		fq->flags |= INET_FRAG_EVICTED;
 		hlist_add_head(&fq->list_evictor, &expired);
 		++evicted;
 	}
diff --git a/net/ipv4/ip_fragment.c b/net/ipv4/ip_fragment.c
index b4a77d021c0d..921138f6c97c 100644
--- a/net/ipv4/ip_fragment.c
+++ b/net/ipv4/ip_fragment.c
@@ -202,7 +202,7 @@ static void ip_expire(unsigned long arg)
 	ipq_kill(qp);
 	IP_INC_STATS_BH(net, IPSTATS_MIB_REASMFAILS);
 
-	if (!(qp->q.flags & INET_FRAG_EVICTED)) {
+	if (!inet_frag_evicting(&qp->q)) {
 		struct sk_buff *head = qp->q.fragments;
 		const struct iphdr *iph;
 		int err;
diff --git a/net/ipv6/reassembly.c b/net/ipv6/reassembly.c
index 5c3bbca6a150..f1159bb76e0a 100644
--- a/net/ipv6/reassembly.c
+++ b/net/ipv6/reassembly.c
@@ -144,7 +144,7 @@ void ip6_expire_frag_queue(struct net *net, struct frag_queue *fq,
 
 	IP6_INC_STATS_BH(net, __in6_dev_get(dev), IPSTATS_MIB_REASMFAILS);
 
-	if (fq->q.flags & INET_FRAG_EVICTED)
+	if (inet_frag_evicting(&fq->q))
 		goto out_rcu_unlock;
 
 	IP6_INC_STATS_BH(net, __in6_dev_get(dev), IPSTATS_MIB_REASMTIMEOUT);
-- 
cgit v1.2.3-71-gd317


From dfbafc995304ebb9a9b03f65083e6e9cea143b20 Mon Sep 17 00:00:00 2001
From: Sabrina Dubroca <sd@queasysnail.net>
Date: Fri, 24 Jul 2015 18:19:25 +0200
Subject: tcp: fix recv with flags MSG_WAITALL | MSG_PEEK

Currently, tcp_recvmsg enters a busy loop in sk_wait_data if called
with flags = MSG_WAITALL | MSG_PEEK.

sk_wait_data waits for sk_receive_queue not empty, but in this case,
the receive queue is not empty, but does not contain any skb that we
can use.

Add a "last skb seen on receive queue" argument to sk_wait_data, so
that it sleeps until the receive queue has new skbs.

Link: https://bugzilla.kernel.org/show_bug.cgi?id=99461
Link: https://sourceware.org/bugzilla/show_bug.cgi?id=18493
Link: https://bugzilla.redhat.com/show_bug.cgi?id=1205258
Reported-by: Enrico Scholz <rh-bugzilla@ensc.de>
Reported-by: Dan Searle <dan@censornet.com>
Signed-off-by: Sabrina Dubroca <sd@queasysnail.net>
Acked-by: Eric Dumazet <edumazet@google.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 include/net/sock.h |  2 +-
 net/core/sock.c    |  5 +++--
 net/dccp/proto.c   |  2 +-
 net/ipv4/tcp.c     | 11 +++++++----
 net/llc/af_llc.c   |  4 ++--
 5 files changed, 14 insertions(+), 10 deletions(-)

(limited to 'include')

diff --git a/include/net/sock.h b/include/net/sock.h
index 05a8c1aea251..f21f0708ec59 100644
--- a/include/net/sock.h
+++ b/include/net/sock.h
@@ -902,7 +902,7 @@ void sk_stream_kill_queues(struct sock *sk);
 void sk_set_memalloc(struct sock *sk);
 void sk_clear_memalloc(struct sock *sk);
 
-int sk_wait_data(struct sock *sk, long *timeo);
+int sk_wait_data(struct sock *sk, long *timeo, const struct sk_buff *skb);
 
 struct request_sock_ops;
 struct timewait_sock_ops;
diff --git a/net/core/sock.c b/net/core/sock.c
index 08f16db46070..8a14f1285fc4 100644
--- a/net/core/sock.c
+++ b/net/core/sock.c
@@ -1967,20 +1967,21 @@ static void __release_sock(struct sock *sk)
  * sk_wait_data - wait for data to arrive at sk_receive_queue
  * @sk:    sock to wait on
  * @timeo: for how long
+ * @skb:   last skb seen on sk_receive_queue
  *
  * Now socket state including sk->sk_err is changed only under lock,
  * hence we may omit checks after joining wait queue.
  * We check receive queue before schedule() only as optimization;
  * it is very likely that release_sock() added new data.
  */
-int sk_wait_data(struct sock *sk, long *timeo)
+int sk_wait_data(struct sock *sk, long *timeo, const struct sk_buff *skb)
 {
 	int rc;
 	DEFINE_WAIT(wait);
 
 	prepare_to_wait(sk_sleep(sk), &wait, TASK_INTERRUPTIBLE);
 	set_bit(SOCK_ASYNC_WAITDATA, &sk->sk_socket->flags);
-	rc = sk_wait_event(sk, timeo, !skb_queue_empty(&sk->sk_receive_queue));
+	rc = sk_wait_event(sk, timeo, skb_peek_tail(&sk->sk_receive_queue) != skb);
 	clear_bit(SOCK_ASYNC_WAITDATA, &sk->sk_socket->flags);
 	finish_wait(sk_sleep(sk), &wait);
 	return rc;
diff --git a/net/dccp/proto.c b/net/dccp/proto.c
index 52a94016526d..b5cf13a28009 100644
--- a/net/dccp/proto.c
+++ b/net/dccp/proto.c
@@ -886,7 +886,7 @@ verify_sock_status:
 			break;
 		}
 
-		sk_wait_data(sk, &timeo);
+		sk_wait_data(sk, &timeo, NULL);
 		continue;
 	found_ok_skb:
 		if (len > skb->len)
diff --git a/net/ipv4/tcp.c b/net/ipv4/tcp.c
index 7f4056785acc..45534a5ab430 100644
--- a/net/ipv4/tcp.c
+++ b/net/ipv4/tcp.c
@@ -780,7 +780,7 @@ ssize_t tcp_splice_read(struct socket *sock, loff_t *ppos,
 				ret = -EAGAIN;
 				break;
 			}
-			sk_wait_data(sk, &timeo);
+			sk_wait_data(sk, &timeo, NULL);
 			if (signal_pending(current)) {
 				ret = sock_intr_errno(timeo);
 				break;
@@ -1575,7 +1575,7 @@ int tcp_recvmsg(struct sock *sk, struct msghdr *msg, size_t len, int nonblock,
 	int target;		/* Read at least this many bytes */
 	long timeo;
 	struct task_struct *user_recv = NULL;
-	struct sk_buff *skb;
+	struct sk_buff *skb, *last;
 	u32 urg_hole = 0;
 
 	if (unlikely(flags & MSG_ERRQUEUE))
@@ -1635,7 +1635,9 @@ int tcp_recvmsg(struct sock *sk, struct msghdr *msg, size_t len, int nonblock,
 
 		/* Next get a buffer. */
 
+		last = skb_peek_tail(&sk->sk_receive_queue);
 		skb_queue_walk(&sk->sk_receive_queue, skb) {
+			last = skb;
 			/* Now that we have two receive queues this
 			 * shouldn't happen.
 			 */
@@ -1754,8 +1756,9 @@ int tcp_recvmsg(struct sock *sk, struct msghdr *msg, size_t len, int nonblock,
 			/* Do not sleep, just process backlog. */
 			release_sock(sk);
 			lock_sock(sk);
-		} else
-			sk_wait_data(sk, &timeo);
+		} else {
+			sk_wait_data(sk, &timeo, last);
+		}
 
 		if (user_recv) {
 			int chunk;
diff --git a/net/llc/af_llc.c b/net/llc/af_llc.c
index 8fd9febaa5ba..8dab4e569571 100644
--- a/net/llc/af_llc.c
+++ b/net/llc/af_llc.c
@@ -613,7 +613,7 @@ static int llc_wait_data(struct sock *sk, long timeo)
 		if (signal_pending(current))
 			break;
 		rc = 0;
-		if (sk_wait_data(sk, &timeo))
+		if (sk_wait_data(sk, &timeo, NULL))
 			break;
 	}
 	return rc;
@@ -802,7 +802,7 @@ static int llc_ui_recvmsg(struct socket *sock, struct msghdr *msg, size_t len,
 			release_sock(sk);
 			lock_sock(sk);
 		} else
-			sk_wait_data(sk, &timeo);
+			sk_wait_data(sk, &timeo, NULL);
 
 		if ((flags & MSG_PEEK) && peek_seq != llc->copied_seq) {
 			net_dbg_ratelimited("LLC(%s:%d): Application bug, race in MSG_PEEK\n",
-- 
cgit v1.2.3-71-gd317


From e018a0cce3d849bc73e72686c571420adc40bad2 Mon Sep 17 00:00:00 2001
From: Andy Shevchenko <andriy.shevchenko@linux.intel.com>
Date: Fri, 24 Jul 2015 21:24:04 +0300
Subject: net/macb: convert to kernel doc

This patch coverts struct description to the kernel doc format. There is no
functional change.

Signed-off-by: Andy Shevchenko <andriy.shevchenko@linux.intel.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 include/linux/platform_data/macb.h | 14 +++++++++++---
 1 file changed, 11 insertions(+), 3 deletions(-)

(limited to 'include')

diff --git a/include/linux/platform_data/macb.h b/include/linux/platform_data/macb.h
index 044a124bfbbc..21b15f6fee25 100644
--- a/include/linux/platform_data/macb.h
+++ b/include/linux/platform_data/macb.h
@@ -8,11 +8,19 @@
 #ifndef __MACB_PDATA_H__
 #define __MACB_PDATA_H__
 
+/**
+ * struct macb_platform_data - platform data for MACB Ethernet
+ * @phy_mask:		phy mask passed when register the MDIO bus
+ *			within the driver
+ * @phy_irq_pin:	PHY IRQ
+ * @is_rmii:		using RMII interface?
+ * @rev_eth_addr:	reverse Ethernet address byte order
+ */
 struct macb_platform_data {
 	u32		phy_mask;
-	int		phy_irq_pin;	/* PHY IRQ */
-	u8		is_rmii;	/* using RMII interface? */
-	u8		rev_eth_addr;	/* reverse Ethernet address byte order */
+	int		phy_irq_pin;
+	u8		is_rmii;
+	u8		rev_eth_addr;
 };
 
 #endif /* __MACB_PDATA_H__ */
-- 
cgit v1.2.3-71-gd317


From 35068ce8cbf1749ef1a4b9b1493af83b8488c37b Mon Sep 17 00:00:00 2001
From: Tomeu Vizoso <tomeu.vizoso@collabora.com>
Date: Wed, 1 Jul 2015 09:10:43 +0200
Subject: of: constify drv arg of of_driver_match_device stub

With this change the stub has the same signature as the actual function,
preventing this compiler warning when building without CONFIG_OF:

   drivers/base/property.c: In function 'fwnode_driver_match_device':
>> drivers/base/property.c:608:38: warning: passing argument 2 of 'of_driver_match_device' discards 'const' qualifier from pointer target type
      return of_driver_match_device(dev, drv);
                                         ^
   In file included from drivers/base/property.c:18:0:
   include/linux/of_device.h:61:19: note: expected 'struct device_driver *' but argument is of type 'const struct device_driver *'
    static inline int of_driver_match_device(struct device *dev,
                      ^

Signed-off-by: Tomeu Vizoso <tomeu.vizoso@collabora.com>
Signed-off-by: Rob Herring <robh@kernel.org>
---
 include/linux/of_device.h | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

(limited to 'include')

diff --git a/include/linux/of_device.h b/include/linux/of_device.h
index 4c508549833a..cc7dd687a89d 100644
--- a/include/linux/of_device.h
+++ b/include/linux/of_device.h
@@ -59,7 +59,7 @@ void of_dma_configure(struct device *dev, struct device_node *np);
 #else /* CONFIG_OF */
 
 static inline int of_driver_match_device(struct device *dev,
-					 struct device_driver *drv)
+					 const struct device_driver *drv)
 {
 	return 0;
 }
-- 
cgit v1.2.3-71-gd317


From 559ed40752dc63e68f9b9ad301b20e6a3fe5cf21 Mon Sep 17 00:00:00 2001
From: "Rafael J. Wysocki" <rafael.j.wysocki@intel.com>
Date: Sun, 26 Jul 2015 02:07:47 +0200
Subject: cpufreq: Avoid attempts to create duplicate symbolic links

After commit 87549141d516 (cpufreq: Stop migrating sysfs files on
hotplug) there is a problem with CPUs that share cpufreq policy
objects with other CPUs and are initially offline.

Say CPU1 shares a policy with CPU0 which is online and is registered
first.  As part of the registration process, cpufreq_add_dev() is
called for it.  It creates the policy object and a symbolic link
to it from the CPU1's sysfs directory.  If CPU1 is registered
subsequently and it is offline at that time, cpufreq_add_dev() will
attempt to create a symbolic link to the policy object for it, but
that link is present already, so a warning about that will be
triggered.

To avoid that warning, make cpufreq use an additional CPU mask
containing related CPUs that are actually present for each policy
object.  That mask is initialized when the policy object is populated
after its creation (for the first online CPU using it) and it includes
CPUs from the "policy CPUs" mask returned by the cpufreq driver's
->init() callback that are physically present at that time.  Symbolic
links to the policy are created only for the CPUs in that mask.

If cpufreq_add_dev() is invoked for an offline CPU, it checks the
new mask and only creates the symlink if the CPU was not in it (the
CPU is added to the mask at the same time).

In turn, cpufreq_remove_dev() drops the given CPU from the new mask,
removes its symlink to the policy object and returns, unless it is
the CPU owning the policy object.  In that case, the policy object
is moved to a new CPU's sysfs directory or deleted if the CPU being
removed was the last user of the policy.

While at it, notice that cpufreq_remove_dev() can't fail, because
its return value is ignored, so make it ignore return values from
__cpufreq_remove_dev_prepare() and __cpufreq_remove_dev_finish()
and prevent these functions from aborting on errors returned by
__cpufreq_governor().  Also drop the now unused sif argument from
them.

Fixes: 87549141d516 (cpufreq: Stop migrating sysfs files on hotplug)
Signed-off-by: Rafael J. Wysocki <rafael.j.wysocki@intel.com>
Reported-and-tested-by: Russell King <linux@arm.linux.org.uk>
Acked-by: Viresh Kumar <viresh.kumar@linaro.org>
---
 drivers/cpufreq/cpufreq.c | 108 +++++++++++++++++++++++-----------------------
 include/linux/cpufreq.h   |   1 +
 2 files changed, 56 insertions(+), 53 deletions(-)

(limited to 'include')

diff --git a/drivers/cpufreq/cpufreq.c b/drivers/cpufreq/cpufreq.c
index 26063afb3eba..7a3c30c4336f 100644
--- a/drivers/cpufreq/cpufreq.c
+++ b/drivers/cpufreq/cpufreq.c
@@ -1002,7 +1002,7 @@ static int cpufreq_add_dev_symlink(struct cpufreq_policy *policy)
 	int ret = 0;
 
 	/* Some related CPUs might not be present (physically hotplugged) */
-	for_each_cpu_and(j, policy->related_cpus, cpu_present_mask) {
+	for_each_cpu(j, policy->real_cpus) {
 		if (j == policy->kobj_cpu)
 			continue;
 
@@ -1019,7 +1019,7 @@ static void cpufreq_remove_dev_symlink(struct cpufreq_policy *policy)
 	unsigned int j;
 
 	/* Some related CPUs might not be present (physically hotplugged) */
-	for_each_cpu_and(j, policy->related_cpus, cpu_present_mask) {
+	for_each_cpu(j, policy->real_cpus) {
 		if (j == policy->kobj_cpu)
 			continue;
 
@@ -1163,11 +1163,14 @@ static struct cpufreq_policy *cpufreq_policy_alloc(struct device *dev)
 	if (!zalloc_cpumask_var(&policy->related_cpus, GFP_KERNEL))
 		goto err_free_cpumask;
 
+	if (!zalloc_cpumask_var(&policy->real_cpus, GFP_KERNEL))
+		goto err_free_rcpumask;
+
 	ret = kobject_init_and_add(&policy->kobj, &ktype_cpufreq, &dev->kobj,
 				   "cpufreq");
 	if (ret) {
 		pr_err("%s: failed to init policy->kobj: %d\n", __func__, ret);
-		goto err_free_rcpumask;
+		goto err_free_real_cpus;
 	}
 
 	INIT_LIST_HEAD(&policy->policy_list);
@@ -1184,6 +1187,8 @@ static struct cpufreq_policy *cpufreq_policy_alloc(struct device *dev)
 
 	return policy;
 
+err_free_real_cpus:
+	free_cpumask_var(policy->real_cpus);
 err_free_rcpumask:
 	free_cpumask_var(policy->related_cpus);
 err_free_cpumask:
@@ -1234,6 +1239,7 @@ static void cpufreq_policy_free(struct cpufreq_policy *policy, bool notify)
 	write_unlock_irqrestore(&cpufreq_driver_lock, flags);
 
 	cpufreq_policy_put_kobj(policy, notify);
+	free_cpumask_var(policy->real_cpus);
 	free_cpumask_var(policy->related_cpus);
 	free_cpumask_var(policy->cpus);
 	kfree(policy);
@@ -1258,14 +1264,17 @@ static int cpufreq_add_dev(struct device *dev, struct subsys_interface *sif)
 
 	pr_debug("adding CPU %u\n", cpu);
 
-	/*
-	 * Only possible if 'cpu' wasn't physically present earlier and we are
-	 * here from subsys_interface add callback. A hotplug notifier will
-	 * follow and we will handle it like logical CPU hotplug then. For now,
-	 * just create the sysfs link.
-	 */
-	if (cpu_is_offline(cpu))
-		return add_cpu_dev_symlink(per_cpu(cpufreq_cpu_data, cpu), cpu);
+	if (cpu_is_offline(cpu)) {
+		/*
+		 * Only possible if we are here from the subsys_interface add
+		 * callback.  A hotplug notifier will follow and we will handle
+		 * it as CPU online then.  For now, just create the sysfs link,
+		 * unless there is no policy or the link is already present.
+		 */
+		policy = per_cpu(cpufreq_cpu_data, cpu);
+		return policy && !cpumask_test_and_set_cpu(cpu, policy->real_cpus)
+			? add_cpu_dev_symlink(policy, cpu) : 0;
+	}
 
 	if (!down_read_trylock(&cpufreq_rwsem))
 		return 0;
@@ -1307,6 +1316,10 @@ static int cpufreq_add_dev(struct device *dev, struct subsys_interface *sif)
 	/* related cpus should atleast have policy->cpus */
 	cpumask_or(policy->related_cpus, policy->related_cpus, policy->cpus);
 
+	/* Remember which CPUs have been present at the policy creation time. */
+	if (!recover_policy)
+		cpumask_and(policy->real_cpus, policy->cpus, cpu_present_mask);
+
 	/*
 	 * affected cpus must always be the one, which are online. We aren't
 	 * managing offline cpus here.
@@ -1420,8 +1433,7 @@ nomem_out:
 	return ret;
 }
 
-static int __cpufreq_remove_dev_prepare(struct device *dev,
-					struct subsys_interface *sif)
+static int __cpufreq_remove_dev_prepare(struct device *dev)
 {
 	unsigned int cpu = dev->id;
 	int ret = 0;
@@ -1437,10 +1449,8 @@ static int __cpufreq_remove_dev_prepare(struct device *dev,
 
 	if (has_target()) {
 		ret = __cpufreq_governor(policy, CPUFREQ_GOV_STOP);
-		if (ret) {
+		if (ret)
 			pr_err("%s: Failed to stop governor\n", __func__);
-			return ret;
-		}
 	}
 
 	down_write(&policy->rwsem);
@@ -1473,8 +1483,7 @@ static int __cpufreq_remove_dev_prepare(struct device *dev,
 	return ret;
 }
 
-static int __cpufreq_remove_dev_finish(struct device *dev,
-				       struct subsys_interface *sif)
+static int __cpufreq_remove_dev_finish(struct device *dev)
 {
 	unsigned int cpu = dev->id;
 	int ret;
@@ -1492,10 +1501,8 @@ static int __cpufreq_remove_dev_finish(struct device *dev,
 	/* If cpu is last user of policy, free policy */
 	if (has_target()) {
 		ret = __cpufreq_governor(policy, CPUFREQ_GOV_POLICY_EXIT);
-		if (ret) {
+		if (ret)
 			pr_err("%s: Failed to exit governor\n", __func__);
-			return ret;
-		}
 	}
 
 	/*
@@ -1506,10 +1513,6 @@ static int __cpufreq_remove_dev_finish(struct device *dev,
 	if (cpufreq_driver->exit)
 		cpufreq_driver->exit(policy);
 
-	/* Free the policy only if the driver is getting removed. */
-	if (sif)
-		cpufreq_policy_free(policy, true);
-
 	return 0;
 }
 
@@ -1521,42 +1524,41 @@ static int __cpufreq_remove_dev_finish(struct device *dev,
 static int cpufreq_remove_dev(struct device *dev, struct subsys_interface *sif)
 {
 	unsigned int cpu = dev->id;
-	int ret;
-
-	/*
-	 * Only possible if 'cpu' is getting physically removed now. A hotplug
-	 * notifier should have already been called and we just need to remove
-	 * link or free policy here.
-	 */
-	if (cpu_is_offline(cpu)) {
-		struct cpufreq_policy *policy = per_cpu(cpufreq_cpu_data, cpu);
-		struct cpumask mask;
+	struct cpufreq_policy *policy = per_cpu(cpufreq_cpu_data, cpu);
 
-		if (!policy)
-			return 0;
+	if (!policy)
+		return 0;
 
-		cpumask_copy(&mask, policy->related_cpus);
-		cpumask_clear_cpu(cpu, &mask);
+	if (cpu_online(cpu)) {
+		__cpufreq_remove_dev_prepare(dev);
+		__cpufreq_remove_dev_finish(dev);
+	}
 
-		/*
-		 * Free policy only if all policy->related_cpus are removed
-		 * physically.
-		 */
-		if (cpumask_intersects(&mask, cpu_present_mask)) {
-			remove_cpu_dev_symlink(policy, cpu);
-			return 0;
-		}
+	cpumask_clear_cpu(cpu, policy->real_cpus);
 
+	if (cpumask_empty(policy->real_cpus)) {
 		cpufreq_policy_free(policy, true);
 		return 0;
 	}
 
-	ret = __cpufreq_remove_dev_prepare(dev, sif);
+	if (cpu != policy->kobj_cpu) {
+		remove_cpu_dev_symlink(policy, cpu);
+	} else {
+		/*
+		 * The CPU owning the policy object is going away.  Move it to
+		 * another suitable CPU.
+		 */
+		unsigned int new_cpu = cpumask_first(policy->real_cpus);
+		struct device *new_dev = get_cpu_device(new_cpu);
+
+		dev_dbg(dev, "%s: Moving policy object to CPU%u\n", __func__, new_cpu);
 
-	if (!ret)
-		ret = __cpufreq_remove_dev_finish(dev, sif);
+		sysfs_remove_link(&new_dev->kobj, "cpufreq");
+		policy->kobj_cpu = new_cpu;
+		WARN_ON(kobject_move(&policy->kobj, &new_dev->kobj));
+	}
 
-	return ret;
+	return 0;
 }
 
 static void handle_update(struct work_struct *work)
@@ -2395,11 +2397,11 @@ static int cpufreq_cpu_callback(struct notifier_block *nfb,
 			break;
 
 		case CPU_DOWN_PREPARE:
-			__cpufreq_remove_dev_prepare(dev, NULL);
+			__cpufreq_remove_dev_prepare(dev);
 			break;
 
 		case CPU_POST_DEAD:
-			__cpufreq_remove_dev_finish(dev, NULL);
+			__cpufreq_remove_dev_finish(dev);
 			break;
 
 		case CPU_DOWN_FAILED:
diff --git a/include/linux/cpufreq.h b/include/linux/cpufreq.h
index 29ad97c34fd5..bde1e567b3a9 100644
--- a/include/linux/cpufreq.h
+++ b/include/linux/cpufreq.h
@@ -62,6 +62,7 @@ struct cpufreq_policy {
 	/* CPUs sharing clock, require sw coordination */
 	cpumask_var_t		cpus;	/* Online CPUs only */
 	cpumask_var_t		related_cpus; /* Online + Offline CPUs */
+	cpumask_var_t		real_cpus; /* Related and present */
 
 	unsigned int		shared_type; /* ACPI: ANY or ALL affected CPUs
 						should set cpufreq */
-- 
cgit v1.2.3-71-gd317


From e13af53e7d5a8cea8992d9b61fac69bd0ed8d845 Mon Sep 17 00:00:00 2001
From: Michel Dänzer <michel.daenzer@amd.com>
Date: Wed, 22 Jul 2015 17:29:00 +0900
Subject: drm/radeon: Drop drm/ prefix for including drm.h in radeon_drm.h
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

This allows radeon_drm.h to be reused verbatim in libdrm.

Reviewed-by: Christian König <christian.koenig@amd.com>
Signed-off-by: Michel Dänzer <michel.daenzer@amd.com>
---
 include/uapi/drm/radeon_drm.h | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

(limited to 'include')

diff --git a/include/uapi/drm/radeon_drm.h b/include/uapi/drm/radeon_drm.h
index 1ef76661e1a1..01aa2a8e3f8d 100644
--- a/include/uapi/drm/radeon_drm.h
+++ b/include/uapi/drm/radeon_drm.h
@@ -33,7 +33,7 @@
 #ifndef __RADEON_DRM_H__
 #define __RADEON_DRM_H__
 
-#include <drm/drm.h>
+#include "drm.h"
 
 /* WARNING: If you change any of these defines, make sure to change the
  * defines in the X server file (radeon_sarea.h)
-- 
cgit v1.2.3-71-gd317


From b3fcf36aeef3aeb890d9413c2066048ec7fda7e5 Mon Sep 17 00:00:00 2001
From: Michel Dänzer <michel.daenzer@amd.com>
Date: Wed, 22 Jul 2015 17:29:01 +0900
Subject: drm/amdgpu: Drop drm/ prefix for including drm.h in amdgpu_drm.h
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

This allows amdgpu_drm.h to be reused verbatim in libdrm.

Reviewed-by: Alex Deucher <alexander.deucher@amd.com>
Reviewed-by: Christian König <christian.koenig@amd.com>
Signed-off-by: Michel Dänzer <michel.daenzer@amd.com>
---
 include/uapi/drm/amdgpu_drm.h | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

(limited to 'include')

diff --git a/include/uapi/drm/amdgpu_drm.h b/include/uapi/drm/amdgpu_drm.h
index d708a53b8fb1..fbdd11851725 100644
--- a/include/uapi/drm/amdgpu_drm.h
+++ b/include/uapi/drm/amdgpu_drm.h
@@ -32,7 +32,7 @@
 #ifndef __AMDGPU_DRM_H__
 #define __AMDGPU_DRM_H__
 
-#include <drm/drm.h>
+#include "drm.h"
 
 #define DRM_AMDGPU_GEM_CREATE		0x00
 #define DRM_AMDGPU_GEM_MMAP		0x01
-- 
cgit v1.2.3-71-gd317


From 28e6b67f0b292f557468c139085303b15f1a678f Mon Sep 17 00:00:00 2001
From: Daniel Borkmann <daniel@iogearbox.net>
Date: Wed, 29 Jul 2015 23:35:25 +0200
Subject: net: sched: fix refcount imbalance in actions

Since commit 55334a5db5cd ("net_sched: act: refuse to remove bound action
outside"), we end up with a wrong reference count for a tc action.

Test case 1:

  FOO="1,6 0 0 4294967295,"
  BAR="1,6 0 0 4294967294,"
  tc filter add dev foo parent 1: bpf bytecode "$FOO" flowid 1:1 \
     action bpf bytecode "$FOO"
  tc actions show action bpf
    action order 0: bpf bytecode '1,6 0 0 4294967295' default-action pipe
    index 1 ref 1 bind 1
  tc actions replace action bpf bytecode "$BAR" index 1
  tc actions show action bpf
    action order 0: bpf bytecode '1,6 0 0 4294967294' default-action pipe
    index 1 ref 2 bind 1
  tc actions replace action bpf bytecode "$FOO" index 1
  tc actions show action bpf
    action order 0: bpf bytecode '1,6 0 0 4294967295' default-action pipe
    index 1 ref 3 bind 1

Test case 2:

  FOO="1,6 0 0 4294967295,"
  tc filter add dev foo parent 1: bpf bytecode "$FOO" flowid 1:1 action ok
  tc actions show action gact
    action order 0: gact action pass
    random type none pass val 0
     index 1 ref 1 bind 1
  tc actions add action drop index 1
    RTNETLINK answers: File exists [...]
  tc actions show action gact
    action order 0: gact action pass
     random type none pass val 0
     index 1 ref 2 bind 1
  tc actions add action drop index 1
    RTNETLINK answers: File exists [...]
  tc actions show action gact
    action order 0: gact action pass
     random type none pass val 0
     index 1 ref 3 bind 1

What happens is that in tcf_hash_check(), we check tcf_common for a given
index and increase tcfc_refcnt and conditionally tcfc_bindcnt when we've
found an existing action. Now there are the following cases:

  1) We do a late binding of an action. In that case, we leave the
     tcfc_refcnt/tcfc_bindcnt increased and are done with the ->init()
     handler. This is correctly handeled.

  2) We replace the given action, or we try to add one without replacing
     and find out that the action at a specific index already exists
     (thus, we go out with error in that case).

In case of 2), we have to undo the reference count increase from
tcf_hash_check() in the tcf_hash_check() function. Currently, we fail to
do so because of the 'tcfc_bindcnt > 0' check which bails out early with
an -EPERM error.

Now, while commit 55334a5db5cd prevents 'tc actions del action ...' on an
already classifier-bound action to drop the reference count (which could
then become negative, wrap around etc), this restriction only accounts for
invocations outside a specific action's ->init() handler.

One possible solution would be to add a flag thus we possibly trigger
the -EPERM ony in situations where it is indeed relevant.

After the patch, above test cases have correct reference count again.

Fixes: 55334a5db5cd ("net_sched: act: refuse to remove bound action outside")
Signed-off-by: Daniel Borkmann <daniel@iogearbox.net>
Reviewed-by: Cong Wang <cwang@twopensource.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 include/net/act_api.h |  8 +++++++-
 net/sched/act_api.c   | 11 ++++++-----
 2 files changed, 13 insertions(+), 6 deletions(-)

(limited to 'include')

diff --git a/include/net/act_api.h b/include/net/act_api.h
index 3ee4c92afd1b..931738bc5bba 100644
--- a/include/net/act_api.h
+++ b/include/net/act_api.h
@@ -99,7 +99,6 @@ struct tc_action_ops {
 
 int tcf_hash_search(struct tc_action *a, u32 index);
 void tcf_hash_destroy(struct tc_action *a);
-int tcf_hash_release(struct tc_action *a, int bind);
 u32 tcf_hash_new_index(struct tcf_hashinfo *hinfo);
 int tcf_hash_check(u32 index, struct tc_action *a, int bind);
 int tcf_hash_create(u32 index, struct nlattr *est, struct tc_action *a,
@@ -107,6 +106,13 @@ int tcf_hash_create(u32 index, struct nlattr *est, struct tc_action *a,
 void tcf_hash_cleanup(struct tc_action *a, struct nlattr *est);
 void tcf_hash_insert(struct tc_action *a);
 
+int __tcf_hash_release(struct tc_action *a, bool bind, bool strict);
+
+static inline int tcf_hash_release(struct tc_action *a, bool bind)
+{
+	return __tcf_hash_release(a, bind, false);
+}
+
 int tcf_register_action(struct tc_action_ops *a, unsigned int mask);
 int tcf_unregister_action(struct tc_action_ops *a);
 int tcf_action_destroy(struct list_head *actions, int bind);
diff --git a/net/sched/act_api.c b/net/sched/act_api.c
index af427a3dbcba..43ec92680ae8 100644
--- a/net/sched/act_api.c
+++ b/net/sched/act_api.c
@@ -45,7 +45,7 @@ void tcf_hash_destroy(struct tc_action *a)
 }
 EXPORT_SYMBOL(tcf_hash_destroy);
 
-int tcf_hash_release(struct tc_action *a, int bind)
+int __tcf_hash_release(struct tc_action *a, bool bind, bool strict)
 {
 	struct tcf_common *p = a->priv;
 	int ret = 0;
@@ -53,7 +53,7 @@ int tcf_hash_release(struct tc_action *a, int bind)
 	if (p) {
 		if (bind)
 			p->tcfc_bindcnt--;
-		else if (p->tcfc_bindcnt > 0)
+		else if (strict && p->tcfc_bindcnt > 0)
 			return -EPERM;
 
 		p->tcfc_refcnt--;
@@ -64,9 +64,10 @@ int tcf_hash_release(struct tc_action *a, int bind)
 			ret = 1;
 		}
 	}
+
 	return ret;
 }
-EXPORT_SYMBOL(tcf_hash_release);
+EXPORT_SYMBOL(__tcf_hash_release);
 
 static int tcf_dump_walker(struct sk_buff *skb, struct netlink_callback *cb,
 			   struct tc_action *a)
@@ -136,7 +137,7 @@ static int tcf_del_walker(struct sk_buff *skb, struct tc_action *a)
 		head = &hinfo->htab[tcf_hash(i, hinfo->hmask)];
 		hlist_for_each_entry_safe(p, n, head, tcfc_head) {
 			a->priv = p;
-			ret = tcf_hash_release(a, 0);
+			ret = __tcf_hash_release(a, false, true);
 			if (ret == ACT_P_DELETED) {
 				module_put(a->ops->owner);
 				n_i++;
@@ -408,7 +409,7 @@ int tcf_action_destroy(struct list_head *actions, int bind)
 	int ret = 0;
 
 	list_for_each_entry_safe(a, tmp, actions, list) {
-		ret = tcf_hash_release(a, bind);
+		ret = __tcf_hash_release(a, bind, true);
 		if (ret == ACT_P_DELETED)
 			module_put(a->ops->owner);
 		else if (ret < 0)
-- 
cgit v1.2.3-71-gd317


From 3b8a684bd6cbc13dfd21ca41814c304e9f27ec7f Mon Sep 17 00:00:00 2001
From: Daniel Vetter <daniel.vetter@ffwll.ch>
Date: Mon, 3 Aug 2015 17:24:08 +0200
Subject: drm/atomic-helper: Add an atomice best_encoder callback

With legacy helpers all the routing was already set up when calling
best_encoder and so could be inspected. But with atomic it's staged,
hence we need a new atomic compliant callback for drivers which need
to inspect the requested state and can't just decided the best encoder
statically.

This is needed to fix up i915 dp mst where we need to pick the right
encoder depending upon the requested CRTC for the connector.

v2: Don't forget to amend the kerneldoc

Cc: Chris Wilson <chris@chris-wilson.co.uk>
Cc: Linus Torvalds <torvalds@linux-foundation.org>
Cc: Theodore Ts'o <tytso@mit.edu>
Acked-by: Thierry Reding <treding@nvidia.com>
Reviewed-by: Ander Conselvan de Oliveira <conselvan2@gmail.com>
Signed-off-by: Daniel Vetter <daniel.vetter@intel.com>
---
 drivers/gpu/drm/drm_atomic_helper.c | 7 ++++++-
 include/drm/drm_crtc_helper.h       | 3 +++
 2 files changed, 9 insertions(+), 1 deletion(-)

(limited to 'include')

diff --git a/drivers/gpu/drm/drm_atomic_helper.c b/drivers/gpu/drm/drm_atomic_helper.c
index aac212297b49..8694ca9beee3 100644
--- a/drivers/gpu/drm/drm_atomic_helper.c
+++ b/drivers/gpu/drm/drm_atomic_helper.c
@@ -196,7 +196,12 @@ update_connector_routing(struct drm_atomic_state *state, int conn_idx)
 	}
 
 	funcs = connector->helper_private;
-	new_encoder = funcs->best_encoder(connector);
+
+	if (funcs->atomic_best_encoder)
+		new_encoder = funcs->atomic_best_encoder(connector,
+							 connector_state);
+	else
+		new_encoder = funcs->best_encoder(connector);
 
 	if (!new_encoder) {
 		DRM_DEBUG_ATOMIC("No suitable encoder found for [CONNECTOR:%d:%s]\n",
diff --git a/include/drm/drm_crtc_helper.h b/include/drm/drm_crtc_helper.h
index c8fc187061de..918aa68b5199 100644
--- a/include/drm/drm_crtc_helper.h
+++ b/include/drm/drm_crtc_helper.h
@@ -168,6 +168,7 @@ struct drm_encoder_helper_funcs {
  * @get_modes: get mode list for this connector
  * @mode_valid: is this mode valid on the given connector? (optional)
  * @best_encoder: return the preferred encoder for this connector
+ * @atomic_best_encoder: atomic version of @best_encoder
  *
  * The helper operations are called by the mid-layer CRTC helper.
  */
@@ -176,6 +177,8 @@ struct drm_connector_helper_funcs {
 	enum drm_mode_status (*mode_valid)(struct drm_connector *connector,
 					   struct drm_display_mode *mode);
 	struct drm_encoder *(*best_encoder)(struct drm_connector *connector);
+	struct drm_encoder *(*atomic_best_encoder)(struct drm_connector *connector,
+						   struct drm_connector_state *connector_state);
 };
 
 extern void drm_helper_disable_unused_functions(struct drm_device *dev);
-- 
cgit v1.2.3-71-gd317


From 6dc6db790a67d28e46abefc44ca1a3bd438b2920 Mon Sep 17 00:00:00 2001
From: "Subhransu S. Prusty" <subhransu.s.prusty@intel.com>
Date: Mon, 29 Jun 2015 17:36:44 +0100
Subject: ASoC: topology: Add subsequence in topology

Some widgets may need sorting within, So add this support in topology.

Signed-off-by: Subhransu S. Prusty <subhransu.s.prusty@intel.com>
Signed-off-by: Mark Brown <broonie@kernel.org>
---
 include/uapi/sound/asoc.h | 1 +
 sound/soc/soc-topology.c  | 1 +
 2 files changed, 2 insertions(+)

(limited to 'include')

diff --git a/include/uapi/sound/asoc.h b/include/uapi/sound/asoc.h
index 12215205ab8d..7ae13fbc0a3e 100644
--- a/include/uapi/sound/asoc.h
+++ b/include/uapi/sound/asoc.h
@@ -347,6 +347,7 @@ struct snd_soc_tplg_dapm_widget {
 	__le32 reg;		/* negative reg = no direct dapm */
 	__le32 shift;		/* bits to shift */
 	__le32 mask;		/* non-shifted mask */
+	__le32 subseq;		/* sort within widget type */
 	__u32 invert;		/* invert the power bit */
 	__u32 ignore_suspend;	/* kept enabled over suspend */
 	__u16 event_flags;
diff --git a/sound/soc/soc-topology.c b/sound/soc/soc-topology.c
index 6a547c6dd3a1..9f2b048f1071 100644
--- a/sound/soc/soc-topology.c
+++ b/sound/soc/soc-topology.c
@@ -1351,6 +1351,7 @@ static int soc_tplg_dapm_widget_create(struct soc_tplg *tplg,
 	template.reg = w->reg;
 	template.shift = w->shift;
 	template.mask = w->mask;
+	template.subseq = w->subseq;
 	template.on_val = w->invert ? 0 : 1;
 	template.off_val = w->invert ? 1 : 0;
 	template.ignore_suspend = w->ignore_suspend;
-- 
cgit v1.2.3-71-gd317


From c3879956957b8de9fd6cbad604e668fd00c6506c Mon Sep 17 00:00:00 2001
From: Vinod Koul <vinod.koul@intel.com>
Date: Mon, 29 Jun 2015 17:36:46 +0100
Subject: ASoC: topology: add private data to manifest

The topology file manifest should include a private data field. This
allows vendors to specify vendor data in the manifest, like
timestamps, hashes, additional information for removing platform
configuration out of drivers and making these configurable per platform

Signed-off-by: Vinod Koul <vinod.koul@intel.com>
Signed-off-by: Mark Brown <broonie@kernel.org>
---
 include/uapi/sound/asoc.h | 1 +
 1 file changed, 1 insertion(+)

(limited to 'include')

diff --git a/include/uapi/sound/asoc.h b/include/uapi/sound/asoc.h
index 7ae13fbc0a3e..d550c8d40269 100644
--- a/include/uapi/sound/asoc.h
+++ b/include/uapi/sound/asoc.h
@@ -238,6 +238,7 @@ struct snd_soc_tplg_manifest {
 	__le32 graph_elems;	/* number of graph elements */
 	__le32 dai_elems;	/* number of DAI elements */
 	__le32 dai_link_elems;	/* number of DAI link elements */
+	struct snd_soc_tplg_private priv;
 } __attribute__((packed));
 
 /*
-- 
cgit v1.2.3-71-gd317


From 28a87eebcad40101b1b273cbd4f2a02c104f9367 Mon Sep 17 00:00:00 2001
From: Mengdong Lin <mengdong.lin@intel.com>
Date: Wed, 5 Aug 2015 14:41:13 +0100
Subject: ASoC: topology: Update TLV support so we can support more TLV types

Currently the TLV topology structure is targeted at only supporting the
DB scale data. This patch extends support for the other TLV types so they
can be easily added at a later stage.

TLV structure is moved to common topology control header since it's a
common field for controls and can be processed in a general way.

Users must set a proper access flag for a control since it's used to
decide if the TLV field is valid and if a TLV callback is needed.

Removed the following fields from topology TLV struct:
- size/count: type can decide the size.
- numid: not needed to initialize TLV for kcontrol.
- data: replaced by the type specific struct.

Added TLV structure to generic control header and removed TLV structure
from mixer control.

Signed-off-by: Mengdong Lin <mengdong.lin@intel.com>
Signed-off-by: Liam Girdwood <liam.r.girdwood@linux.intel.com>
Signed-off-by: Mark Brown <broonie@kernel.org>
---
 include/uapi/sound/asoc.h | 19 +++++++++++-----
 sound/soc/soc-topology.c  | 58 +++++++++++++++++++++++++++++++++--------------
 2 files changed, 54 insertions(+), 23 deletions(-)

(limited to 'include')

diff --git a/include/uapi/sound/asoc.h b/include/uapi/sound/asoc.h
index 2819fc1f8458..aa3a79b42438 100644
--- a/include/uapi/sound/asoc.h
+++ b/include/uapi/sound/asoc.h
@@ -137,11 +137,19 @@ struct snd_soc_tplg_private {
 /*
  * Kcontrol TLV data.
  */
+struct snd_soc_tplg_tlv_dbscale {
+	__le32 min;
+	__le32 step;
+	__le32 mute;
+} __attribute__((packed));
+
 struct snd_soc_tplg_ctl_tlv {
-	__le32 size;	/* in bytes aligned to 4 */
-	__le32 numid;	/* control element numeric identification */
-	__le32 count;	/* number of elem in data array */
-	__le32 data[SND_SOC_TPLG_TLV_SIZE];
+	__le32 size;	/* in bytes of this structure */
+	__le32 type;	/* SNDRV_CTL_TLVT_*, type of TLV */
+	union {
+		__le32 data[SND_SOC_TPLG_TLV_SIZE];
+		struct snd_soc_tplg_tlv_dbscale scale;
+	};
 } __attribute__((packed));
 
 /*
@@ -172,7 +180,7 @@ struct snd_soc_tplg_ctl_hdr {
 	char name[SNDRV_CTL_ELEM_ID_NAME_MAXLEN];
 	__le32 access;
 	struct snd_soc_tplg_kcontrol_ops_id ops;
-	__le32 tlv_size;	/* non zero means control has TLV data */
+	struct snd_soc_tplg_ctl_tlv tlv;
 } __attribute__((packed));
 
 /*
@@ -260,7 +268,6 @@ struct snd_soc_tplg_mixer_control {
 	__le32 invert;
 	__le32 num_channels;
 	struct snd_soc_tplg_channel channel[SND_SOC_TPLG_MAX_CHAN];
-	struct snd_soc_tplg_ctl_tlv tlv;
 	struct snd_soc_tplg_private priv;
 } __attribute__((packed));
 
diff --git a/sound/soc/soc-topology.c b/sound/soc/soc-topology.c
index 2c70f30d2d78..31068b8f3db0 100644
--- a/sound/soc/soc-topology.c
+++ b/sound/soc/soc-topology.c
@@ -33,6 +33,7 @@
 #include <sound/soc.h>
 #include <sound/soc-dapm.h>
 #include <sound/soc-topology.h>
+#include <sound/tlv.h>
 
 /*
  * We make several passes over the data (since it wont necessarily be ordered)
@@ -579,28 +580,51 @@ static int soc_tplg_init_kcontrol(struct soc_tplg *tplg,
 	return 0;
 }
 
+
+static int soc_tplg_create_tlv_db_scale(struct soc_tplg *tplg,
+	struct snd_kcontrol_new *kc, struct snd_soc_tplg_tlv_dbscale *scale)
+{
+	unsigned int item_len = 2 * sizeof(unsigned int);
+	unsigned int *p;
+
+	p = kzalloc(item_len + 2 * sizeof(unsigned int), GFP_KERNEL);
+	if (!p)
+		return -ENOMEM;
+
+	p[0] = SNDRV_CTL_TLVT_DB_SCALE;
+	p[1] = item_len;
+	p[2] = scale->min;
+	p[3] = (scale->step & TLV_DB_SCALE_MASK)
+			| (scale->mute ? TLV_DB_SCALE_MUTE : 0);
+
+	kc->tlv.p = (void *)p;
+	return 0;
+}
+
 static int soc_tplg_create_tlv(struct soc_tplg *tplg,
-	struct snd_kcontrol_new *kc, struct snd_soc_tplg_ctl_tlv *tplg_tlv)
+	struct snd_kcontrol_new *kc, struct snd_soc_tplg_ctl_hdr *tc)
 {
-	struct snd_ctl_tlv *tlv;
-	int size;
+	struct snd_soc_tplg_ctl_tlv *tplg_tlv;
 
-	if (tplg_tlv->count == 0)
+	if (!(tc->access & SNDRV_CTL_ELEM_ACCESS_TLV_READWRITE))
 		return 0;
 
-	size = ((tplg_tlv->count + (sizeof(unsigned int) - 1)) &
-		~(sizeof(unsigned int) - 1));
-	tlv = kzalloc(sizeof(*tlv) + size, GFP_KERNEL);
-	if (tlv == NULL)
-		return -ENOMEM;
-
-	dev_dbg(tplg->dev, " created TLV type %d size %d bytes\n",
-		tplg_tlv->numid, size);
+	if (tc->access & SNDRV_CTL_ELEM_ACCESS_TLV_CALLBACK) {
+		kc->tlv.c = snd_soc_bytes_tlv_callback;
+	} else {
+		tplg_tlv = &tc->tlv;
+		switch (tplg_tlv->type) {
+		case SNDRV_CTL_TLVT_DB_SCALE:
+			return soc_tplg_create_tlv_db_scale(tplg, kc,
+					&tplg_tlv->scale);
 
-	tlv->numid = tplg_tlv->numid;
-	tlv->length = size;
-	memcpy(&tlv->tlv[0], tplg_tlv->data, size);
-	kc->tlv.p = (void *)tlv;
+		/* TODO: add support for other TLV types */
+		default:
+			dev_dbg(tplg->dev, "Unsupported TLV type %d\n",
+					tplg_tlv->type);
+			return -EINVAL;
+		}
+	}
 
 	return 0;
 }
@@ -772,7 +796,7 @@ static int soc_tplg_dmixer_create(struct soc_tplg *tplg, unsigned int count,
 		}
 
 		/* create any TLV data */
-		soc_tplg_create_tlv(tplg, &kc, &mc->tlv);
+		soc_tplg_create_tlv(tplg, &kc, &mc->hdr);
 
 		/* register control here */
 		err = soc_tplg_add_kcontrol(tplg, &kc,
-- 
cgit v1.2.3-71-gd317


From cb88498b36ab01cbe3a0d95cd097e4afdff4c6fd Mon Sep 17 00:00:00 2001
From: Mengdong Lin <mengdong.lin@intel.com>
Date: Wed, 5 Aug 2015 14:41:14 +0100
Subject: ASoC: topology: Add ops support to byte controls UAPI

Add UAPI support for setting byte control ops. Rename the ops structure
to be more generic so it can be sued by other objects too.

Signed-off-by: Mengdong Lin <mengdong.lin@intel.com>
Signed-off-by: Liam Girdwood <liam.r.girdwood@linux.intel.com>
Signed-off-by: Mark Brown <broonie@kernel.org>
---
 include/uapi/sound/asoc.h | 9 ++++++---
 1 file changed, 6 insertions(+), 3 deletions(-)

(limited to 'include')

diff --git a/include/uapi/sound/asoc.h b/include/uapi/sound/asoc.h
index aa3a79b42438..d5281ac8e5eb 100644
--- a/include/uapi/sound/asoc.h
+++ b/include/uapi/sound/asoc.h
@@ -163,9 +163,11 @@ struct snd_soc_tplg_channel {
 } __attribute__((packed));
 
 /*
- * Kcontrol Operations IDs
+ * Genericl Operations IDs, for binding Kcontrol or Bytes ext ops
+ * Kcontrol ops need get/put/info.
+ * Bytes ext ops need get/put.
  */
-struct snd_soc_tplg_kcontrol_ops_id {
+struct snd_soc_tplg_io_ops {
 	__le32 get;
 	__le32 put;
 	__le32 info;
@@ -179,7 +181,7 @@ struct snd_soc_tplg_ctl_hdr {
 	__le32 type;
 	char name[SNDRV_CTL_ELEM_ID_NAME_MAXLEN];
 	__le32 access;
-	struct snd_soc_tplg_kcontrol_ops_id ops;
+	struct snd_soc_tplg_io_ops ops;
 	struct snd_soc_tplg_ctl_tlv tlv;
 } __attribute__((packed));
 
@@ -311,6 +313,7 @@ struct snd_soc_tplg_bytes_control {
 	__le32 mask;
 	__le32 base;
 	__le32 num_regs;
+	struct snd_soc_tplg_io_ops ext_ops;
 	struct snd_soc_tplg_private priv;
 } __attribute__((packed));
 
-- 
cgit v1.2.3-71-gd317


From c7bcf8777a539e64dafc7417c00047aee6eb8909 Mon Sep 17 00:00:00 2001
From: Liam Girdwood <liam.r.girdwood@linux.intel.com>
Date: Wed, 5 Aug 2015 14:41:15 +0100
Subject: ASoC: topology: Add private data type and bump ABI version to 3

Add ID for standalone private data object types and bump ABI version to
3 in order to userpsace features.

Signed-off-by: Liam Girdwood <liam.r.girdwood@linux.intel.com>
Signed-off-by: Mark Brown <broonie@kernel.org>
---
 include/uapi/sound/asoc.h | 5 +++--
 1 file changed, 3 insertions(+), 2 deletions(-)

(limited to 'include')

diff --git a/include/uapi/sound/asoc.h b/include/uapi/sound/asoc.h
index d5281ac8e5eb..51b8066a223b 100644
--- a/include/uapi/sound/asoc.h
+++ b/include/uapi/sound/asoc.h
@@ -77,7 +77,7 @@
 #define SND_SOC_TPLG_NUM_TEXTS		16
 
 /* ABI version */
-#define SND_SOC_TPLG_ABI_VERSION	0x2
+#define SND_SOC_TPLG_ABI_VERSION	0x3
 
 /* Max size of TLV data */
 #define SND_SOC_TPLG_TLV_SIZE		32
@@ -97,7 +97,8 @@
 #define SND_SOC_TPLG_TYPE_PCM		7
 #define SND_SOC_TPLG_TYPE_MANIFEST	8
 #define SND_SOC_TPLG_TYPE_CODEC_LINK	9
-#define SND_SOC_TPLG_TYPE_MAX	SND_SOC_TPLG_TYPE_CODEC_LINK
+#define SND_SOC_TPLG_TYPE_PDATA		10
+#define SND_SOC_TPLG_TYPE_MAX	SND_SOC_TPLG_TYPE_PDATA
 
 /* vendor block IDs - please add new vendor types to end */
 #define SND_SOC_TPLG_TYPE_VENDOR_FW	1000
-- 
cgit v1.2.3-71-gd317


From 4248b0da460839e30eaaad78992b9a1dd3e63e21 Mon Sep 17 00:00:00 2001
From: Mel Gorman <mgorman@suse.de>
Date: Thu, 6 Aug 2015 15:46:20 -0700
Subject: fs, file table: reinit files_stat.max_files after deferred memory
 initialisation

Dave Hansen reported the following;

	My laptop has been behaving strangely with 4.2-rc2.  Once I log
	in to my X session, I start getting all kinds of strange errors
	from applications and see this in my dmesg:

        	VFS: file-max limit 8192 reached

The problem is that the file-max is calculated before memory is fully
initialised and miscalculates how much memory the kernel is using.  This
patch recalculates file-max after deferred memory initialisation.  Note
that using memory hotplug infrastructure would not have avoided this
problem as the value is not recalculated after memory hot-add.

4.1:             files_stat.max_files = 6582781
4.2-rc2:         files_stat.max_files = 8192
4.2-rc2 patched: files_stat.max_files = 6562467

Small differences with the patch applied and 4.1 but not enough to matter.

Signed-off-by: Mel Gorman <mgorman@suse.de>
Reported-by: Dave Hansen <dave.hansen@intel.com>
Cc: Nicolai Stange <nicstange@gmail.com>
Cc: Dave Hansen <dave.hansen@intel.com>
Cc: Alex Ng <alexng@microsoft.com>
Cc: Fengguang Wu <fengguang.wu@intel.com>
Cc: Peter Zijlstra (Intel) <peterz@infradead.org>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 fs/dcache.c        | 13 +++----------
 fs/file_table.c    | 24 +++++++++++++++---------
 include/linux/fs.h |  5 +++--
 init/main.c        |  2 +-
 mm/page_alloc.c    |  3 +++
 5 files changed, 25 insertions(+), 22 deletions(-)

(limited to 'include')

diff --git a/fs/dcache.c b/fs/dcache.c
index 5c8ea15e73a5..9b5fe503f6cb 100644
--- a/fs/dcache.c
+++ b/fs/dcache.c
@@ -3442,22 +3442,15 @@ void __init vfs_caches_init_early(void)
 	inode_init_early();
 }
 
-void __init vfs_caches_init(unsigned long mempages)
+void __init vfs_caches_init(void)
 {
-	unsigned long reserve;
-
-	/* Base hash sizes on available memory, with a reserve equal to
-           150% of current kernel size */
-
-	reserve = min((mempages - nr_free_pages()) * 3/2, mempages - 1);
-	mempages -= reserve;
-
 	names_cachep = kmem_cache_create("names_cache", PATH_MAX, 0,
 			SLAB_HWCACHE_ALIGN|SLAB_PANIC, NULL);
 
 	dcache_init();
 	inode_init();
-	files_init(mempages);
+	files_init();
+	files_maxfiles_init();
 	mnt_init();
 	bdev_cache_init();
 	chrdev_init();
diff --git a/fs/file_table.c b/fs/file_table.c
index 7f9d407c7595..ad17e05ebf95 100644
--- a/fs/file_table.c
+++ b/fs/file_table.c
@@ -25,6 +25,7 @@
 #include <linux/hardirq.h>
 #include <linux/task_work.h>
 #include <linux/ima.h>
+#include <linux/swap.h>
 
 #include <linux/atomic.h>
 
@@ -308,19 +309,24 @@ void put_filp(struct file *file)
 	}
 }
 
-void __init files_init(unsigned long mempages)
+void __init files_init(void)
 { 
-	unsigned long n;
-
 	filp_cachep = kmem_cache_create("filp", sizeof(struct file), 0,
 			SLAB_HWCACHE_ALIGN | SLAB_PANIC, NULL);
+	percpu_counter_init(&nr_files, 0, GFP_KERNEL);
+}
 
-	/*
-	 * One file with associated inode and dcache is very roughly 1K.
-	 * Per default don't use more than 10% of our memory for files. 
-	 */ 
+/*
+ * One file with associated inode and dcache is very roughly 1K. Per default
+ * do not use more than 10% of our memory for files.
+ */
+void __init files_maxfiles_init(void)
+{
+	unsigned long n;
+	unsigned long memreserve = (totalram_pages - nr_free_pages()) * 3/2;
+
+	memreserve = min(memreserve, totalram_pages - 1);
+	n = ((totalram_pages - memreserve) * (PAGE_SIZE / 1024)) / 10;
 
-	n = (mempages * (PAGE_SIZE / 1024)) / 10;
 	files_stat.max_files = max_t(unsigned long, n, NR_FILE);
-	percpu_counter_init(&nr_files, 0, GFP_KERNEL);
 } 
diff --git a/include/linux/fs.h b/include/linux/fs.h
index cc008c338f5a..84b783f277f7 100644
--- a/include/linux/fs.h
+++ b/include/linux/fs.h
@@ -55,7 +55,8 @@ struct vm_fault;
 
 extern void __init inode_init(void);
 extern void __init inode_init_early(void);
-extern void __init files_init(unsigned long);
+extern void __init files_init(void);
+extern void __init files_maxfiles_init(void);
 
 extern struct files_stat_struct files_stat;
 extern unsigned long get_max_files(void);
@@ -2245,7 +2246,7 @@ extern int ioctl_preallocate(struct file *filp, void __user *argp);
 
 /* fs/dcache.c */
 extern void __init vfs_caches_init_early(void);
-extern void __init vfs_caches_init(unsigned long);
+extern void __init vfs_caches_init(void);
 
 extern struct kmem_cache *names_cachep;
 
diff --git a/init/main.c b/init/main.c
index c5d5626289ce..56506553d4d8 100644
--- a/init/main.c
+++ b/init/main.c
@@ -656,7 +656,7 @@ asmlinkage __visible void __init start_kernel(void)
 	key_init();
 	security_init();
 	dbg_late_init();
-	vfs_caches_init(totalram_pages);
+	vfs_caches_init();
 	signals_init();
 	/* rootfs populating might need page-writeback */
 	page_writeback_init();
diff --git a/mm/page_alloc.c b/mm/page_alloc.c
index 322628278ae4..cb61f44eb3fc 100644
--- a/mm/page_alloc.c
+++ b/mm/page_alloc.c
@@ -1201,6 +1201,9 @@ void __init page_alloc_init_late(void)
 
 	/* Block until all are initialised */
 	wait_for_completion(&pgdat_init_all_done_comp);
+
+	/* Reinit limits that are based on free pages after the kernel is up */
+	files_maxfiles_init();
 }
 #endif /* CONFIG_DEFERRED_STRUCT_PAGE_INIT */
 
-- 
cgit v1.2.3-71-gd317


From f4c18e6f7b5bbb5b528b3334115806b0d76f50f9 Mon Sep 17 00:00:00 2001
From: Naoya Horiguchi <n-horiguchi@ah.jp.nec.com>
Date: Thu, 6 Aug 2015 15:47:08 -0700
Subject: mm: check __PG_HWPOISON separately from PAGE_FLAGS_CHECK_AT_*

The race condition addressed in commit add05cecef80 ("mm: soft-offline:
don't free target page in successful page migration") was not closed
completely, because that can happen not only for soft-offline, but also
for hard-offline.  Consider that a slab page is about to be freed into
buddy pool, and then an uncorrected memory error hits the page just
after entering __free_one_page(), then VM_BUG_ON_PAGE(page->flags &
PAGE_FLAGS_CHECK_AT_PREP) is triggered, despite the fact that it's not
necessary because the data on the affected page is not consumed.

To solve it, this patch drops __PG_HWPOISON from page flag checks at
allocation/free time.  I think it's justified because __PG_HWPOISON
flags is defined to prevent the page from being reused, and setting it
outside the page's alloc-free cycle is a designed behavior (not a bug.)

For recent months, I was annoyed about BUG_ON when soft-offlined page
remains on lru cache list for a while, which is avoided by calling
put_page() instead of putback_lru_page() in page migration's success
path.  This means that this patch reverts a major change from commit
add05cecef80 about the new refcounting rule of soft-offlined pages, so
"reuse window" revives.  This will be closed by a subsequent patch.

Signed-off-by: Naoya Horiguchi <n-horiguchi@ah.jp.nec.com>
Cc: Andi Kleen <andi@firstfloor.org>
Cc: Dean Nelson <dnelson@redhat.com>
Cc: Tony Luck <tony.luck@intel.com>
Cc: "Kirill A. Shutemov" <kirill@shutemov.name>
Cc: Hugh Dickins <hughd@google.com>
Cc: David Rientjes <rientjes@google.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 include/linux/page-flags.h | 10 +++++++---
 mm/huge_memory.c           |  7 +------
 mm/migrate.c               |  5 ++++-
 mm/page_alloc.c            |  4 ++++
 4 files changed, 16 insertions(+), 10 deletions(-)

(limited to 'include')

diff --git a/include/linux/page-flags.h b/include/linux/page-flags.h
index f34e040b34e9..41c93844fb1d 100644
--- a/include/linux/page-flags.h
+++ b/include/linux/page-flags.h
@@ -631,15 +631,19 @@ static inline void ClearPageSlabPfmemalloc(struct page *page)
 	 1 << PG_private | 1 << PG_private_2 | \
 	 1 << PG_writeback | 1 << PG_reserved | \
 	 1 << PG_slab	 | 1 << PG_swapcache | 1 << PG_active | \
-	 1 << PG_unevictable | __PG_MLOCKED | __PG_HWPOISON | \
+	 1 << PG_unevictable | __PG_MLOCKED | \
 	 __PG_COMPOUND_LOCK)
 
 /*
  * Flags checked when a page is prepped for return by the page allocator.
- * Pages being prepped should not have any flags set.  It they are set,
+ * Pages being prepped should not have these flags set.  It they are set,
  * there has been a kernel bug or struct page corruption.
+ *
+ * __PG_HWPOISON is exceptional because it needs to be kept beyond page's
+ * alloc-free cycle to prevent from reusing the page.
  */
-#define PAGE_FLAGS_CHECK_AT_PREP	((1 << NR_PAGEFLAGS) - 1)
+#define PAGE_FLAGS_CHECK_AT_PREP	\
+	(((1 << NR_PAGEFLAGS) - 1) & ~__PG_HWPOISON)
 
 #define PAGE_FLAGS_PRIVATE				\
 	(1 << PG_private | 1 << PG_private_2)
diff --git a/mm/huge_memory.c b/mm/huge_memory.c
index c107094f79ba..097c7a4bfbd9 100644
--- a/mm/huge_memory.c
+++ b/mm/huge_memory.c
@@ -1676,12 +1676,7 @@ static void __split_huge_page_refcount(struct page *page,
 		/* after clearing PageTail the gup refcount can be released */
 		smp_mb__after_atomic();
 
-		/*
-		 * retain hwpoison flag of the poisoned tail page:
-		 *   fix for the unsuitable process killed on Guest Machine(KVM)
-		 *   by the memory-failure.
-		 */
-		page_tail->flags &= ~PAGE_FLAGS_CHECK_AT_PREP | __PG_HWPOISON;
+		page_tail->flags &= ~PAGE_FLAGS_CHECK_AT_PREP;
 		page_tail->flags |= (page->flags &
 				     ((1L << PG_referenced) |
 				      (1L << PG_swapbacked) |
diff --git a/mm/migrate.c b/mm/migrate.c
index ee401e4e5ef1..f2415be7d93b 100644
--- a/mm/migrate.c
+++ b/mm/migrate.c
@@ -950,7 +950,10 @@ out:
 		list_del(&page->lru);
 		dec_zone_page_state(page, NR_ISOLATED_ANON +
 				page_is_file_cache(page));
-		if (reason != MR_MEMORY_FAILURE)
+		/* Soft-offlined page shouldn't go through lru cache list */
+		if (reason == MR_MEMORY_FAILURE)
+			put_page(page);
+		else
 			putback_lru_page(page);
 	}
 
diff --git a/mm/page_alloc.c b/mm/page_alloc.c
index cb61f44eb3fc..beda41710802 100644
--- a/mm/page_alloc.c
+++ b/mm/page_alloc.c
@@ -1296,6 +1296,10 @@ static inline int check_new_page(struct page *page)
 		bad_reason = "non-NULL mapping";
 	if (unlikely(atomic_read(&page->_count) != 0))
 		bad_reason = "nonzero _count";
+	if (unlikely(page->flags & __PG_HWPOISON)) {
+		bad_reason = "HWPoisoned (hardware-corrupted)";
+		bad_flags = __PG_HWPOISON;
+	}
 	if (unlikely(page->flags & PAGE_FLAGS_CHECK_AT_PREP)) {
 		bad_reason = "PAGE_FLAGS_CHECK_AT_PREP flag set";
 		bad_flags = PAGE_FLAGS_CHECK_AT_PREP;
-- 
cgit v1.2.3-71-gd317


From 209e4dbc8dcdb2b1839f18fd1cf07ec7bedadf4d Mon Sep 17 00:00:00 2001
From: Daniel Vetter <daniel.vetter@ffwll.ch>
Date: Fri, 7 Aug 2015 12:31:17 +0200
Subject: drm/vblank: Use u32 consistently for vblank counters
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

In

commit 99264a61dfcda41d86d0960cf2d4c0fc2758a773
Author: Daniel Vetter <daniel.vetter@ffwll.ch>
Date:   Wed Apr 15 19:34:43 2015 +0200

    drm/vblank: Fixup and document timestamp update/read barriers

I've switched vblank->count from atomic_t to unsigned long and
accidentally created an integer comparison bug in
drm_vblank_count_and_time since vblanke->count might overflow the u32
local copy and hence the retry loop never succeed.

Fix this by consistently using u32.

Cc: Michel Dänzer <michel@daenzer.net>
Reported-by: Michel Dänzer <michel@daenzer.net>
Reviewed-by: Thierry Reding <treding@nvidia.com>
Signed-off-by: Daniel Vetter <daniel.vetter@intel.com>
---
 drivers/gpu/drm/drm_irq.c | 2 +-
 include/drm/drmP.h        | 2 +-
 2 files changed, 2 insertions(+), 2 deletions(-)

(limited to 'include')

diff --git a/drivers/gpu/drm/drm_irq.c b/drivers/gpu/drm/drm_irq.c
index f9cc68fbd2a3..b50fa0afd907 100644
--- a/drivers/gpu/drm/drm_irq.c
+++ b/drivers/gpu/drm/drm_irq.c
@@ -75,7 +75,7 @@ module_param_named(timestamp_precision_usec, drm_timestamp_precision, int, 0600)
 module_param_named(timestamp_monotonic, drm_timestamp_monotonic, int, 0600);
 
 static void store_vblank(struct drm_device *dev, int crtc,
-			 unsigned vblank_count_inc,
+			 u32 vblank_count_inc,
 			 struct timeval *t_vblank)
 {
 	struct drm_vblank_crtc *vblank = &dev->vblank[crtc];
diff --git a/include/drm/drmP.h b/include/drm/drmP.h
index 48db6a56975f..5aa519711e0b 100644
--- a/include/drm/drmP.h
+++ b/include/drm/drmP.h
@@ -691,7 +691,7 @@ struct drm_vblank_crtc {
 	struct timer_list disable_timer;		/* delayed disable timer */
 
 	/* vblank counter, protected by dev->vblank_time_lock for writes */
-	unsigned long count;
+	u32 count;
 	/* vblank timestamps, protected by dev->vblank_time_lock for writes */
 	struct timeval time[DRM_VBLANKTIME_RBSIZE];
 
-- 
cgit v1.2.3-71-gd317


From 7a76a021cd5a292be875fbc616daf03eab1e6996 Mon Sep 17 00:00:00 2001
From: Benjamin Poirier <bpoirier@suse.com>
Date: Fri, 7 Aug 2015 09:32:21 -0700
Subject: net-timestamp: Update skb_complete_tx_timestamp comment

After "62bccb8 net-timestamp: Make the clone operation stand-alone from phy
timestamping" the hwtstamps parameter of skb_complete_tx_timestamp() may no
longer be NULL.

Signed-off-by: Benjamin Poirier <bpoirier@suse.com>
Cc: Alexander Duyck <alexander.h.duyck@redhat.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 include/linux/skbuff.h | 6 +++---
 1 file changed, 3 insertions(+), 3 deletions(-)

(limited to 'include')

diff --git a/include/linux/skbuff.h b/include/linux/skbuff.h
index d6cdd6e87d53..22b6d9ca1654 100644
--- a/include/linux/skbuff.h
+++ b/include/linux/skbuff.h
@@ -2884,11 +2884,11 @@ static inline bool skb_defer_rx_timestamp(struct sk_buff *skb)
  *
  * PHY drivers may accept clones of transmitted packets for
  * timestamping via their phy_driver.txtstamp method. These drivers
- * must call this function to return the skb back to the stack, with
- * or without a timestamp.
+ * must call this function to return the skb back to the stack with a
+ * timestamp.
  *
  * @skb: clone of the the original outgoing packet
- * @hwtstamps: hardware time stamps, may be NULL if not available
+ * @hwtstamps: hardware time stamps
  *
  */
 void skb_complete_tx_timestamp(struct sk_buff *skb,
-- 
cgit v1.2.3-71-gd317


From e037239e5e7b61007763984aa35a8329596d8c88 Mon Sep 17 00:00:00 2001
From: Alex Deucher <alexander.deucher@amd.com>
Date: Mon, 10 Aug 2015 15:28:49 -0400
Subject: drm/radeon: add new OLAND pci id

Signed-off-by: Alex Deucher <alexander.deucher@amd.com>
Cc: stable@vger.kernel.org
---
 include/drm/drm_pciids.h | 1 +
 1 file changed, 1 insertion(+)

(limited to 'include')

diff --git a/include/drm/drm_pciids.h b/include/drm/drm_pciids.h
index 45c39a37f924..8bc073d297db 100644
--- a/include/drm/drm_pciids.h
+++ b/include/drm/drm_pciids.h
@@ -172,6 +172,7 @@
 	{0x1002, 0x6610, PCI_ANY_ID, PCI_ANY_ID, 0, 0, CHIP_OLAND|RADEON_NEW_MEMMAP}, \
 	{0x1002, 0x6611, PCI_ANY_ID, PCI_ANY_ID, 0, 0, CHIP_OLAND|RADEON_NEW_MEMMAP}, \
 	{0x1002, 0x6613, PCI_ANY_ID, PCI_ANY_ID, 0, 0, CHIP_OLAND|RADEON_NEW_MEMMAP}, \
+	{0x1002, 0x6617, PCI_ANY_ID, PCI_ANY_ID, 0, 0, CHIP_OLAND|RADEON_IS_MOBILITY|RADEON_NEW_MEMMAP}, \
 	{0x1002, 0x6620, PCI_ANY_ID, PCI_ANY_ID, 0, 0, CHIP_OLAND|RADEON_IS_MOBILITY|RADEON_NEW_MEMMAP}, \
 	{0x1002, 0x6621, PCI_ANY_ID, PCI_ANY_ID, 0, 0, CHIP_OLAND|RADEON_IS_MOBILITY|RADEON_NEW_MEMMAP}, \
 	{0x1002, 0x6623, PCI_ANY_ID, PCI_ANY_ID, 0, 0, CHIP_OLAND|RADEON_IS_MOBILITY|RADEON_NEW_MEMMAP}, \
-- 
cgit v1.2.3-71-gd317