From e69f61862ab833e9b8d3c15b6ce07fd69f3bfecc Mon Sep 17 00:00:00 2001 From: Yacine Belkadi Date: Fri, 12 Jul 2013 20:45:47 +0200 Subject: sched: Fix some kernel-doc warnings When building the htmldocs (in verbose mode), scripts/kernel-doc reports the follwing type of warnings: Warning(kernel/sched/core.c:936): No description found for return value of 'task_curr' ... Fix those by: - adding the missing descriptions - using "Return" sections for the descriptions Signed-off-by: Yacine Belkadi Cc: Peter Zijlstra Link: http://lkml.kernel.org/r/1373654747-2389-1-git-send-email-yacine.belkadi.1@gmail.com [ While at it, fix the cpupri_set() explanation. ] Signed-off-by: Ingo Molnar --- include/linux/sched.h | 6 ++++++ 1 file changed, 6 insertions(+) (limited to 'include/linux') diff --git a/include/linux/sched.h b/include/linux/sched.h index 50d04b92ceda..82300247974c 100644 --- a/include/linux/sched.h +++ b/include/linux/sched.h @@ -1532,6 +1532,8 @@ static inline pid_t task_pgrp_nr(struct task_struct *tsk) * Test if a process is not yet dead (at most zombie state) * If pid_alive fails, then pointers within the task structure * can be stale and must not be dereferenced. + * + * Return: 1 if the process is alive. 0 otherwise. */ static inline int pid_alive(struct task_struct *p) { @@ -1543,6 +1545,8 @@ static inline int pid_alive(struct task_struct *p) * @tsk: Task structure to be checked. * * Check if a task structure is the first user space task the kernel created. + * + * Return: 1 if the task structure is init. 0 otherwise. */ static inline int is_global_init(struct task_struct *tsk) { @@ -1893,6 +1897,8 @@ extern struct task_struct *idle_task(int cpu); /** * is_idle_task - is the specified task an idle task? * @p: the task in question. + * + * Return: 1 if @p is an idle task. 0 otherwise. */ static inline bool is_idle_task(const struct task_struct *p) { -- cgit v1.2.3-71-gd317 From 7d46daba8dd5df1aa45724518a041ef7163d3ad5 Mon Sep 17 00:00:00 2001 From: Eli Cohen Date: Mon, 5 Aug 2013 16:05:32 +0300 Subject: mlx5: remove health handler plugin Remove this code, per Dave Miller's request, since it is not being used anywhere in the kernel. Signed-off-by: Eli Cohen Signed-off-by: David S. 
Miller --- drivers/net/ethernet/mellanox/mlx5/core/health.c | 29 +----------------------- include/linux/mlx5/driver.h | 3 --- 2 files changed, 1 insertion(+), 31 deletions(-) (limited to 'include/linux') diff --git a/drivers/net/ethernet/mellanox/mlx5/core/health.c b/drivers/net/ethernet/mellanox/mlx5/core/health.c index 748f10a155c4..3e6670c4a7cd 100644 --- a/drivers/net/ethernet/mellanox/mlx5/core/health.c +++ b/drivers/net/ethernet/mellanox/mlx5/core/health.c @@ -55,33 +55,9 @@ enum { }; static DEFINE_SPINLOCK(health_lock); - static LIST_HEAD(health_list); static struct work_struct health_work; -static health_handler_t reg_handler; -int mlx5_register_health_report_handler(health_handler_t handler) -{ - spin_lock_irq(&health_lock); - if (reg_handler) { - spin_unlock_irq(&health_lock); - return -EEXIST; - } - reg_handler = handler; - spin_unlock_irq(&health_lock); - - return 0; -} -EXPORT_SYMBOL(mlx5_register_health_report_handler); - -void mlx5_unregister_health_report_handler(void) -{ - spin_lock_irq(&health_lock); - reg_handler = NULL; - spin_unlock_irq(&health_lock); -} -EXPORT_SYMBOL(mlx5_unregister_health_report_handler); - static void health_care(struct work_struct *work) { struct mlx5_core_health *health, *n; @@ -98,11 +74,8 @@ static void health_care(struct work_struct *work) priv = container_of(health, struct mlx5_priv, health); dev = container_of(priv, struct mlx5_core_dev, priv); mlx5_core_warn(dev, "handling bad device here\n"); + /* nothing yet */ spin_lock_irq(&health_lock); - if (reg_handler) - reg_handler(dev->pdev, health->health, - sizeof(health->health)); - list_del_init(&health->list); spin_unlock_irq(&health_lock); } diff --git a/include/linux/mlx5/driver.h b/include/linux/mlx5/driver.h index 2aa258b0ced1..611e65e76b00 100644 --- a/include/linux/mlx5/driver.h +++ b/include/linux/mlx5/driver.h @@ -731,9 +731,6 @@ void mlx5_cq_debugfs_cleanup(struct mlx5_core_dev *dev); int mlx5_db_alloc(struct mlx5_core_dev *dev, struct mlx5_db *db); void mlx5_db_free(struct mlx5_core_dev *dev, struct mlx5_db *db); -typedef void (*health_handler_t)(struct pci_dev *pdev, struct health_buffer __iomem *buf, int size); -int mlx5_register_health_report_handler(health_handler_t handler); -void mlx5_unregister_health_report_handler(void); const char *mlx5_command_str(int command); int mlx5_cmdif_debugfs_init(struct mlx5_core_dev *dev); void mlx5_cmdif_debugfs_cleanup(struct mlx5_core_dev *dev); -- cgit v1.2.3-71-gd317 From e0acd0a68ec7dbf6b7a81a87a867ebd7ac9b76c4 Mon Sep 17 00:00:00 2001 From: Oleg Nesterov Date: Mon, 12 Aug 2013 18:14:00 +0200 Subject: sched: fix the theoretical signal_wake_up() vs schedule() race This is only theoretical, but after try_to_wake_up(p) was changed to check p->state under p->pi_lock the code like __set_current_state(TASK_INTERRUPTIBLE); schedule(); can miss a signal. This is the special case of wait-for-condition, it relies on try_to_wake_up/schedule interaction and thus it does not need mb() between __set_current_state() and if(signal_pending). However, this __set_current_state() can move into the critical section protected by rq->lock, now that try_to_wake_up() takes another lock we need to ensure that it can't be reordered with "if (signal_pending(current))" check inside that section. The patch is actually one-liner, it simply adds smp_wmb() before spin_lock_irq(rq->lock). This is what try_to_wake_up() already does by the same reason. 
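For illustration, a minimal sketch of the wait/wake pairing in question (the flag and function names below are placeholders, not taken from the patch):

	#include <linux/sched.h>

	static int condition;

	/* Waiter side: the state store and the condition check must not be
	 * reordered with the checks try_to_wake_up()/schedule() do under
	 * their locks. */
	static void wait_for_condition(void)
	{
		for (;;) {
			set_current_state(TASK_INTERRUPTIBLE);	/* implies a full mb() */
			if (condition)
				break;
			/*
			 * schedule() now issues smp_mb__before_spinlock() before
			 * taking rq->lock, so its signal_pending_state() check
			 * cannot be reordered with the state store done above.
			 */
			schedule();
		}
		__set_current_state(TASK_RUNNING);
	}

	/* Waker side: the store to "condition" must not be reordered with the
	 * p->state check done under p->pi_lock inside try_to_wake_up(). */
	static void make_condition_true(struct task_struct *waiter)
	{
		condition = 1;
		wake_up_process(waiter);
	}
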
We turn this wmb() into the new helper, smp_mb__before_spinlock(), for better documentation and to allow the architectures to change the default implementation. While at it, kill smp_mb__after_lock(), it has no callers. Perhaps we can also add smp_mb__before/after_spinunlock() for prepare_to_wait(). Signed-off-by: Oleg Nesterov Acked-by: Peter Zijlstra Signed-off-by: Linus Torvalds --- arch/x86/include/asm/spinlock.h | 4 ---- include/linux/spinlock.h | 14 +++++++++++--- kernel/sched/core.c | 14 +++++++++++++- 3 files changed, 24 insertions(+), 8 deletions(-) (limited to 'include/linux') diff --git a/arch/x86/include/asm/spinlock.h b/arch/x86/include/asm/spinlock.h index 33692eaabab5..e3ddd7db723f 100644 --- a/arch/x86/include/asm/spinlock.h +++ b/arch/x86/include/asm/spinlock.h @@ -233,8 +233,4 @@ static inline void arch_write_unlock(arch_rwlock_t *rw) #define arch_read_relax(lock) cpu_relax() #define arch_write_relax(lock) cpu_relax() -/* The {read|write|spin}_lock() on x86 are full memory barriers. */ -static inline void smp_mb__after_lock(void) { } -#define ARCH_HAS_SMP_MB_AFTER_LOCK - #endif /* _ASM_X86_SPINLOCK_H */ diff --git a/include/linux/spinlock.h b/include/linux/spinlock.h index 7d537ced949a..75f34949d9ab 100644 --- a/include/linux/spinlock.h +++ b/include/linux/spinlock.h @@ -117,9 +117,17 @@ do { \ #endif /*arch_spin_is_contended*/ #endif -/* The lock does not imply full memory barrier. */ -#ifndef ARCH_HAS_SMP_MB_AFTER_LOCK -static inline void smp_mb__after_lock(void) { smp_mb(); } +/* + * Despite its name it doesn't necessarily has to be a full barrier. + * It should only guarantee that a STORE before the critical section + * can not be reordered with a LOAD inside this section. + * spin_lock() is the one-way barrier, this LOAD can not escape out + * of the region. So the default implementation simply ensures that + * a STORE can not move into the critical section, smp_wmb() should + * serialize it with another STORE done by spin_lock(). + */ +#ifndef smp_mb__before_spinlock +#define smp_mb__before_spinlock() smp_wmb() #endif /** diff --git a/kernel/sched/core.c b/kernel/sched/core.c index b7c32cb7bfeb..ef51b0ef4bdc 100644 --- a/kernel/sched/core.c +++ b/kernel/sched/core.c @@ -1491,7 +1491,13 @@ try_to_wake_up(struct task_struct *p, unsigned int state, int wake_flags) unsigned long flags; int cpu, success = 0; - smp_wmb(); + /* + * If we are going to wake up a thread waiting for CONDITION we + * need to ensure that CONDITION=1 done by the caller can not be + * reordered with p->state check below. This pairs with mb() in + * set_current_state() the waiting thread does. + */ + smp_mb__before_spinlock(); raw_spin_lock_irqsave(&p->pi_lock, flags); if (!(p->state & state)) goto out; @@ -2394,6 +2400,12 @@ need_resched: if (sched_feat(HRTICK)) hrtick_clear(rq); + /* + * Make sure that signal_pending_state()->signal_pending() below + * can't be reordered with __set_current_state(TASK_INTERRUPTIBLE) + * done by the caller to avoid the race with signal_wake_up(). + */ + smp_mb__before_spinlock(); raw_spin_lock_irq(&rq->lock); switch_count = &prev->nivcsw; -- cgit v1.2.3-71-gd317 From 179ef71cbc085252e3fe6b8159263a7ed1d88ea4 Mon Sep 17 00:00:00 2001 From: Cyrill Gorcunov Date: Tue, 13 Aug 2013 16:00:49 -0700 Subject: mm: save soft-dirty bits on swapped pages Andy Lutomirski reported that if a page with _PAGE_SOFT_DIRTY bit set get swapped out, the bit is getting lost and no longer available when pte read back. 
To resolve this we introduce _PTE_SWP_SOFT_DIRTY bit which is saved in pte entry for the page being swapped out. When such page is to be read back from a swap cache we check for bit presence and if it's there we clear it and restore the former _PAGE_SOFT_DIRTY bit back. One of the problem was to find a place in pte entry where we can save the _PTE_SWP_SOFT_DIRTY bit while page is in swap. The _PAGE_PSE was chosen for that, it doesn't intersect with swap entry format stored in pte. Reported-by: Andy Lutomirski Signed-off-by: Cyrill Gorcunov Acked-by: Pavel Emelyanov Cc: Matt Mackall Cc: Xiao Guangrong Cc: Marcelo Tosatti Cc: KOSAKI Motohiro Cc: Stephen Rothwell Cc: Peter Zijlstra Cc: "Aneesh Kumar K.V" Reviewed-by: Minchan Kim Reviewed-by: Wanpeng Li Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- arch/x86/include/asm/pgtable.h | 15 +++++++++++++++ arch/x86/include/asm/pgtable_types.h | 13 +++++++++++++ fs/proc/task_mmu.c | 21 +++++++++++++++------ include/asm-generic/pgtable.h | 15 +++++++++++++++ include/linux/swapops.h | 2 ++ mm/memory.c | 2 ++ mm/rmap.c | 6 +++++- mm/swapfile.c | 19 +++++++++++++++++-- 8 files changed, 84 insertions(+), 9 deletions(-) (limited to 'include/linux') diff --git a/arch/x86/include/asm/pgtable.h b/arch/x86/include/asm/pgtable.h index 7dc305a46058..bd0518a7f197 100644 --- a/arch/x86/include/asm/pgtable.h +++ b/arch/x86/include/asm/pgtable.h @@ -314,6 +314,21 @@ static inline pmd_t pmd_mksoft_dirty(pmd_t pmd) return pmd_set_flags(pmd, _PAGE_SOFT_DIRTY); } +static inline pte_t pte_swp_mksoft_dirty(pte_t pte) +{ + return pte_set_flags(pte, _PAGE_SWP_SOFT_DIRTY); +} + +static inline int pte_swp_soft_dirty(pte_t pte) +{ + return pte_flags(pte) & _PAGE_SWP_SOFT_DIRTY; +} + +static inline pte_t pte_swp_clear_soft_dirty(pte_t pte) +{ + return pte_clear_flags(pte, _PAGE_SWP_SOFT_DIRTY); +} + /* * Mask out unsupported bits in a present pgprot. Non-present pgprots * can use those bits for other purposes, so leave them be. diff --git a/arch/x86/include/asm/pgtable_types.h b/arch/x86/include/asm/pgtable_types.h index c98ac63aae48..5e8442f178f9 100644 --- a/arch/x86/include/asm/pgtable_types.h +++ b/arch/x86/include/asm/pgtable_types.h @@ -67,6 +67,19 @@ #define _PAGE_SOFT_DIRTY (_AT(pteval_t, 0)) #endif +/* + * Tracking soft dirty bit when a page goes to a swap is tricky. + * We need a bit which can be stored in pte _and_ not conflict + * with swap entry format. On x86 bits 6 and 7 are *not* involved + * into swap entry computation, but bit 6 is used for nonlinear + * file mapping, so we borrow bit 7 for soft dirty tracking. + */ +#ifdef CONFIG_MEM_SOFT_DIRTY +#define _PAGE_SWP_SOFT_DIRTY _PAGE_PSE +#else +#define _PAGE_SWP_SOFT_DIRTY (_AT(pteval_t, 0)) +#endif + #if defined(CONFIG_X86_64) || defined(CONFIG_X86_PAE) #define _PAGE_NX (_AT(pteval_t, 1) << _PAGE_BIT_NX) #else diff --git a/fs/proc/task_mmu.c b/fs/proc/task_mmu.c index dbf61f6174f0..e2d9bdce5e7e 100644 --- a/fs/proc/task_mmu.c +++ b/fs/proc/task_mmu.c @@ -730,8 +730,14 @@ static inline void clear_soft_dirty(struct vm_area_struct *vma, * of how soft-dirty works. 
*/ pte_t ptent = *pte; - ptent = pte_wrprotect(ptent); - ptent = pte_clear_flags(ptent, _PAGE_SOFT_DIRTY); + + if (pte_present(ptent)) { + ptent = pte_wrprotect(ptent); + ptent = pte_clear_flags(ptent, _PAGE_SOFT_DIRTY); + } else if (is_swap_pte(ptent)) { + ptent = pte_swp_clear_soft_dirty(ptent); + } + set_pte_at(vma->vm_mm, addr, pte, ptent); #endif } @@ -752,14 +758,15 @@ static int clear_refs_pte_range(pmd_t *pmd, unsigned long addr, pte = pte_offset_map_lock(vma->vm_mm, pmd, addr, &ptl); for (; addr != end; pte++, addr += PAGE_SIZE) { ptent = *pte; - if (!pte_present(ptent)) - continue; if (cp->type == CLEAR_REFS_SOFT_DIRTY) { clear_soft_dirty(vma, addr, pte); continue; } + if (!pte_present(ptent)) + continue; + page = vm_normal_page(vma, addr, ptent); if (!page) continue; @@ -930,8 +937,10 @@ static void pte_to_pagemap_entry(pagemap_entry_t *pme, struct pagemapread *pm, flags = PM_PRESENT; page = vm_normal_page(vma, addr, pte); } else if (is_swap_pte(pte)) { - swp_entry_t entry = pte_to_swp_entry(pte); - + swp_entry_t entry; + if (pte_swp_soft_dirty(pte)) + flags2 |= __PM_SOFT_DIRTY; + entry = pte_to_swp_entry(pte); frame = swp_type(entry) | (swp_offset(entry) << MAX_SWAPFILES_SHIFT); flags = PM_SWAP; diff --git a/include/asm-generic/pgtable.h b/include/asm-generic/pgtable.h index 2f47ade1b567..2a7e0d10ad9a 100644 --- a/include/asm-generic/pgtable.h +++ b/include/asm-generic/pgtable.h @@ -417,6 +417,21 @@ static inline pmd_t pmd_mksoft_dirty(pmd_t pmd) { return pmd; } + +static inline pte_t pte_swp_mksoft_dirty(pte_t pte) +{ + return pte; +} + +static inline int pte_swp_soft_dirty(pte_t pte) +{ + return 0; +} + +static inline pte_t pte_swp_clear_soft_dirty(pte_t pte) +{ + return pte; +} #endif #ifndef __HAVE_PFNMAP_TRACKING diff --git a/include/linux/swapops.h b/include/linux/swapops.h index c5fd30d2a415..8d4fa82bfb91 100644 --- a/include/linux/swapops.h +++ b/include/linux/swapops.h @@ -67,6 +67,8 @@ static inline swp_entry_t pte_to_swp_entry(pte_t pte) swp_entry_t arch_entry; BUG_ON(pte_file(pte)); + if (pte_swp_soft_dirty(pte)) + pte = pte_swp_clear_soft_dirty(pte); arch_entry = __pte_to_swp_entry(pte); return swp_entry(__swp_type(arch_entry), __swp_offset(arch_entry)); } diff --git a/mm/memory.c b/mm/memory.c index 1ce2e2a734fc..e98ecad2b9c8 100644 --- a/mm/memory.c +++ b/mm/memory.c @@ -3115,6 +3115,8 @@ static int do_swap_page(struct mm_struct *mm, struct vm_area_struct *vma, exclusive = 1; } flush_icache_page(vma, page); + if (pte_swp_soft_dirty(orig_pte)) + pte = pte_mksoft_dirty(pte); set_pte_at(mm, address, page_table, pte); if (page == swapcache) do_page_add_anon_rmap(page, vma, address, exclusive); diff --git a/mm/rmap.c b/mm/rmap.c index cd356df4f71a..83325b80142b 100644 --- a/mm/rmap.c +++ b/mm/rmap.c @@ -1236,6 +1236,7 @@ int try_to_unmap_one(struct page *page, struct vm_area_struct *vma, swp_entry_to_pte(make_hwpoison_entry(page))); } else if (PageAnon(page)) { swp_entry_t entry = { .val = page_private(page) }; + pte_t swp_pte; if (PageSwapCache(page)) { /* @@ -1264,7 +1265,10 @@ int try_to_unmap_one(struct page *page, struct vm_area_struct *vma, BUG_ON(TTU_ACTION(flags) != TTU_MIGRATION); entry = make_migration_entry(page, pte_write(pteval)); } - set_pte_at(mm, address, pte, swp_entry_to_pte(entry)); + swp_pte = swp_entry_to_pte(entry); + if (pte_soft_dirty(pteval)) + swp_pte = pte_swp_mksoft_dirty(swp_pte); + set_pte_at(mm, address, pte, swp_pte); BUG_ON(pte_file(*pte)); } else if (IS_ENABLED(CONFIG_MIGRATION) && (TTU_ACTION(flags) == TTU_MIGRATION)) { diff --git 
a/mm/swapfile.c b/mm/swapfile.c index 36af6eeaa67e..6cf2e60983b7 100644 --- a/mm/swapfile.c +++ b/mm/swapfile.c @@ -866,6 +866,21 @@ unsigned int count_swap_pages(int type, int free) } #endif /* CONFIG_HIBERNATION */ +static inline int maybe_same_pte(pte_t pte, pte_t swp_pte) +{ +#ifdef CONFIG_MEM_SOFT_DIRTY + /* + * When pte keeps soft dirty bit the pte generated + * from swap entry does not has it, still it's same + * pte from logical point of view. + */ + pte_t swp_pte_dirty = pte_swp_mksoft_dirty(swp_pte); + return pte_same(pte, swp_pte) || pte_same(pte, swp_pte_dirty); +#else + return pte_same(pte, swp_pte); +#endif +} + /* * No need to decide whether this PTE shares the swap entry with others, * just let do_wp_page work it out if a write is requested later - to @@ -892,7 +907,7 @@ static int unuse_pte(struct vm_area_struct *vma, pmd_t *pmd, } pte = pte_offset_map_lock(vma->vm_mm, pmd, addr, &ptl); - if (unlikely(!pte_same(*pte, swp_entry_to_pte(entry)))) { + if (unlikely(!maybe_same_pte(*pte, swp_entry_to_pte(entry)))) { mem_cgroup_cancel_charge_swapin(memcg); ret = 0; goto out; @@ -947,7 +962,7 @@ static int unuse_pte_range(struct vm_area_struct *vma, pmd_t *pmd, * swapoff spends a _lot_ of time in this loop! * Test inline before going to call unuse_pte. */ - if (unlikely(pte_same(*pte, swp_pte))) { + if (unlikely(maybe_same_pte(*pte, swp_pte))) { pte_unmap(pte); ret = unuse_pte(vma, pmd, addr, entry, page); if (ret) -- cgit v1.2.3-71-gd317 From dfa9771a7c4784bafd0673bc7abcee3813088b77 Mon Sep 17 00:00:00 2001 From: Michal Simek Date: Tue, 13 Aug 2013 16:00:53 -0700 Subject: microblaze: fix clone syscall Fix inadvertent breakage in the clone syscall ABI for Microblaze that was introduced in commit f3268edbe6fe ("microblaze: switch to generic fork/vfork/clone"). The Microblaze syscall ABI for clone takes the parent tid address in the 4th argument; the third argument slot is used for the stack size. The incorrectly-used CLONE_BACKWARDS type assigned parent tid to the 3rd slot. This commit restores the original ABI so that existing userspace libc code will work correctly. All kernel versions from v3.8-rc1 were affected. Signed-off-by: Michal Simek Cc: Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- arch/Kconfig | 6 ++++++ arch/microblaze/Kconfig | 2 +- include/linux/syscalls.h | 5 +++++ kernel/fork.c | 6 ++++++ 4 files changed, 18 insertions(+), 1 deletion(-) (limited to 'include/linux') diff --git a/arch/Kconfig b/arch/Kconfig index 8d2ae24b9f4a..1feb169274fe 100644 --- a/arch/Kconfig +++ b/arch/Kconfig @@ -407,6 +407,12 @@ config CLONE_BACKWARDS2 help Architecture has the first two arguments of clone(2) swapped. +config CLONE_BACKWARDS3 + bool + help + Architecture has tls passed as the 3rd argument of clone(2), + not the 5th one. 
+ config ODD_RT_SIGACTION bool help diff --git a/arch/microblaze/Kconfig b/arch/microblaze/Kconfig index d22a4ecffff4..4fab52294d98 100644 --- a/arch/microblaze/Kconfig +++ b/arch/microblaze/Kconfig @@ -28,7 +28,7 @@ config MICROBLAZE select GENERIC_CLOCKEVENTS select GENERIC_IDLE_POLL_SETUP select MODULES_USE_ELF_RELA - select CLONE_BACKWARDS + select CLONE_BACKWARDS3 config SWAP def_bool n diff --git a/include/linux/syscalls.h b/include/linux/syscalls.h index 4147d700a293..84662ecc7b51 100644 --- a/include/linux/syscalls.h +++ b/include/linux/syscalls.h @@ -802,9 +802,14 @@ asmlinkage long sys_vfork(void); asmlinkage long sys_clone(unsigned long, unsigned long, int __user *, int, int __user *); #else +#ifdef CONFIG_CLONE_BACKWARDS3 +asmlinkage long sys_clone(unsigned long, unsigned long, int, int __user *, + int __user *, int); +#else asmlinkage long sys_clone(unsigned long, unsigned long, int __user *, int __user *, int); #endif +#endif asmlinkage long sys_execve(const char __user *filename, const char __user *const __user *argv, diff --git a/kernel/fork.c b/kernel/fork.c index 403d2bb8a968..e23bb19e2a3e 100644 --- a/kernel/fork.c +++ b/kernel/fork.c @@ -1679,6 +1679,12 @@ SYSCALL_DEFINE5(clone, unsigned long, newsp, unsigned long, clone_flags, int __user *, parent_tidptr, int __user *, child_tidptr, int, tls_val) +#elif defined(CONFIG_CLONE_BACKWARDS3) +SYSCALL_DEFINE6(clone, unsigned long, clone_flags, unsigned long, newsp, + int, stack_size, + int __user *, parent_tidptr, + int __user *, child_tidptr, + int, tls_val) #else SYSCALL_DEFINE5(clone, unsigned long, clone_flags, unsigned long, newsp, int __user *, parent_tidptr, -- cgit v1.2.3-71-gd317 From df54d6fa54275ce59660453e29d1228c2b45a826 Mon Sep 17 00:00:00 2001 From: Radu Caragea Date: Tue, 13 Aug 2013 16:00:59 -0700 Subject: x86 get_unmapped_area(): use proper mmap base for bottom-up direction When the stack is set to unlimited, the bottomup direction is used for mmap-ings but the mmap_base is not used and thus effectively renders ASLR for mmapings along with PIE useless. 
Cc: Michel Lespinasse Cc: Oleg Nesterov Reviewed-by: Rik van Riel Acked-by: Ingo Molnar Cc: Adrian Sendroiu Cc: Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- arch/x86/kernel/sys_x86_64.c | 2 +- arch/x86/mm/mmap.c | 2 +- include/linux/sched.h | 1 + 3 files changed, 3 insertions(+), 2 deletions(-) (limited to 'include/linux') diff --git a/arch/x86/kernel/sys_x86_64.c b/arch/x86/kernel/sys_x86_64.c index dbded5aedb81..48f8375e4c6b 100644 --- a/arch/x86/kernel/sys_x86_64.c +++ b/arch/x86/kernel/sys_x86_64.c @@ -101,7 +101,7 @@ static void find_start_end(unsigned long flags, unsigned long *begin, *begin = new_begin; } } else { - *begin = TASK_UNMAPPED_BASE; + *begin = mmap_legacy_base(); *end = TASK_SIZE; } } diff --git a/arch/x86/mm/mmap.c b/arch/x86/mm/mmap.c index 62c29a5bfe26..f63778cb2363 100644 --- a/arch/x86/mm/mmap.c +++ b/arch/x86/mm/mmap.c @@ -98,7 +98,7 @@ static unsigned long mmap_base(void) * Bottom-up (legacy) layout on X86_32 did not support randomization, X86_64 * does, but not when emulating X86_32 */ -static unsigned long mmap_legacy_base(void) +unsigned long mmap_legacy_base(void) { if (mmap_is_ia32()) return TASK_UNMAPPED_BASE; diff --git a/include/linux/sched.h b/include/linux/sched.h index d722490da030..923dd6ea4a0e 100644 --- a/include/linux/sched.h +++ b/include/linux/sched.h @@ -314,6 +314,7 @@ struct nsproxy; struct user_namespace; #ifdef CONFIG_MMU +extern unsigned long mmap_legacy_base(void); extern void arch_pick_mmap_layout(struct mm_struct *mm); extern unsigned long arch_get_unmapped_area(struct file *, unsigned long, unsigned long, -- cgit v1.2.3-71-gd317 From 3f0fa9a808f98fa10a18ba2a73f13d65fda990fb Mon Sep 17 00:00:00 2001 From: Kevin Hilman Date: Wed, 14 Aug 2013 16:05:02 -0700 Subject: regmap: Add another missing header for !CONFIG_REGMAP stubs The use of WARN_ON() needs the definitions from bug.h, without it you can get: include/linux/regmap.h: In function 'regmap_write': include/linux/regmap.h:525:2: error: implicit declaration of function 'WARN_ONCE' [-Werror=implicit-function-declaration] Signed-off-by: Kevin Hilman Signed-off-by: Mark Brown Cc: stable@vger.kernel.org --- include/linux/regmap.h | 1 + 1 file changed, 1 insertion(+) (limited to 'include/linux') diff --git a/include/linux/regmap.h b/include/linux/regmap.h index 580a5320cc96..6d91fcb4c5cb 100644 --- a/include/linux/regmap.h +++ b/include/linux/regmap.h @@ -16,6 +16,7 @@ #include #include #include +#include struct module; struct device; -- cgit v1.2.3-71-gd317 From 0a324f3189ed9c78b1aaf48d88e93cb18643c655 Mon Sep 17 00:00:00 2001 From: Moshe Lazer Date: Wed, 14 Aug 2013 17:46:48 +0300 Subject: net/mlx5_core: Support MANAGE_PAGES and QUERY_PAGES firmware command changes In the previous QUERY_PAGES command version we used one command to get the required amount of boot, init and post init pages. The new version uses the op_mod field to specify whether the query is for the required amount of boot, init or post init pages. In addition the output field size for the required amount of pages increased from 16 to 32 bits. In MANAGE_PAGES command the input_num_entries and output_num_entries fields sizes changed from 16 to 32 bits and the PAS tables offset changed to 0x10. In the pages request event the num_pages field also changed to 32 bits. In the HCA-capabilities-layout the size and location of max_qp_mcg field has been changed to support 24 bits. 
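As an illustrative aside (the struct below is a simplified stand-in, not the actual command layout), the widened fields are decoded roughly like this: the 32-bit big-endian page count is read with be32_to_cpu(), and the 24-bit max_qp_mcg value is masked out of its 32-bit container:

	#include <linux/types.h>
	#include <asm/byteorder.h>

	/* Simplified stand-in for part of the firmware query output. */
	struct fw_caps_sketch {
		__be32 max_qp_mcg;	/* high byte reserved, low 24 bits valid */
		__be32 num_pages;	/* widened from 16 to 32 bits */
	};

	static void decode_caps_sketch(const struct fw_caps_sketch *out,
				       u32 *max_qp_mcg, s32 *npages)
	{
		*max_qp_mcg = be32_to_cpu(out->max_qp_mcg) & 0xffffff;
		*npages = (s32)be32_to_cpu(out->num_pages);
	}
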
This patch isn't compatible with firmware versions < 5; however, it turns out that the first GA firmware we will publish will not support previous versions so this should be OK. Signed-off-by: Moshe Lazer Signed-off-by: Eli Cohen Signed-off-by: David S. Miller --- drivers/net/ethernet/mellanox/mlx5/core/cmd.c | 2 +- drivers/net/ethernet/mellanox/mlx5/core/eq.c | 2 +- drivers/net/ethernet/mellanox/mlx5/core/fw.c | 2 +- .../net/ethernet/mellanox/mlx5/core/pagealloc.c | 58 ++++++++++------------ include/linux/mlx5/device.h | 22 ++++---- include/linux/mlx5/driver.h | 4 +- 6 files changed, 41 insertions(+), 49 deletions(-) (limited to 'include/linux') diff --git a/drivers/net/ethernet/mellanox/mlx5/core/cmd.c b/drivers/net/ethernet/mellanox/mlx5/core/cmd.c index c571de85d0f9..5472cbd34028 100644 --- a/drivers/net/ethernet/mellanox/mlx5/core/cmd.c +++ b/drivers/net/ethernet/mellanox/mlx5/core/cmd.c @@ -46,7 +46,7 @@ #include "mlx5_core.h" enum { - CMD_IF_REV = 4, + CMD_IF_REV = 5, }; enum { diff --git a/drivers/net/ethernet/mellanox/mlx5/core/eq.c b/drivers/net/ethernet/mellanox/mlx5/core/eq.c index c02cbcfd0fb8..443cc4d7b024 100644 --- a/drivers/net/ethernet/mellanox/mlx5/core/eq.c +++ b/drivers/net/ethernet/mellanox/mlx5/core/eq.c @@ -268,7 +268,7 @@ static int mlx5_eq_int(struct mlx5_core_dev *dev, struct mlx5_eq *eq) case MLX5_EVENT_TYPE_PAGE_REQUEST: { u16 func_id = be16_to_cpu(eqe->data.req_pages.func_id); - s16 npages = be16_to_cpu(eqe->data.req_pages.num_pages); + s32 npages = be32_to_cpu(eqe->data.req_pages.num_pages); mlx5_core_dbg(dev, "page request for func 0x%x, napges %d\n", func_id, npages); mlx5_core_req_pages_handler(dev, func_id, npages); diff --git a/drivers/net/ethernet/mellanox/mlx5/core/fw.c b/drivers/net/ethernet/mellanox/mlx5/core/fw.c index 72a5222447f5..f012658b6a92 100644 --- a/drivers/net/ethernet/mellanox/mlx5/core/fw.c +++ b/drivers/net/ethernet/mellanox/mlx5/core/fw.c @@ -113,7 +113,7 @@ int mlx5_cmd_query_hca_cap(struct mlx5_core_dev *dev, caps->log_max_srq = out->hca_cap.log_max_srqs & 0x1f; caps->local_ca_ack_delay = out->hca_cap.local_ca_ack_delay & 0x1f; caps->log_max_mcg = out->hca_cap.log_max_mcg; - caps->max_qp_mcg = be16_to_cpu(out->hca_cap.max_qp_mcg); + caps->max_qp_mcg = be32_to_cpu(out->hca_cap.max_qp_mcg) & 0xffffff; caps->max_ra_res_qp = 1 << (out->hca_cap.log_max_ra_res_qp & 0x3f); caps->max_ra_req_qp = 1 << (out->hca_cap.log_max_ra_req_qp & 0x3f); caps->max_srq_wqes = 1 << out->hca_cap.log_max_srq_sz; diff --git a/drivers/net/ethernet/mellanox/mlx5/core/pagealloc.c b/drivers/net/ethernet/mellanox/mlx5/core/pagealloc.c index 4a3e137931a3..3a2408d44820 100644 --- a/drivers/net/ethernet/mellanox/mlx5/core/pagealloc.c +++ b/drivers/net/ethernet/mellanox/mlx5/core/pagealloc.c @@ -43,10 +43,16 @@ enum { MLX5_PAGES_TAKE = 2 }; +enum { + MLX5_BOOT_PAGES = 1, + MLX5_INIT_PAGES = 2, + MLX5_POST_INIT_PAGES = 3 +}; + struct mlx5_pages_req { struct mlx5_core_dev *dev; u32 func_id; - s16 npages; + s32 npages; struct work_struct work; }; @@ -64,27 +70,23 @@ struct mlx5_query_pages_inbox { struct mlx5_query_pages_outbox { struct mlx5_outbox_hdr hdr; - __be16 num_boot_pages; + __be16 rsvd; __be16 func_id; - __be16 init_pages; - __be16 num_pages; + __be32 num_pages; }; struct mlx5_manage_pages_inbox { struct mlx5_inbox_hdr hdr; - __be16 rsvd0; + __be16 rsvd; __be16 func_id; - __be16 rsvd1; - __be16 num_entries; - u8 rsvd2[16]; + __be32 num_entries; __be64 pas[0]; }; struct mlx5_manage_pages_outbox { struct mlx5_outbox_hdr hdr; - u8 rsvd0[2]; - __be16 num_entries; - 
u8 rsvd1[20]; + __be32 num_entries; + u8 rsvd[4]; __be64 pas[0]; }; @@ -146,7 +148,7 @@ static struct page *remove_page(struct mlx5_core_dev *dev, u64 addr) } static int mlx5_cmd_query_pages(struct mlx5_core_dev *dev, u16 *func_id, - s16 *pages, s16 *init_pages, u16 *boot_pages) + s32 *npages, int boot) { struct mlx5_query_pages_inbox in; struct mlx5_query_pages_outbox out; @@ -155,6 +157,8 @@ static int mlx5_cmd_query_pages(struct mlx5_core_dev *dev, u16 *func_id, memset(&in, 0, sizeof(in)); memset(&out, 0, sizeof(out)); in.hdr.opcode = cpu_to_be16(MLX5_CMD_OP_QUERY_PAGES); + in.hdr.opmod = boot ? cpu_to_be16(MLX5_BOOT_PAGES) : cpu_to_be16(MLX5_INIT_PAGES); + err = mlx5_cmd_exec(dev, &in, sizeof(in), &out, sizeof(out)); if (err) return err; @@ -162,15 +166,7 @@ static int mlx5_cmd_query_pages(struct mlx5_core_dev *dev, u16 *func_id, if (out.hdr.status) return mlx5_cmd_status_to_err(&out.hdr); - if (pages) - *pages = be16_to_cpu(out.num_pages); - - if (init_pages) - *init_pages = be16_to_cpu(out.init_pages); - - if (boot_pages) - *boot_pages = be16_to_cpu(out.num_boot_pages); - + *npages = be32_to_cpu(out.num_pages); *func_id = be16_to_cpu(out.func_id); return err; @@ -224,7 +220,7 @@ static int give_pages(struct mlx5_core_dev *dev, u16 func_id, int npages, in->hdr.opcode = cpu_to_be16(MLX5_CMD_OP_MANAGE_PAGES); in->hdr.opmod = cpu_to_be16(MLX5_PAGES_GIVE); in->func_id = cpu_to_be16(func_id); - in->num_entries = cpu_to_be16(npages); + in->num_entries = cpu_to_be32(npages); err = mlx5_cmd_exec(dev, in, inlen, &out, sizeof(out)); mlx5_core_dbg(dev, "err %d\n", err); if (err) { @@ -292,7 +288,7 @@ static int reclaim_pages(struct mlx5_core_dev *dev, u32 func_id, int npages, in.hdr.opcode = cpu_to_be16(MLX5_CMD_OP_MANAGE_PAGES); in.hdr.opmod = cpu_to_be16(MLX5_PAGES_TAKE); in.func_id = cpu_to_be16(func_id); - in.num_entries = cpu_to_be16(npages); + in.num_entries = cpu_to_be32(npages); mlx5_core_dbg(dev, "npages %d, outlen %d\n", npages, outlen); err = mlx5_cmd_exec(dev, &in, sizeof(in), out, outlen); if (err) { @@ -306,7 +302,7 @@ static int reclaim_pages(struct mlx5_core_dev *dev, u32 func_id, int npages, goto out_free; } - num_claimed = be16_to_cpu(out->num_entries); + num_claimed = be32_to_cpu(out->num_entries); if (nclaimed) *nclaimed = num_claimed; @@ -345,7 +341,7 @@ static void pages_work_handler(struct work_struct *work) } void mlx5_core_req_pages_handler(struct mlx5_core_dev *dev, u16 func_id, - s16 npages) + s32 npages) { struct mlx5_pages_req *req; @@ -364,20 +360,18 @@ void mlx5_core_req_pages_handler(struct mlx5_core_dev *dev, u16 func_id, int mlx5_satisfy_startup_pages(struct mlx5_core_dev *dev, int boot) { - u16 uninitialized_var(boot_pages); - s16 uninitialized_var(init_pages); u16 uninitialized_var(func_id); + s32 uninitialized_var(npages); int err; - err = mlx5_cmd_query_pages(dev, &func_id, NULL, &init_pages, - &boot_pages); + err = mlx5_cmd_query_pages(dev, &func_id, &npages, boot); if (err) return err; + mlx5_core_dbg(dev, "requested %d %s pages for func_id 0x%x\n", + npages, boot ? "boot" : "init", func_id); - mlx5_core_dbg(dev, "requested %d init pages and %d boot pages for func_id 0x%x\n", - init_pages, boot_pages, func_id); - return give_pages(dev, func_id, boot ? 
boot_pages : init_pages, 0); + return give_pages(dev, func_id, npages, 0); } static int optimal_reclaimed_pages(void) diff --git a/include/linux/mlx5/device.h b/include/linux/mlx5/device.h index 737685e9e852..68029b30c3dc 100644 --- a/include/linux/mlx5/device.h +++ b/include/linux/mlx5/device.h @@ -309,21 +309,20 @@ struct mlx5_hca_cap { __be16 max_desc_sz_rq; u8 rsvd21[2]; __be16 max_desc_sz_sq_dc; - u8 rsvd22[4]; - __be16 max_qp_mcg; - u8 rsvd23; + __be32 max_qp_mcg; + u8 rsvd22[3]; u8 log_max_mcg; - u8 rsvd24; + u8 rsvd23; u8 log_max_pd; - u8 rsvd25; + u8 rsvd24; u8 log_max_xrcd; - u8 rsvd26[42]; + u8 rsvd25[42]; __be16 log_uar_page_sz; - u8 rsvd27[28]; + u8 rsvd26[28]; u8 log_msx_atomic_size_qp; - u8 rsvd28[2]; + u8 rsvd27[2]; u8 log_msx_atomic_size_dc; - u8 rsvd29[76]; + u8 rsvd28[76]; }; @@ -472,9 +471,8 @@ struct mlx5_eqe_cmd { struct mlx5_eqe_page_req { u8 rsvd0[2]; __be16 func_id; - u8 rsvd1[2]; - __be16 num_pages; - __be32 rsvd2[5]; + __be32 num_pages; + __be32 rsvd1[5]; }; union ev_data { diff --git a/include/linux/mlx5/driver.h b/include/linux/mlx5/driver.h index 611e65e76b00..8888381fc150 100644 --- a/include/linux/mlx5/driver.h +++ b/include/linux/mlx5/driver.h @@ -358,7 +358,7 @@ struct mlx5_caps { u32 reserved_lkey; u8 local_ca_ack_delay; u8 log_max_mcg; - u16 max_qp_mcg; + u32 max_qp_mcg; int min_page_sz; }; @@ -691,7 +691,7 @@ void mlx5_pagealloc_cleanup(struct mlx5_core_dev *dev); int mlx5_pagealloc_start(struct mlx5_core_dev *dev); void mlx5_pagealloc_stop(struct mlx5_core_dev *dev); void mlx5_core_req_pages_handler(struct mlx5_core_dev *dev, u16 func_id, - s16 npages); + s32 npages); int mlx5_satisfy_startup_pages(struct mlx5_core_dev *dev, int boot); int mlx5_reclaim_startup_pages(struct mlx5_core_dev *dev); void mlx5_register_debugfs(void); -- cgit v1.2.3-71-gd317 From f46078cfcd77fa5165bf849f5e568a7ac5fa569c Mon Sep 17 00:00:00 2001 From: Hannes Frederic Sowa Date: Fri, 16 Aug 2013 13:30:07 +0200 Subject: ipv6: drop packets with multiple fragmentation headers It is not allowed for an ipv6 packet to contain multiple fragmentation headers. So discard packets which were already reassembled by fragmentation logic and send back a parameter problem icmp. The updates for RFC 6980 will come in later, I have to do a bit more research here. Cc: YOSHIFUJI Hideaki Signed-off-by: Hannes Frederic Sowa Signed-off-by: David S. Miller --- include/linux/ipv6.h | 1 + net/ipv6/reassembly.c | 5 +++++ 2 files changed, 6 insertions(+) (limited to 'include/linux') diff --git a/include/linux/ipv6.h b/include/linux/ipv6.h index 850e95bc766c..b8b7dc755752 100644 --- a/include/linux/ipv6.h +++ b/include/linux/ipv6.h @@ -101,6 +101,7 @@ struct inet6_skb_parm { #define IP6SKB_FORWARDED 2 #define IP6SKB_REROUTED 4 #define IP6SKB_ROUTERALERT 8 +#define IP6SKB_FRAGMENTED 16 }; #define IP6CB(skb) ((struct inet6_skb_parm*)((skb)->cb)) diff --git a/net/ipv6/reassembly.c b/net/ipv6/reassembly.c index 790d9f4b8b0b..1aeb473b2cc6 100644 --- a/net/ipv6/reassembly.c +++ b/net/ipv6/reassembly.c @@ -490,6 +490,7 @@ static int ip6_frag_reasm(struct frag_queue *fq, struct sk_buff *prev, ipv6_hdr(head)->payload_len = htons(payload_len); ipv6_change_dsfield(ipv6_hdr(head), 0xff, ecn); IP6CB(head)->nhoff = nhoff; + IP6CB(head)->flags |= IP6SKB_FRAGMENTED; /* Yes, and fold redundant checksum back. 
8) */ if (head->ip_summed == CHECKSUM_COMPLETE) @@ -524,6 +525,9 @@ static int ipv6_frag_rcv(struct sk_buff *skb) struct net *net = dev_net(skb_dst(skb)->dev); int evicted; + if (IP6CB(skb)->flags & IP6SKB_FRAGMENTED) + goto fail_hdr; + IP6_INC_STATS_BH(net, ip6_dst_idev(skb_dst(skb)), IPSTATS_MIB_REASMREQDS); /* Jumbo payload inhibits frag. header */ @@ -544,6 +548,7 @@ static int ipv6_frag_rcv(struct sk_buff *skb) ip6_dst_idev(skb_dst(skb)), IPSTATS_MIB_REASMOKS); IP6CB(skb)->nhoff = (u8 *)fhdr - skb_network_header(skb); + IP6CB(skb)->flags |= IP6SKB_FRAGMENTED; return 1; } -- cgit v1.2.3-71-gd317 From d79ff142624e1be080ad8d09101f7004d79c36e1 Mon Sep 17 00:00:00 2001 From: Martin Peschke Date: Thu, 22 Aug 2013 17:45:36 +0200 Subject: [SCSI] zfcp: fix lock imbalance by reworking request queue locking This patch adds wait_event_interruptible_lock_irq_timeout(), which is a straight-forward descendant of wait_event_interruptible_timeout() and wait_event_interruptible_lock_irq(). The zfcp driver used to call wait_event_interruptible_timeout() in combination with some intricate and error-prone locking. Using wait_event_interruptible_lock_irq_timeout() as a replacement nicely cleans up that locking. This rework removes a situation that resulted in a locking imbalance in zfcp_qdio_sbal_get(): BUG: workqueue leaked lock or atomic: events/1/0xffffff00/10 last function: zfcp_fc_wka_port_offline+0x0/0xa0 [zfcp] It was introduced by commit c2af7545aaff3495d9bf9a7608c52f0af86fb194 "[SCSI] zfcp: Do not wait for SBALs on stopped queue", which had a new code path related to ZFCP_STATUS_ADAPTER_QDIOUP that took an early exit without a required lock being held. The problem occured when a special, non-SCSI I/O request was being submitted in process context, when the adapter's queues had been torn down. In this case the bug surfaced when the Fibre Channel port connection for a well-known address was closed during a concurrent adapter shut-down procedure, which is a rare constellation. This patch also fixes these warnings from the sparse tool (make C=1): drivers/s390/scsi/zfcp_qdio.c:224:12: warning: context imbalance in 'zfcp_qdio_sbal_check' - wrong count at exit drivers/s390/scsi/zfcp_qdio.c:244:5: warning: context imbalance in 'zfcp_qdio_sbal_get' - unexpected unlock Last but not least, we get rid of that crappy lock-unlock-lock sequence at the beginning of the critical section. It is okay to call zfcp_erp_adapter_reopen() with req_q_lock held. 
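As a usage sketch (the device structure and names below are hypothetical, not taken from zfcp), the new helper is entered with the spinlock held; it drops the lock around schedule_timeout() and reacquires it before re-evaluating the condition, returning 0 on timeout, -ERESTARTSYS on a signal, and the remaining jiffies otherwise:

	#include <linux/errno.h>
	#include <linux/jiffies.h>
	#include <linux/sched.h>
	#include <linux/spinlock.h>
	#include <linux/wait.h>

	struct foo_dev {
		spinlock_t lock;
		wait_queue_head_t wq;
		unsigned int free_slots;
	};

	static int foo_claim_slot(struct foo_dev *dev)
	{
		long ret;

		spin_lock_irq(&dev->lock);
		/* The condition is (re)checked with dev->lock held. */
		ret = wait_event_interruptible_lock_irq_timeout(dev->wq,
								dev->free_slots > 0,
								dev->lock, 5 * HZ);
		if (ret > 0)
			dev->free_slots--;	/* still under dev->lock */
		spin_unlock_irq(&dev->lock);

		if (ret == 0)
			return -ETIMEDOUT;	/* timeout elapsed */
		if (ret < 0)
			return ret;		/* -ERESTARTSYS: interrupted by a signal */
		return 0;
	}
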
Reported-by: Mikulas Patocka Reported-by: Heiko Carstens Signed-off-by: Martin Peschke Cc: stable@vger.kernel.org #2.6.35+ Signed-off-by: Steffen Maier Signed-off-by: James Bottomley --- drivers/s390/scsi/zfcp_qdio.c | 8 ++---- include/linux/wait.h | 57 +++++++++++++++++++++++++++++++++++++++++++ 2 files changed, 59 insertions(+), 6 deletions(-) (limited to 'include/linux') diff --git a/drivers/s390/scsi/zfcp_qdio.c b/drivers/s390/scsi/zfcp_qdio.c index 665e3cfaaf85..de0598eaacd2 100644 --- a/drivers/s390/scsi/zfcp_qdio.c +++ b/drivers/s390/scsi/zfcp_qdio.c @@ -224,11 +224,9 @@ int zfcp_qdio_sbals_from_sg(struct zfcp_qdio *qdio, struct zfcp_qdio_req *q_req, static int zfcp_qdio_sbal_check(struct zfcp_qdio *qdio) { - spin_lock_irq(&qdio->req_q_lock); if (atomic_read(&qdio->req_q_free) || !(atomic_read(&qdio->adapter->status) & ZFCP_STATUS_ADAPTER_QDIOUP)) return 1; - spin_unlock_irq(&qdio->req_q_lock); return 0; } @@ -246,9 +244,8 @@ int zfcp_qdio_sbal_get(struct zfcp_qdio *qdio) { long ret; - spin_unlock_irq(&qdio->req_q_lock); - ret = wait_event_interruptible_timeout(qdio->req_q_wq, - zfcp_qdio_sbal_check(qdio), 5 * HZ); + ret = wait_event_interruptible_lock_irq_timeout(qdio->req_q_wq, + zfcp_qdio_sbal_check(qdio), qdio->req_q_lock, 5 * HZ); if (!(atomic_read(&qdio->adapter->status) & ZFCP_STATUS_ADAPTER_QDIOUP)) return -EIO; @@ -262,7 +259,6 @@ int zfcp_qdio_sbal_get(struct zfcp_qdio *qdio) zfcp_erp_adapter_reopen(qdio->adapter, 0, "qdsbg_1"); } - spin_lock_irq(&qdio->req_q_lock); return -EIO; } diff --git a/include/linux/wait.h b/include/linux/wait.h index f487a4750b7f..a67fc1635592 100644 --- a/include/linux/wait.h +++ b/include/linux/wait.h @@ -811,6 +811,63 @@ do { \ __ret; \ }) +#define __wait_event_interruptible_lock_irq_timeout(wq, condition, \ + lock, ret) \ +do { \ + DEFINE_WAIT(__wait); \ + \ + for (;;) { \ + prepare_to_wait(&wq, &__wait, TASK_INTERRUPTIBLE); \ + if (condition) \ + break; \ + if (signal_pending(current)) { \ + ret = -ERESTARTSYS; \ + break; \ + } \ + spin_unlock_irq(&lock); \ + ret = schedule_timeout(ret); \ + spin_lock_irq(&lock); \ + if (!ret) \ + break; \ + } \ + finish_wait(&wq, &__wait); \ +} while (0) + +/** + * wait_event_interruptible_lock_irq_timeout - sleep until a condition gets true or a timeout elapses. + * The condition is checked under the lock. This is expected + * to be called with the lock taken. + * @wq: the waitqueue to wait on + * @condition: a C expression for the event to wait for + * @lock: a locked spinlock_t, which will be released before schedule() + * and reacquired afterwards. + * @timeout: timeout, in jiffies + * + * The process is put to sleep (TASK_INTERRUPTIBLE) until the + * @condition evaluates to true or signal is received. The @condition is + * checked each time the waitqueue @wq is woken up. + * + * wake_up() has to be called after changing any variable that could + * change the result of the wait condition. + * + * This is supposed to be called while holding the lock. The lock is + * dropped before going to sleep and is reacquired afterwards. + * + * The function returns 0 if the @timeout elapsed, -ERESTARTSYS if it + * was interrupted by a signal, and the remaining jiffies otherwise + * if the condition evaluated to true before the timeout elapsed. 
+ */ +#define wait_event_interruptible_lock_irq_timeout(wq, condition, lock, \ + timeout) \ +({ \ + int __ret = timeout; \ + \ + if (!(condition)) \ + __wait_event_interruptible_lock_irq_timeout( \ + wq, condition, lock, __ret); \ + __ret; \ +}) + /* * These are the old interfaces to sleep waiting for an event. -- cgit v1.2.3-71-gd317 From 5ea80f76a56605a190a7ea16846c82aa63dbd0aa Mon Sep 17 00:00:00 2001 From: Linus Torvalds Date: Thu, 22 Aug 2013 09:13:06 -0700 Subject: Revert "x86 get_unmapped_area(): use proper mmap base for bottom-up direction" This reverts commit df54d6fa54275ce59660453e29d1228c2b45a826. The commit isn't necessarily wrong, but because it recalculates the random mmap_base every time, it seems to confuse user memory allocators that expect contiguous mmap allocations even when the mmap address isn't specified. In particular, the MATLAB Java runtime seems to be unhappy. See https://bugzilla.kernel.org/show_bug.cgi?id=60774 So we'll want to apply the random offset only once, and Radu has a patch for that. Revert this older commit in order to apply the other one. Reported-by: Jeff Shorey Cc: Radu Caragea Cc: Andrew Morton Signed-off-by: Linus Torvalds --- arch/x86/kernel/sys_x86_64.c | 2 +- arch/x86/mm/mmap.c | 2 +- include/linux/sched.h | 1 - 3 files changed, 2 insertions(+), 3 deletions(-) (limited to 'include/linux') diff --git a/arch/x86/kernel/sys_x86_64.c b/arch/x86/kernel/sys_x86_64.c index 48f8375e4c6b..dbded5aedb81 100644 --- a/arch/x86/kernel/sys_x86_64.c +++ b/arch/x86/kernel/sys_x86_64.c @@ -101,7 +101,7 @@ static void find_start_end(unsigned long flags, unsigned long *begin, *begin = new_begin; } } else { - *begin = mmap_legacy_base(); + *begin = TASK_UNMAPPED_BASE; *end = TASK_SIZE; } } diff --git a/arch/x86/mm/mmap.c b/arch/x86/mm/mmap.c index f63778cb2363..62c29a5bfe26 100644 --- a/arch/x86/mm/mmap.c +++ b/arch/x86/mm/mmap.c @@ -98,7 +98,7 @@ static unsigned long mmap_base(void) * Bottom-up (legacy) layout on X86_32 did not support randomization, X86_64 * does, but not when emulating X86_32 */ -unsigned long mmap_legacy_base(void) +static unsigned long mmap_legacy_base(void) { if (mmap_is_ia32()) return TASK_UNMAPPED_BASE; diff --git a/include/linux/sched.h b/include/linux/sched.h index e9995eb5985c..078066daffd4 100644 --- a/include/linux/sched.h +++ b/include/linux/sched.h @@ -314,7 +314,6 @@ struct nsproxy; struct user_namespace; #ifdef CONFIG_MMU -extern unsigned long mmap_legacy_base(void); extern void arch_pick_mmap_layout(struct mm_struct *mm); extern unsigned long arch_get_unmapped_area(struct file *, unsigned long, unsigned long, -- cgit v1.2.3-71-gd317 From 41aacc1eea645c99edbe8fbcf78a97dc9b862adc Mon Sep 17 00:00:00 2001 From: Radu Caragea Date: Wed, 21 Aug 2013 20:55:59 +0300 Subject: x86 get_unmapped_area: Access mmap_legacy_base through mm_struct member This is the updated version of df54d6fa5427 ("x86 get_unmapped_area(): use proper mmap base for bottom-up direction") that only randomizes the mmap base address once. 
Signed-off-by: Radu Caragea Reported-and-tested-by: Jeff Shorey Cc: Andrew Morton Cc: Michel Lespinasse Cc: Oleg Nesterov Cc: Rik van Riel Cc: Ingo Molnar Cc: Adrian Sendroiu Cc: Greg KH Cc: Kamal Mostafa Signed-off-by: Linus Torvalds --- arch/x86/kernel/sys_x86_64.c | 2 +- arch/x86/mm/mmap.c | 6 ++++-- include/linux/mm_types.h | 1 + 3 files changed, 6 insertions(+), 3 deletions(-) (limited to 'include/linux') diff --git a/arch/x86/kernel/sys_x86_64.c b/arch/x86/kernel/sys_x86_64.c index dbded5aedb81..30277e27431a 100644 --- a/arch/x86/kernel/sys_x86_64.c +++ b/arch/x86/kernel/sys_x86_64.c @@ -101,7 +101,7 @@ static void find_start_end(unsigned long flags, unsigned long *begin, *begin = new_begin; } } else { - *begin = TASK_UNMAPPED_BASE; + *begin = current->mm->mmap_legacy_base; *end = TASK_SIZE; } } diff --git a/arch/x86/mm/mmap.c b/arch/x86/mm/mmap.c index 62c29a5bfe26..25e7e1372bb2 100644 --- a/arch/x86/mm/mmap.c +++ b/arch/x86/mm/mmap.c @@ -112,11 +112,13 @@ static unsigned long mmap_legacy_base(void) */ void arch_pick_mmap_layout(struct mm_struct *mm) { + mm->mmap_legacy_base = mmap_legacy_base(); + mm->mmap_base = mmap_base(); + if (mmap_is_legacy()) { - mm->mmap_base = mmap_legacy_base(); + mm->mmap_base = mm->mmap_legacy_base; mm->get_unmapped_area = arch_get_unmapped_area; } else { - mm->mmap_base = mmap_base(); mm->get_unmapped_area = arch_get_unmapped_area_topdown; } } diff --git a/include/linux/mm_types.h b/include/linux/mm_types.h index fb425aa16c01..faf4b7c1ad12 100644 --- a/include/linux/mm_types.h +++ b/include/linux/mm_types.h @@ -332,6 +332,7 @@ struct mm_struct { unsigned long pgoff, unsigned long flags); #endif unsigned long mmap_base; /* base of mmap area */ + unsigned long mmap_legacy_base; /* base of mmap area in bottom-up allocations */ unsigned long task_size; /* size of task vm space */ unsigned long highest_vm_end; /* highest vma end address */ pgd_t * pgd; -- cgit v1.2.3-71-gd317 From 4a5a8aa6c966eafc106543bd955ae388230420e5 Mon Sep 17 00:00:00 2001 From: stephen hemminger Date: Wed, 21 Aug 2013 21:09:47 -0700 Subject: ipv4: expose IPV4_DEVCONF IP sends device configuration (see inet_fill_link_af) as an array in the netlink information, but the indices in that array are not exposed to userspace through any current santized header file. It was available back in 2.6.32 (in /usr/include/linux/sysctl.h) but was broken by: commit 02291680ffba92e5b5865bc0c5e7d1f3056b80ec Author: Eric W. Biederman Date: Sun Feb 14 03:25:51 2010 +0000 net ipv4: Decouple ipv4 interface parameters from binary sysctl numbers Eric was solving the sysctl problem but then the indices were re-exposed by a later addition of devconf support for IPV4 commit 9f0f7272ac9506f4c8c05cc597b7e376b0b9f3e4 Author: Thomas Graf Date: Tue Nov 16 04:32:48 2010 +0000 ipv4: AF_INET link address family Putting them in /usr/include/linux/ip.h seemed the logical match for the DEVCONF_ definitions for IPV6 in /usr/include/linux/ip6.h Signed-off-by: Stephen Hemminger Signed-off-by: David S. 
Miller --- include/linux/inetdevice.h | 34 +--------------------------------- include/uapi/linux/ip.h | 34 ++++++++++++++++++++++++++++++++++ 2 files changed, 35 insertions(+), 33 deletions(-) (limited to 'include/linux') diff --git a/include/linux/inetdevice.h b/include/linux/inetdevice.h index b99cd23f3474..79640e015a86 100644 --- a/include/linux/inetdevice.h +++ b/include/linux/inetdevice.h @@ -5,45 +5,13 @@ #include #include +#include #include #include #include #include #include -enum -{ - IPV4_DEVCONF_FORWARDING=1, - IPV4_DEVCONF_MC_FORWARDING, - IPV4_DEVCONF_PROXY_ARP, - IPV4_DEVCONF_ACCEPT_REDIRECTS, - IPV4_DEVCONF_SECURE_REDIRECTS, - IPV4_DEVCONF_SEND_REDIRECTS, - IPV4_DEVCONF_SHARED_MEDIA, - IPV4_DEVCONF_RP_FILTER, - IPV4_DEVCONF_ACCEPT_SOURCE_ROUTE, - IPV4_DEVCONF_BOOTP_RELAY, - IPV4_DEVCONF_LOG_MARTIANS, - IPV4_DEVCONF_TAG, - IPV4_DEVCONF_ARPFILTER, - IPV4_DEVCONF_MEDIUM_ID, - IPV4_DEVCONF_NOXFRM, - IPV4_DEVCONF_NOPOLICY, - IPV4_DEVCONF_FORCE_IGMP_VERSION, - IPV4_DEVCONF_ARP_ANNOUNCE, - IPV4_DEVCONF_ARP_IGNORE, - IPV4_DEVCONF_PROMOTE_SECONDARIES, - IPV4_DEVCONF_ARP_ACCEPT, - IPV4_DEVCONF_ARP_NOTIFY, - IPV4_DEVCONF_ACCEPT_LOCAL, - IPV4_DEVCONF_SRC_VMARK, - IPV4_DEVCONF_PROXY_ARP_PVLAN, - IPV4_DEVCONF_ROUTE_LOCALNET, - __IPV4_DEVCONF_MAX -}; - -#define IPV4_DEVCONF_MAX (__IPV4_DEVCONF_MAX - 1) - struct ipv4_devconf { void *sysctl; int data[IPV4_DEVCONF_MAX]; diff --git a/include/uapi/linux/ip.h b/include/uapi/linux/ip.h index 6cf06bfd841b..2fee45bdec0a 100644 --- a/include/uapi/linux/ip.h +++ b/include/uapi/linux/ip.h @@ -133,4 +133,38 @@ struct ip_beet_phdr { __u8 reserved; }; +/* index values for the variables in ipv4_devconf */ +enum +{ + IPV4_DEVCONF_FORWARDING=1, + IPV4_DEVCONF_MC_FORWARDING, + IPV4_DEVCONF_PROXY_ARP, + IPV4_DEVCONF_ACCEPT_REDIRECTS, + IPV4_DEVCONF_SECURE_REDIRECTS, + IPV4_DEVCONF_SEND_REDIRECTS, + IPV4_DEVCONF_SHARED_MEDIA, + IPV4_DEVCONF_RP_FILTER, + IPV4_DEVCONF_ACCEPT_SOURCE_ROUTE, + IPV4_DEVCONF_BOOTP_RELAY, + IPV4_DEVCONF_LOG_MARTIANS, + IPV4_DEVCONF_TAG, + IPV4_DEVCONF_ARPFILTER, + IPV4_DEVCONF_MEDIUM_ID, + IPV4_DEVCONF_NOXFRM, + IPV4_DEVCONF_NOPOLICY, + IPV4_DEVCONF_FORCE_IGMP_VERSION, + IPV4_DEVCONF_ARP_ANNOUNCE, + IPV4_DEVCONF_ARP_IGNORE, + IPV4_DEVCONF_PROMOTE_SECONDARIES, + IPV4_DEVCONF_ARP_ACCEPT, + IPV4_DEVCONF_ARP_NOTIFY, + IPV4_DEVCONF_ACCEPT_LOCAL, + IPV4_DEVCONF_SRC_VMARK, + IPV4_DEVCONF_PROXY_ARP_PVLAN, + IPV4_DEVCONF_ROUTE_LOCALNET, + __IPV4_DEVCONF_MAX +}; + +#define IPV4_DEVCONF_MAX (__IPV4_DEVCONF_MAX - 1) + #endif /* _UAPI_LINUX_IP_H */ -- cgit v1.2.3-71-gd317 From 118b23022512eb2f41ce42db70dc0568d00be4ba Mon Sep 17 00:00:00 2001 From: Al Viro Date: Sat, 24 Aug 2013 12:08:17 -0400 Subject: cope with potentially long ->d_dname() output for shmem/hugetlb dynamic_dname() is both too much and too little for those - the output may be well in excess of 64 bytes dynamic_dname() assumes to be enough (thanks to ashmem feeding really long names to shmem_file_setup()) and vsnprintf() is an overkill for those guys. 
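For context, a sketch of the ->d_dname hook involved (the filesystem name is hypothetical); this is the per-filesystem wrapper pattern shmem and hugetlbfs used until now, which the diff below replaces with the new simple_dname() helper:

	#include <linux/dcache.h>

	/* Old-style pattern: format "/<name> (deleted)" via dynamic_dname(). */
	static char *examplefs_dname(struct dentry *dentry, char *buffer, int buflen)
	{
		return dynamic_dname(dentry, buffer, buflen, "/%s (deleted)",
				     dentry->d_name.name);
	}

	static struct dentry_operations examplefs_dentry_ops = {
		.d_dname = examplefs_dname,	/* after this commit: simple_dname */
	};
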
Signed-off-by: Al Viro --- fs/dcache.c | 11 +++++++++++ fs/hugetlbfs/inode.c | 8 +------- include/linux/dcache.h | 1 + mm/shmem.c | 8 +------- 4 files changed, 14 insertions(+), 14 deletions(-) (limited to 'include/linux') diff --git a/fs/dcache.c b/fs/dcache.c index 87bdb5329c3c..83cfb834db03 100644 --- a/fs/dcache.c +++ b/fs/dcache.c @@ -2724,6 +2724,17 @@ char *dynamic_dname(struct dentry *dentry, char *buffer, int buflen, return memcpy(buffer, temp, sz); } +char *simple_dname(struct dentry *dentry, char *buffer, int buflen) +{ + char *end = buffer + buflen; + /* these dentries are never renamed, so d_lock is not needed */ + if (prepend(&end, &buflen, " (deleted)", 11) || + prepend_name(&end, &buflen, &dentry->d_name) || + prepend(&end, &buflen, "/", 1)) + end = ERR_PTR(-ENAMETOOLONG); + return end; +} + /* * Write full pathname from the root of the filesystem into the buffer. */ diff --git a/fs/hugetlbfs/inode.c b/fs/hugetlbfs/inode.c index 34423978b170..d19b30ababf1 100644 --- a/fs/hugetlbfs/inode.c +++ b/fs/hugetlbfs/inode.c @@ -926,14 +926,8 @@ static int get_hstate_idx(int page_size_log) return h - hstates; } -static char *hugetlb_dname(struct dentry *dentry, char *buffer, int buflen) -{ - return dynamic_dname(dentry, buffer, buflen, "/%s (deleted)", - dentry->d_name.name); -} - static struct dentry_operations anon_ops = { - .d_dname = hugetlb_dname + .d_dname = simple_dname }; /* diff --git a/include/linux/dcache.h b/include/linux/dcache.h index b90337c9d468..4a12532da8c4 100644 --- a/include/linux/dcache.h +++ b/include/linux/dcache.h @@ -336,6 +336,7 @@ extern int d_validate(struct dentry *, struct dentry *); * helper function for dentry_operations.d_dname() members */ extern char *dynamic_dname(struct dentry *, char *, int, const char *, ...); +extern char *simple_dname(struct dentry *, char *, int); extern char *__d_path(const struct path *, const struct path *, char *, int); extern char *d_absolute_path(const struct path *, char *, int); diff --git a/mm/shmem.c b/mm/shmem.c index 8335dbd3fc35..e43dc555069d 100644 --- a/mm/shmem.c +++ b/mm/shmem.c @@ -2909,14 +2909,8 @@ EXPORT_SYMBOL_GPL(shmem_truncate_range); /* common code */ -static char *shmem_dname(struct dentry *dentry, char *buffer, int buflen) -{ - return dynamic_dname(dentry, buffer, buflen, "/%s (deleted)", - dentry->d_name.name); -} - static struct dentry_operations anon_ops = { - .d_dname = shmem_dname + .d_dname = simple_dname }; /** -- cgit v1.2.3-71-gd317 From 0f8f2aaaab0b0f9c13635cb02e7d19bdaa9aa1bb Mon Sep 17 00:00:00 2001 From: Waiman Long Date: Wed, 28 Aug 2013 18:13:26 -0700 Subject: Add new lockref infrastructure reference implementation This introduces a new "lockref" structure that supports the concept of lockless updates of reference counts that still honor an attached spinlock. NOTE! This reference implementation is not the optimized lockless version, rather it is the fallback implementation using standard spinlocks. The actual optimized versions will be merged into 3.12, but I wanted to get the infrastructure in place and document the new interfaces. [ Also note that this particular commit is drastically cut-down minimal version of the original patch by Waiman. In order to properly credit the original author I'm marking Waiman as the author here, but in the end this patch bears little resemblance to the patch by Waiman. So blame any errors on me editing things down to the point where I can introduce the infrastructure before the merge window for 3.12 actually opens. 
- Linus ] Signed-off-by: Waiman Long Signed-off-by: Linus Torvalds --- include/linux/lockref.h | 71 +++++++++++++++++++++++++++++++++++++++++++++++++ 1 file changed, 71 insertions(+) create mode 100644 include/linux/lockref.h (limited to 'include/linux') diff --git a/include/linux/lockref.h b/include/linux/lockref.h new file mode 100644 index 000000000000..01233e01627a --- /dev/null +++ b/include/linux/lockref.h @@ -0,0 +1,71 @@ +#ifndef __LINUX_LOCKREF_H +#define __LINUX_LOCKREF_H + +/* + * Locked reference counts. + * + * These are different from just plain atomic refcounts in that they + * are atomic with respect to the spinlock that goes with them. In + * particular, there can be implementations that don't actually get + * the spinlock for the common decrement/increment operations, but they + * still have to check that the operation is done semantically as if + * the spinlock had been taken (using a cmpxchg operation that covers + * both the lock and the count word, or using memory transactions, for + * example). + */ + +#include + +struct lockref { + spinlock_t lock; + unsigned int count; +}; + +/** + * lockref_get - Increments reference count unconditionally + * @lockcnt: pointer to lockref structure + * + * This operation is only valid if you already hold a reference + * to the object, so you know the count cannot be zero. + */ +static inline void lockref_get(struct lockref *lockref) +{ + spin_lock(&lockref->lock); + lockref->count++; + spin_unlock(&lockref->lock); +} + +/** + * lockref_get_not_zero - Increments count unless the count is 0 + * @lockcnt: pointer to lockref structure + * Return: 1 if count updated successfully or 0 if count is 0 + */ +static inline int lockref_get_not_zero(struct lockref *lockref) +{ + int retval = 0; + + spin_lock(&lockref->lock); + if (lockref->count) { + lockref->count++; + retval = 1; + } + spin_unlock(&lockref->lock); + return retval; +} + +/** + * lockref_put_or_lock - decrements count unless count <= 1 before decrement + * @lockcnt: pointer to lockref structure + * Return: 1 if count updated successfully or 0 if count <= 1 and lock taken + */ +static inline int lockref_put_or_lock(struct lockref *lockref) +{ + spin_lock(&lockref->lock); + if (lockref->count <= 1) + return 0; + lockref->count--; + spin_unlock(&lockref->lock); + return 1; +} + +#endif /* __LINUX_LOCKREF_H */ -- cgit v1.2.3-71-gd317 From 98474236f72e5a8b89c14cd7c74f0bb77a4b1a99 Mon Sep 17 00:00:00 2001 From: Waiman Long Date: Wed, 28 Aug 2013 18:24:59 -0700 Subject: vfs: make the dentry cache use the lockref infrastructure This just replaces the dentry count/lock combination with the lockref structure that contains both a count and a spinlock, and does the mechanical conversion to use the lockref infrastructure. There are no semantic changes here, it's purely syntactic. The reference lockref implementation uses the spinlock exactly the same way that the old dcache code did, and the bulk of this patch is just expanding the internal "d_count" use in the dcache code to use "d_lockref.count" instead. This is purely preparation for the real change to make the reference count updates be lockless during the 3.12 merge window. [ As with the previous commit, this is a rewritten version of a concept originally from Waiman, so credit goes to him, blame for any errors goes to me. Waiman's patch had some semantic differences for taking advantage of the lockless update in dget_parent(), while this patch is intentionally a pure search-and-replace change with no semantic changes. 
- Linus ] Signed-off-by: Waiman Long Signed-off-by: Linus Torvalds --- fs/dcache.c | 57 ++++++++++++++++++++------------------------------ fs/namei.c | 6 +++--- include/linux/dcache.h | 19 ++++++++--------- 3 files changed, 35 insertions(+), 47 deletions(-) (limited to 'include/linux') diff --git a/fs/dcache.c b/fs/dcache.c index 83cfb834db03..b949af850cd6 100644 --- a/fs/dcache.c +++ b/fs/dcache.c @@ -229,7 +229,7 @@ static void __d_free(struct rcu_head *head) */ static void d_free(struct dentry *dentry) { - BUG_ON(dentry->d_count); + BUG_ON(dentry->d_lockref.count); this_cpu_dec(nr_dentry); if (dentry->d_op && dentry->d_op->d_release) dentry->d_op->d_release(dentry); @@ -467,7 +467,7 @@ relock: } if (ref) - dentry->d_count--; + dentry->d_lockref.count--; /* * inform the fs via d_prune that this dentry is about to be * unhashed and destroyed. @@ -513,15 +513,10 @@ void dput(struct dentry *dentry) return; repeat: - if (dentry->d_count == 1) + if (dentry->d_lockref.count == 1) might_sleep(); - spin_lock(&dentry->d_lock); - BUG_ON(!dentry->d_count); - if (dentry->d_count > 1) { - dentry->d_count--; - spin_unlock(&dentry->d_lock); + if (lockref_put_or_lock(&dentry->d_lockref)) return; - } if (dentry->d_flags & DCACHE_OP_DELETE) { if (dentry->d_op->d_delete(dentry)) @@ -535,7 +530,7 @@ repeat: dentry->d_flags |= DCACHE_REFERENCED; dentry_lru_add(dentry); - dentry->d_count--; + dentry->d_lockref.count--; spin_unlock(&dentry->d_lock); return; @@ -590,7 +585,7 @@ int d_invalidate(struct dentry * dentry) * We also need to leave mountpoints alone, * directory or not. */ - if (dentry->d_count > 1 && dentry->d_inode) { + if (dentry->d_lockref.count > 1 && dentry->d_inode) { if (S_ISDIR(dentry->d_inode->i_mode) || d_mountpoint(dentry)) { spin_unlock(&dentry->d_lock); return -EBUSY; @@ -606,14 +601,12 @@ EXPORT_SYMBOL(d_invalidate); /* This must be called with d_lock held */ static inline void __dget_dlock(struct dentry *dentry) { - dentry->d_count++; + dentry->d_lockref.count++; } static inline void __dget(struct dentry *dentry) { - spin_lock(&dentry->d_lock); - __dget_dlock(dentry); - spin_unlock(&dentry->d_lock); + lockref_get(&dentry->d_lockref); } struct dentry *dget_parent(struct dentry *dentry) @@ -634,8 +627,8 @@ repeat: goto repeat; } rcu_read_unlock(); - BUG_ON(!ret->d_count); - ret->d_count++; + BUG_ON(!ret->d_lockref.count); + ret->d_lockref.count++; spin_unlock(&ret->d_lock); return ret; } @@ -718,7 +711,7 @@ restart: spin_lock(&inode->i_lock); hlist_for_each_entry(dentry, &inode->i_dentry, d_alias) { spin_lock(&dentry->d_lock); - if (!dentry->d_count) { + if (!dentry->d_lockref.count) { __dget_dlock(dentry); __d_drop(dentry); spin_unlock(&dentry->d_lock); @@ -763,12 +756,8 @@ static void try_prune_one_dentry(struct dentry *dentry) /* Prune ancestors. */ dentry = parent; while (dentry) { - spin_lock(&dentry->d_lock); - if (dentry->d_count > 1) { - dentry->d_count--; - spin_unlock(&dentry->d_lock); + if (lockref_put_or_lock(&dentry->d_lockref)) return; - } dentry = dentry_kill(dentry, 1); } } @@ -793,7 +782,7 @@ static void shrink_dentry_list(struct list_head *list) * the LRU because of laziness during lookup. Do not free * it - just keep it off the LRU list. 
*/ - if (dentry->d_count) { + if (dentry->d_lockref.count) { dentry_lru_del(dentry); spin_unlock(&dentry->d_lock); continue; @@ -913,7 +902,7 @@ static void shrink_dcache_for_umount_subtree(struct dentry *dentry) dentry_lru_del(dentry); __d_shrink(dentry); - if (dentry->d_count != 0) { + if (dentry->d_lockref.count != 0) { printk(KERN_ERR "BUG: Dentry %p{i=%lx,n=%s}" " still in use (%d)" @@ -922,7 +911,7 @@ static void shrink_dcache_for_umount_subtree(struct dentry *dentry) dentry->d_inode ? dentry->d_inode->i_ino : 0UL, dentry->d_name.name, - dentry->d_count, + dentry->d_lockref.count, dentry->d_sb->s_type->name, dentry->d_sb->s_id); BUG(); @@ -933,7 +922,7 @@ static void shrink_dcache_for_umount_subtree(struct dentry *dentry) list_del(&dentry->d_u.d_child); } else { parent = dentry->d_parent; - parent->d_count--; + parent->d_lockref.count--; list_del(&dentry->d_u.d_child); } @@ -981,7 +970,7 @@ void shrink_dcache_for_umount(struct super_block *sb) dentry = sb->s_root; sb->s_root = NULL; - dentry->d_count--; + dentry->d_lockref.count--; shrink_dcache_for_umount_subtree(dentry); while (!hlist_bl_empty(&sb->s_anon)) { @@ -1147,7 +1136,7 @@ resume: * loop in shrink_dcache_parent() might not make any progress * and loop forever. */ - if (dentry->d_count) { + if (dentry->d_lockref.count) { dentry_lru_del(dentry); } else if (!(dentry->d_flags & DCACHE_SHRINK_LIST)) { dentry_lru_move_list(dentry, dispose); @@ -1269,7 +1258,7 @@ struct dentry *__d_alloc(struct super_block *sb, const struct qstr *name) smp_wmb(); dentry->d_name.name = dname; - dentry->d_count = 1; + dentry->d_lockref.count = 1; dentry->d_flags = 0; spin_lock_init(&dentry->d_lock); seqcount_init(&dentry->d_seq); @@ -1970,7 +1959,7 @@ struct dentry *__d_lookup(const struct dentry *parent, const struct qstr *name) goto next; } - dentry->d_count++; + dentry->d_lockref.count++; found = dentry; spin_unlock(&dentry->d_lock); break; @@ -2069,7 +2058,7 @@ again: spin_lock(&dentry->d_lock); inode = dentry->d_inode; isdir = S_ISDIR(inode->i_mode); - if (dentry->d_count == 1) { + if (dentry->d_lockref.count == 1) { if (!spin_trylock(&inode->i_lock)) { spin_unlock(&dentry->d_lock); cpu_relax(); @@ -2948,7 +2937,7 @@ resume: } if (!(dentry->d_flags & DCACHE_GENOCIDE)) { dentry->d_flags |= DCACHE_GENOCIDE; - dentry->d_count--; + dentry->d_lockref.count--; } spin_unlock(&dentry->d_lock); } @@ -2956,7 +2945,7 @@ resume: struct dentry *child = this_parent; if (!(this_parent->d_flags & DCACHE_GENOCIDE)) { this_parent->d_flags |= DCACHE_GENOCIDE; - this_parent->d_count--; + this_parent->d_lockref.count--; } this_parent = try_to_ascend(this_parent, locked, seq); if (!this_parent) diff --git a/fs/namei.c b/fs/namei.c index 8b61d103a8a7..7720fbd5277b 100644 --- a/fs/namei.c +++ b/fs/namei.c @@ -536,8 +536,8 @@ static int unlazy_walk(struct nameidata *nd, struct dentry *dentry) * a reference at this point. 
*/ BUG_ON(!IS_ROOT(dentry) && dentry->d_parent != parent); - BUG_ON(!parent->d_count); - parent->d_count++; + BUG_ON(!parent->d_lockref.count); + parent->d_lockref.count++; spin_unlock(&dentry->d_lock); } spin_unlock(&parent->d_lock); @@ -3327,7 +3327,7 @@ void dentry_unhash(struct dentry *dentry) { shrink_dcache_parent(dentry); spin_lock(&dentry->d_lock); - if (dentry->d_count == 1) + if (dentry->d_lockref.count == 1) __d_drop(dentry); spin_unlock(&dentry->d_lock); } diff --git a/include/linux/dcache.h b/include/linux/dcache.h index 4a12532da8c4..efdc94434c30 100644 --- a/include/linux/dcache.h +++ b/include/linux/dcache.h @@ -9,6 +9,7 @@ #include #include #include +#include struct nameidata; struct path; @@ -100,6 +101,8 @@ extern unsigned int full_name_hash(const unsigned char *, unsigned int); # endif #endif +#define d_lock d_lockref.lock + struct dentry { /* RCU lookup touched fields */ unsigned int d_flags; /* protected by d_lock */ @@ -112,8 +115,7 @@ struct dentry { unsigned char d_iname[DNAME_INLINE_LEN]; /* small names */ /* Ref lookup also touches following */ - unsigned int d_count; /* protected by d_lock */ - spinlock_t d_lock; /* per dentry lock */ + struct lockref d_lockref; /* per-dentry lock and refcount */ const struct dentry_operations *d_op; struct super_block *d_sb; /* The root of the dentry tree */ unsigned long d_time; /* used by d_revalidate */ @@ -318,7 +320,7 @@ static inline int __d_rcu_to_refcount(struct dentry *dentry, unsigned seq) assert_spin_locked(&dentry->d_lock); if (!read_seqcount_retry(&dentry->d_seq, seq)) { ret = 1; - dentry->d_count++; + dentry->d_lockref.count++; } return ret; @@ -326,7 +328,7 @@ static inline int __d_rcu_to_refcount(struct dentry *dentry, unsigned seq) static inline unsigned d_count(const struct dentry *dentry) { - return dentry->d_count; + return dentry->d_lockref.count; } /* validate "insecure" dentry pointer */ @@ -357,17 +359,14 @@ extern char *dentry_path(struct dentry *, char *, int); static inline struct dentry *dget_dlock(struct dentry *dentry) { if (dentry) - dentry->d_count++; + dentry->d_lockref.count++; return dentry; } static inline struct dentry *dget(struct dentry *dentry) { - if (dentry) { - spin_lock(&dentry->d_lock); - dget_dlock(dentry); - spin_unlock(&dentry->d_lock); - } + if (dentry) + lockref_get(&dentry->d_lockref); return dentry; } -- cgit v1.2.3-71-gd317
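Editor's note: the following is a minimal usage sketch, not part of either
patch above. It shows how a caller might use the lockref primitives from
include/linux/lockref.h in the same style the dcache conversion uses them;
the object type and helpers (struct foo, foo_get, foo_put) are hypothetical
names invented for illustration.

/*
 * Hypothetical object embedding a lockref. "struct foo", foo_get() and
 * foo_put() are illustrative names only; they do not exist in the kernel.
 */
#include <linux/lockref.h>
#include <linux/slab.h>
#include <linux/spinlock.h>

struct foo {
	struct lockref ref;	/* spinlock and reference count together */
	/* ... payload ... */
};

/* Valid only while the caller already holds a reference (count > 0). */
static void foo_get(struct foo *f)
{
	lockref_get(&f->ref);
}

static void foo_put(struct foo *f)
{
	/* Fast path: count was > 1, it has been decremented, nothing to do. */
	if (lockref_put_or_lock(&f->ref))
		return;

	/*
	 * Slow path: the count was <= 1, and lockref_put_or_lock() returned
	 * with ref.lock held and the count untouched, so the final teardown
	 * happens under the lock, the same shape dput() takes after the
	 * conversion above.
	 */
	f->ref.count--;
	spin_unlock(&f->ref.lock);
	kfree(f);
}

For lookup paths where the caller does not already hold a reference,
lockref_get_not_zero() covers the complementary pattern: it takes a
reference only if the count has not already dropped to zero.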
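Editor's note: a second sketch of the aliasing trick that keeps the dcache
conversion purely mechanical. The patch adds "#define d_lock d_lockref.lock"
to dcache.h, so the many existing spin_lock(&dentry->d_lock) call sites keep
compiling unchanged while the reference count moves into the embedded
lockref. The struct and helpers below (toy_dentry, toy_d_count,
toy_dget_dlock) are made up for illustration; only the #define mirrors the
real patch.

#include <linux/lockref.h>
#include <linux/spinlock.h>

#define d_lock d_lockref.lock	/* old field name still works at call sites */

struct toy_dentry {
	/* was: unsigned int d_count; spinlock_t d_lock; */
	struct lockref d_lockref;
};

static unsigned toy_d_count(const struct toy_dentry *dentry)
{
	return dentry->d_lockref.count;	/* what d_count() now reads */
}

static void toy_dget_dlock(struct toy_dentry *dentry)
{
	/* &dentry->d_lock expands to &dentry->d_lockref.lock */
	assert_spin_locked(&dentry->d_lock);
	dentry->d_lockref.count++;
}

Because of this alias, only the sites that touched d_count directly needed
editing, which is why the diff above is almost entirely a d_count to
d_lockref.count rename.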