From 6d21af4f7d0ab660b24c8635f4ed577f40cd2978 Mon Sep 17 00:00:00 2001
From: Javi Merino <javi.merino@arm.com>
Date: Wed, 26 Oct 2011 10:16:11 +0100
Subject: irq: Fix comment typo ist->is

Signed-off-by: Javi Merino <javi.merino@arm.com>
Signed-off-by: Jiri Kosina <jkosina@suse.cz>
---
 kernel/irq/manage.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

(limited to 'kernel')

diff --git a/kernel/irq/manage.c b/kernel/irq/manage.c
index 9b956fa20308..3261c4d478a2 100644
--- a/kernel/irq/manage.c
+++ b/kernel/irq/manage.c
@@ -1281,7 +1281,7 @@ EXPORT_SYMBOL(free_irq);
  *	and to set up the interrupt handler in the right order.
  *
  *	If you want to set up a threaded irq handler for your device
- *	then you need to supply @handler and @thread_fn. @handler ist
+ *	then you need to supply @handler and @thread_fn. @handler is
  *	still called in hard interrupt context and has to check
  *	whether the interrupt originates from the device. If yes it
  *	needs to disable the interrupt on the device and return
-- 
cgit v1.2.3-71-gd317


From 50fb4f7fc907efff65eadb0b74387a9ffed6e849 Mon Sep 17 00:00:00 2001
From: Tejun Heo <tj@kernel.org>
Date: Mon, 21 Nov 2011 12:32:22 -0800
Subject: freezer: fix current->state restoration race in refrigerator()

refrigerator() saves current->state before entering frozen state and
restores it before returning using __set_current_state(); however,
this is racy, for example, please consider the following sequence.

	set_current_state(TASK_INTERRUPTIBLE);
	try_to_freeze();
	if (kthread_should_stop())
		break;
	schedule();

If kthread_stop() races with ->state restoration, the restoration can
restore ->state to TASK_INTERRUPTIBLE after kthread_stop() sets it to
TASK_RUNNING but kthread_should_stop() may still see zero
->should_stop because there's no memory barrier between restoring
TASK_INTERRUPTIBLE and kthread_should_stop() test.

This isn't restricted to kthread_should_stop().  current->state is
often used in memory barrier based synchronization and silently
restoring it w/o mb breaks them.

Use set_current_state() instead.

Signed-off-by: Tejun Heo <tj@kernel.org>
---
 kernel/freezer.c | 8 +++++++-
 1 file changed, 7 insertions(+), 1 deletion(-)

(limited to 'kernel')

diff --git a/kernel/freezer.c b/kernel/freezer.c
index 7be56c534397..3f460104a9d6 100644
--- a/kernel/freezer.c
+++ b/kernel/freezer.c
@@ -58,7 +58,13 @@ void refrigerator(void)
 	current->flags &= ~PF_FREEZING;
 
 	pr_debug("%s left refrigerator\n", current->comm);
-	__set_current_state(save);
+
+	/*
+	 * Restore saved task state before returning.  The mb'd version
+	 * needs to be used; otherwise, it might silently break
+	 * synchronization which depends on ordered task state change.
+	 */
+	set_current_state(save);
 }
 EXPORT_SYMBOL(refrigerator);
 
-- 
cgit v1.2.3-71-gd317


From a0acae0e886d44bd5ce6d2f173c1ace0fcf0d9f6 Mon Sep 17 00:00:00 2001
From: Tejun Heo <tj@kernel.org>
Date: Mon, 21 Nov 2011 12:32:22 -0800
Subject: freezer: unexport refrigerator() and update try_to_freeze() slightly

There is no reason to export two functions for entering the
refrigerator.  Calling refrigerator() instead of try_to_freeze()
doesn't save anything noticeable or removes any race condition.

* Rename refrigerator() to __refrigerator() and make it return bool
  indicating whether it scheduled out for freezing.

* Update try_to_freeze() to return bool and relay the return value of
  __refrigerator() if freezing().

* Convert all refrigerator() users to try_to_freeze().

* Update documentation accordingly.

* While at it, add might_sleep() to try_to_freeze().

Signed-off-by: Tejun Heo <tj@kernel.org>
Cc: Samuel Ortiz <samuel@sortiz.org>
Cc: Chris Mason <chris.mason@oracle.com>
Cc: "Theodore Ts'o" <tytso@mit.edu>
Cc: Steven Whitehouse <swhiteho@redhat.com>
Cc: Andrew Morton <akpm@linux-foundation.org>
Cc: Jan Kara <jack@suse.cz>
Cc: KONISHI Ryusuke <konishi.ryusuke@lab.ntt.co.jp>
Cc: Christoph Hellwig <hch@infradead.org>
---
 Documentation/power/freezing-of-tasks.txt | 12 ++++++------
 drivers/net/irda/stir4200.c               |  2 +-
 fs/btrfs/async-thread.c                   |  2 +-
 fs/btrfs/disk-io.c                        |  8 ++------
 fs/ext4/super.c                           |  3 +--
 fs/gfs2/log.c                             |  4 ++--
 fs/gfs2/quota.c                           |  4 ++--
 fs/jbd/journal.c                          |  2 +-
 fs/jbd2/journal.c                         |  2 +-
 fs/jfs/jfs_logmgr.c                       |  2 +-
 fs/jfs/jfs_txnmgr.c                       |  4 ++--
 fs/nilfs2/segment.c                       |  2 +-
 fs/xfs/xfs_buf.c                          |  2 +-
 include/linux/freezer.h                   | 17 ++++++++---------
 kernel/freezer.c                          | 10 +++++++---
 15 files changed, 37 insertions(+), 39 deletions(-)

(limited to 'kernel')

diff --git a/Documentation/power/freezing-of-tasks.txt b/Documentation/power/freezing-of-tasks.txt
index 587e0828053f..3ab9fbd2800a 100644
--- a/Documentation/power/freezing-of-tasks.txt
+++ b/Documentation/power/freezing-of-tasks.txt
@@ -21,7 +21,7 @@ freeze_processes() (defined in kernel/power/process.c) is called.  It executes
 try_to_freeze_tasks() that sets TIF_FREEZE for all of the freezable tasks and
 either wakes them up, if they are kernel threads, or sends fake signals to them,
 if they are user space processes.  A task that has TIF_FREEZE set, should react
-to it by calling the function called refrigerator() (defined in
+to it by calling the function called __refrigerator() (defined in
 kernel/freezer.c), which sets the task's PF_FROZEN flag, changes its state
 to TASK_UNINTERRUPTIBLE and makes it loop until PF_FROZEN is cleared for it.
 Then, we say that the task is 'frozen' and therefore the set of functions
@@ -29,10 +29,10 @@ handling this mechanism is referred to as 'the freezer' (these functions are
 defined in kernel/power/process.c, kernel/freezer.c & include/linux/freezer.h).
 User space processes are generally frozen before kernel threads.
 
-It is not recommended to call refrigerator() directly.  Instead, it is
-recommended to use the try_to_freeze() function (defined in
-include/linux/freezer.h), that checks the task's TIF_FREEZE flag and makes the
-task enter refrigerator() if the flag is set.
+__refrigerator() must not be called directly.  Instead, use the
+try_to_freeze() function (defined in include/linux/freezer.h), that checks
+the task's TIF_FREEZE flag and makes the task enter __refrigerator() if the
+flag is set.
 
 For user space processes try_to_freeze() is called automatically from the
 signal-handling code, but the freezable kernel threads need to call it
@@ -61,7 +61,7 @@ wait_event_freezable() and wait_event_freezable_timeout() macros.
 After the system memory state has been restored from a hibernation image and
 devices have been reinitialized, the function thaw_processes() is called in
 order to clear the PF_FROZEN flag for each frozen task.  Then, the tasks that
-have been frozen leave refrigerator() and continue running.
+have been frozen leave __refrigerator() and continue running.
 
 III. Which kernel threads are freezable?
 
diff --git a/drivers/net/irda/stir4200.c b/drivers/net/irda/stir4200.c
index 41c96b3d8152..e880c79d7bd8 100644
--- a/drivers/net/irda/stir4200.c
+++ b/drivers/net/irda/stir4200.c
@@ -750,7 +750,7 @@ static int stir_transmit_thread(void *arg)
 
 			write_reg(stir, REG_CTRL1, CTRL1_TXPWD|CTRL1_RXPWD);
 
-			refrigerator();
+			try_to_freeze();
 
 			if (change_speed(stir, stir->speed))
 				break;
diff --git a/fs/btrfs/async-thread.c b/fs/btrfs/async-thread.c
index 7ec14097fef1..98ab240072e5 100644
--- a/fs/btrfs/async-thread.c
+++ b/fs/btrfs/async-thread.c
@@ -340,7 +340,7 @@ again:
 		if (freezing(current)) {
 			worker->working = 0;
 			spin_unlock_irq(&worker->lock);
-			refrigerator();
+			try_to_freeze();
 		} else {
 			spin_unlock_irq(&worker->lock);
 			if (!kthread_should_stop()) {
diff --git a/fs/btrfs/disk-io.c b/fs/btrfs/disk-io.c
index 62afe5c5694e..622654fe051f 100644
--- a/fs/btrfs/disk-io.c
+++ b/fs/btrfs/disk-io.c
@@ -1579,9 +1579,7 @@ static int cleaner_kthread(void *arg)
 			btrfs_run_defrag_inodes(root->fs_info);
 		}
 
-		if (freezing(current)) {
-			refrigerator();
-		} else {
+		if (!try_to_freeze()) {
 			set_current_state(TASK_INTERRUPTIBLE);
 			if (!kthread_should_stop())
 				schedule();
@@ -1635,9 +1633,7 @@ sleep:
 		wake_up_process(root->fs_info->cleaner_kthread);
 		mutex_unlock(&root->fs_info->transaction_kthread_mutex);
 
-		if (freezing(current)) {
-			refrigerator();
-		} else {
+		if (!try_to_freeze()) {
 			set_current_state(TASK_INTERRUPTIBLE);
 			if (!kthread_should_stop() &&
 			    !btrfs_transaction_blocked(root->fs_info))
diff --git a/fs/ext4/super.c b/fs/ext4/super.c
index 9953d80145ad..877350ef0253 100644
--- a/fs/ext4/super.c
+++ b/fs/ext4/super.c
@@ -2882,8 +2882,7 @@ cont_thread:
 		}
 		mutex_unlock(&eli->li_list_mtx);
 
-		if (freezing(current))
-			refrigerator();
+		try_to_freeze();
 
 		cur = jiffies;
 		if ((time_after_eq(cur, next_wakeup)) ||
diff --git a/fs/gfs2/log.c b/fs/gfs2/log.c
index 598646434362..8154d42e4647 100644
--- a/fs/gfs2/log.c
+++ b/fs/gfs2/log.c
@@ -951,8 +951,8 @@ int gfs2_logd(void *data)
 			wake_up(&sdp->sd_log_waitq);
 
 		t = gfs2_tune_get(sdp, gt_logd_secs) * HZ;
-		if (freezing(current))
-			refrigerator();
+
+		try_to_freeze();
 
 		do {
 			prepare_to_wait(&sdp->sd_logd_waitq, &wait,
diff --git a/fs/gfs2/quota.c b/fs/gfs2/quota.c
index 7e528dc14f85..d49669e92652 100644
--- a/fs/gfs2/quota.c
+++ b/fs/gfs2/quota.c
@@ -1427,8 +1427,8 @@ int gfs2_quotad(void *data)
 		/* Check for & recover partially truncated inodes */
 		quotad_check_trunc_list(sdp);
 
-		if (freezing(current))
-			refrigerator();
+		try_to_freeze();
+
 		t = min(quotad_timeo, statfs_timeo);
 
 		prepare_to_wait(&sdp->sd_quota_wait, &wait, TASK_INTERRUPTIBLE);
diff --git a/fs/jbd/journal.c b/fs/jbd/journal.c
index fea8dd661d2b..a96cff0c5f1d 100644
--- a/fs/jbd/journal.c
+++ b/fs/jbd/journal.c
@@ -166,7 +166,7 @@ loop:
 		 */
 		jbd_debug(1, "Now suspending kjournald\n");
 		spin_unlock(&journal->j_state_lock);
-		refrigerator();
+		try_to_freeze();
 		spin_lock(&journal->j_state_lock);
 	} else {
 		/*
diff --git a/fs/jbd2/journal.c b/fs/jbd2/journal.c
index 0fa0123151d3..c0a5f9f1b127 100644
--- a/fs/jbd2/journal.c
+++ b/fs/jbd2/journal.c
@@ -173,7 +173,7 @@ loop:
 		 */
 		jbd_debug(1, "Now suspending kjournald2\n");
 		write_unlock(&journal->j_state_lock);
-		refrigerator();
+		try_to_freeze();
 		write_lock(&journal->j_state_lock);
 	} else {
 		/*
diff --git a/fs/jfs/jfs_logmgr.c b/fs/jfs/jfs_logmgr.c
index cc5f811ed383..2eb952c41a69 100644
--- a/fs/jfs/jfs_logmgr.c
+++ b/fs/jfs/jfs_logmgr.c
@@ -2349,7 +2349,7 @@ int jfsIOWait(void *arg)
 
 		if (freezing(current)) {
 			spin_unlock_irq(&log_redrive_lock);
-			refrigerator();
+			try_to_freeze();
 		} else {
 			set_current_state(TASK_INTERRUPTIBLE);
 			spin_unlock_irq(&log_redrive_lock);
diff --git a/fs/jfs/jfs_txnmgr.c b/fs/jfs/jfs_txnmgr.c
index af9606057dde..bb8b661bcc50 100644
--- a/fs/jfs/jfs_txnmgr.c
+++ b/fs/jfs/jfs_txnmgr.c
@@ -2800,7 +2800,7 @@ int jfs_lazycommit(void *arg)
 
 		if (freezing(current)) {
 			LAZY_UNLOCK(flags);
-			refrigerator();
+			try_to_freeze();
 		} else {
 			DECLARE_WAITQUEUE(wq, current);
 
@@ -2994,7 +2994,7 @@ int jfs_sync(void *arg)
 
 		if (freezing(current)) {
 			TXN_UNLOCK();
-			refrigerator();
+			try_to_freeze();
 		} else {
 			set_current_state(TASK_INTERRUPTIBLE);
 			TXN_UNLOCK();
diff --git a/fs/nilfs2/segment.c b/fs/nilfs2/segment.c
index bb24ab6c282f..0e72ad6f22aa 100644
--- a/fs/nilfs2/segment.c
+++ b/fs/nilfs2/segment.c
@@ -2470,7 +2470,7 @@ static int nilfs_segctor_thread(void *arg)
 
 	if (freezing(current)) {
 		spin_unlock(&sci->sc_state_lock);
-		refrigerator();
+		try_to_freeze();
 		spin_lock(&sci->sc_state_lock);
 	} else {
 		DEFINE_WAIT(wait);
diff --git a/fs/xfs/xfs_buf.c b/fs/xfs/xfs_buf.c
index cf0ac056815f..018829936d6d 100644
--- a/fs/xfs/xfs_buf.c
+++ b/fs/xfs/xfs_buf.c
@@ -1703,7 +1703,7 @@ xfsbufd(
 
 		if (unlikely(freezing(current))) {
 			set_bit(XBT_FORCE_SLEEP, &target->bt_flags);
-			refrigerator();
+			try_to_freeze();
 		} else {
 			clear_bit(XBT_FORCE_SLEEP, &target->bt_flags);
 		}
diff --git a/include/linux/freezer.h b/include/linux/freezer.h
index a5386e3ee756..7a9427e9fe47 100644
--- a/include/linux/freezer.h
+++ b/include/linux/freezer.h
@@ -47,18 +47,17 @@ static inline bool should_send_signal(struct task_struct *p)
 /* Takes and releases task alloc lock using task_lock() */
 extern int thaw_process(struct task_struct *p);
 
-extern void refrigerator(void);
+extern bool __refrigerator(void);
 extern int freeze_processes(void);
 extern int freeze_kernel_threads(void);
 extern void thaw_processes(void);
 
-static inline int try_to_freeze(void)
+static inline bool try_to_freeze(void)
 {
-	if (freezing(current)) {
-		refrigerator();
-		return 1;
-	} else
-		return 0;
+	might_sleep();
+	if (likely(!freezing(current)))
+		return false;
+	return __refrigerator();
 }
 
 extern bool freeze_task(struct task_struct *p, bool sig_only);
@@ -181,12 +180,12 @@ static inline void set_freeze_flag(struct task_struct *p) {}
 static inline void clear_freeze_flag(struct task_struct *p) {}
 static inline int thaw_process(struct task_struct *p) { return 1; }
 
-static inline void refrigerator(void) {}
+static inline bool __refrigerator(void) { return false; }
 static inline int freeze_processes(void) { return -ENOSYS; }
 static inline int freeze_kernel_threads(void) { return -ENOSYS; }
 static inline void thaw_processes(void) {}
 
-static inline int try_to_freeze(void) { return 0; }
+static inline bool try_to_freeze(void) { return false; }
 
 static inline void freezer_do_not_count(void) {}
 static inline void freezer_count(void) {}
diff --git a/kernel/freezer.c b/kernel/freezer.c
index 3f460104a9d6..732f14f5944f 100644
--- a/kernel/freezer.c
+++ b/kernel/freezer.c
@@ -23,10 +23,11 @@ static inline void frozen_process(void)
 }
 
 /* Refrigerator is place where frozen processes are stored :-). */
-void refrigerator(void)
+bool __refrigerator(void)
 {
 	/* Hmm, should we be allowed to suspend when there are realtime
 	   processes around? */
+	bool was_frozen = false;
 	long save;
 
 	task_lock(current);
@@ -35,7 +36,7 @@ void refrigerator(void)
 		task_unlock(current);
 	} else {
 		task_unlock(current);
-		return;
+		return was_frozen;
 	}
 	save = current->state;
 	pr_debug("%s entered refrigerator\n", current->comm);
@@ -51,6 +52,7 @@ void refrigerator(void)
 		set_current_state(TASK_UNINTERRUPTIBLE);
 		if (!frozen(current))
 			break;
+		was_frozen = true;
 		schedule();
 	}
 
@@ -65,8 +67,10 @@ void refrigerator(void)
 	 * synchronization which depends on ordered task state change.
 	 */
 	set_current_state(save);
+
+	return was_frozen;
 }
-EXPORT_SYMBOL(refrigerator);
+EXPORT_SYMBOL(__refrigerator);
 
 static void fake_signal_wake_up(struct task_struct *p)
 {
-- 
cgit v1.2.3-71-gd317


From 8a32c441c1609f80e55df75422324a1151208f40 Mon Sep 17 00:00:00 2001
From: Tejun Heo <tj@kernel.org>
Date: Mon, 21 Nov 2011 12:32:23 -0800
Subject: freezer: implement and use kthread_freezable_should_stop()

Writeback and thinkpad_acpi have been using thaw_process() to prevent
deadlock between the freezer and kthread_stop(); unfortunately, this
is inherently racy - nothing prevents freezing from happening between
thaw_process() and kthread_stop().

This patch implements kthread_freezable_should_stop() which enters
refrigerator if necessary but is guaranteed to return if
kthread_stop() is invoked.  Both thaw_process() users are converted to
use the new function.

Note that this deadlock condition exists for many of freezable
kthreads.  They need to be converted to use the new should_stop or
freezable workqueue.

Tested with synthetic test case.

Signed-off-by: Tejun Heo <tj@kernel.org>
Acked-by: Henrique de Moraes Holschuh <ibm-acpi@hmh.eng.br>
Cc: Jens Axboe <axboe@kernel.dk>
Cc: Oleg Nesterov <oleg@redhat.com>
---
 drivers/platform/x86/thinkpad_acpi.c | 15 ++++++---------
 fs/fs-writeback.c                    |  4 +---
 include/linux/freezer.h              |  6 +++---
 include/linux/kthread.h              |  1 +
 kernel/freezer.c                     |  6 ++++--
 kernel/kthread.c                     | 25 +++++++++++++++++++++++++
 mm/backing-dev.c                     |  8 ++------
 7 files changed, 42 insertions(+), 23 deletions(-)

(limited to 'kernel')

diff --git a/drivers/platform/x86/thinkpad_acpi.c b/drivers/platform/x86/thinkpad_acpi.c
index 7b828680b21d..4b11fc91fa7d 100644
--- a/drivers/platform/x86/thinkpad_acpi.c
+++ b/drivers/platform/x86/thinkpad_acpi.c
@@ -2456,8 +2456,9 @@ static int hotkey_kthread(void *data)
 	u32 poll_mask, event_mask;
 	unsigned int si, so;
 	unsigned long t;
-	unsigned int change_detector, must_reset;
+	unsigned int change_detector;
 	unsigned int poll_freq;
+	bool was_frozen;
 
 	mutex_lock(&hotkey_thread_mutex);
 
@@ -2488,14 +2489,14 @@ static int hotkey_kthread(void *data)
 				t = 100;	/* should never happen... */
 		}
 		t = msleep_interruptible(t);
-		if (unlikely(kthread_should_stop()))
+		if (unlikely(kthread_freezable_should_stop(&was_frozen)))
 			break;
-		must_reset = try_to_freeze();
-		if (t > 0 && !must_reset)
+
+		if (t > 0 && !was_frozen)
 			continue;
 
 		mutex_lock(&hotkey_thread_data_mutex);
-		if (must_reset || hotkey_config_change != change_detector) {
+		if (was_frozen || hotkey_config_change != change_detector) {
 			/* forget old state on thaw or config change */
 			si = so;
 			t = 0;
@@ -2528,10 +2529,6 @@ exit:
 static void hotkey_poll_stop_sync(void)
 {
 	if (tpacpi_hotkey_task) {
-		if (frozen(tpacpi_hotkey_task) ||
-		    freezing(tpacpi_hotkey_task))
-			thaw_process(tpacpi_hotkey_task);
-
 		kthread_stop(tpacpi_hotkey_task);
 		tpacpi_hotkey_task = NULL;
 		mutex_lock(&hotkey_thread_mutex);
diff --git a/fs/fs-writeback.c b/fs/fs-writeback.c
index 73c3992b2bb4..271fde50f0ee 100644
--- a/fs/fs-writeback.c
+++ b/fs/fs-writeback.c
@@ -947,7 +947,7 @@ int bdi_writeback_thread(void *data)
 
 	trace_writeback_thread_start(bdi);
 
-	while (!kthread_should_stop()) {
+	while (!kthread_freezable_should_stop(NULL)) {
 		/*
 		 * Remove own delayed wake-up timer, since we are already awake
 		 * and we'll take care of the preriodic write-back.
@@ -977,8 +977,6 @@ int bdi_writeback_thread(void *data)
 			 */
 			schedule();
 		}
-
-		try_to_freeze();
 	}
 
 	/* Flush any work that raced with us exiting */
diff --git a/include/linux/freezer.h b/include/linux/freezer.h
index 7a9427e9fe47..d02b78448b0f 100644
--- a/include/linux/freezer.h
+++ b/include/linux/freezer.h
@@ -47,7 +47,7 @@ static inline bool should_send_signal(struct task_struct *p)
 /* Takes and releases task alloc lock using task_lock() */
 extern int thaw_process(struct task_struct *p);
 
-extern bool __refrigerator(void);
+extern bool __refrigerator(bool check_kthr_stop);
 extern int freeze_processes(void);
 extern int freeze_kernel_threads(void);
 extern void thaw_processes(void);
@@ -57,7 +57,7 @@ static inline bool try_to_freeze(void)
 	might_sleep();
 	if (likely(!freezing(current)))
 		return false;
-	return __refrigerator();
+	return __refrigerator(false);
 }
 
 extern bool freeze_task(struct task_struct *p, bool sig_only);
@@ -180,7 +180,7 @@ static inline void set_freeze_flag(struct task_struct *p) {}
 static inline void clear_freeze_flag(struct task_struct *p) {}
 static inline int thaw_process(struct task_struct *p) { return 1; }
 
-static inline bool __refrigerator(void) { return false; }
+static inline bool __refrigerator(bool check_kthr_stop) { return false; }
 static inline int freeze_processes(void) { return -ENOSYS; }
 static inline int freeze_kernel_threads(void) { return -ENOSYS; }
 static inline void thaw_processes(void) {}
diff --git a/include/linux/kthread.h b/include/linux/kthread.h
index 5cac19b3a266..0714b24c0e45 100644
--- a/include/linux/kthread.h
+++ b/include/linux/kthread.h
@@ -35,6 +35,7 @@ struct task_struct *kthread_create_on_node(int (*threadfn)(void *data),
 void kthread_bind(struct task_struct *k, unsigned int cpu);
 int kthread_stop(struct task_struct *k);
 int kthread_should_stop(void);
+bool kthread_freezable_should_stop(bool *was_frozen);
 void *kthread_data(struct task_struct *k);
 
 int kthreadd(void *unused);
diff --git a/kernel/freezer.c b/kernel/freezer.c
index 732f14f5944f..b83c30e9483a 100644
--- a/kernel/freezer.c
+++ b/kernel/freezer.c
@@ -9,6 +9,7 @@
 #include <linux/export.h>
 #include <linux/syscalls.h>
 #include <linux/freezer.h>
+#include <linux/kthread.h>
 
 /*
  * freezing is complete, mark current process as frozen
@@ -23,7 +24,7 @@ static inline void frozen_process(void)
 }
 
 /* Refrigerator is place where frozen processes are stored :-). */
-bool __refrigerator(void)
+bool __refrigerator(bool check_kthr_stop)
 {
 	/* Hmm, should we be allowed to suspend when there are realtime
 	   processes around? */
@@ -50,7 +51,8 @@ bool __refrigerator(void)
 
 	for (;;) {
 		set_current_state(TASK_UNINTERRUPTIBLE);
-		if (!frozen(current))
+		if (!frozen(current) ||
+		    (check_kthr_stop && kthread_should_stop()))
 			break;
 		was_frozen = true;
 		schedule();
diff --git a/kernel/kthread.c b/kernel/kthread.c
index b6d216a92639..1c36deaae2f1 100644
--- a/kernel/kthread.c
+++ b/kernel/kthread.c
@@ -58,6 +58,31 @@ int kthread_should_stop(void)
 }
 EXPORT_SYMBOL(kthread_should_stop);
 
+/**
+ * kthread_freezable_should_stop - should this freezable kthread return now?
+ * @was_frozen: optional out parameter, indicates whether %current was frozen
+ *
+ * kthread_should_stop() for freezable kthreads, which will enter
+ * refrigerator if necessary.  This function is safe from kthread_stop() /
+ * freezer deadlock and freezable kthreads should use this function instead
+ * of calling try_to_freeze() directly.
+ */
+bool kthread_freezable_should_stop(bool *was_frozen)
+{
+	bool frozen = false;
+
+	might_sleep();
+
+	if (unlikely(freezing(current)))
+		frozen = __refrigerator(true);
+
+	if (was_frozen)
+		*was_frozen = frozen;
+
+	return kthread_should_stop();
+}
+EXPORT_SYMBOL_GPL(kthread_freezable_should_stop);
+
 /**
  * kthread_data - return data value specified on kthread creation
  * @task: kthread task in question
diff --git a/mm/backing-dev.c b/mm/backing-dev.c
index 71034f41a2ba..7ba8feae11b8 100644
--- a/mm/backing-dev.c
+++ b/mm/backing-dev.c
@@ -600,14 +600,10 @@ static void bdi_wb_shutdown(struct backing_dev_info *bdi)
 
 	/*
 	 * Finally, kill the kernel thread. We don't need to be RCU
-	 * safe anymore, since the bdi is gone from visibility. Force
-	 * unfreeze of the thread before calling kthread_stop(), otherwise
-	 * it would never exet if it is currently stuck in the refrigerator.
+	 * safe anymore, since the bdi is gone from visibility.
 	 */
-	if (bdi->wb.task) {
-		thaw_process(bdi->wb.task);
+	if (bdi->wb.task)
 		kthread_stop(bdi->wb.task);
-	}
 }
 
 /*
-- 
cgit v1.2.3-71-gd317


From a5be2d0d1a8746e7be5210e3d6b904455000443c Mon Sep 17 00:00:00 2001
From: Tejun Heo <tj@kernel.org>
Date: Mon, 21 Nov 2011 12:32:23 -0800
Subject: freezer: rename thaw_process() to __thaw_task() and simplify the
 implementation

thaw_process() now has only internal users - system and cgroup
freezers.  Remove the unnecessary return value, rename, unexport and
collapse __thaw_process() into it.  This will help further updates to
the freezer code.

-v3: oom_kill grew a use of thaw_process() while this patch was
     pending.  Convert it to use __thaw_task() for now.  In the longer
     term, this should be handled by allowing tasks to die if killed
     even if it's frozen.

-v2: minor style update as suggested by Matt.

Signed-off-by: Tejun Heo <tj@kernel.org>
Cc: Paul Menage <menage@google.com>
Cc: Matt Helsley <matthltc@us.ibm.com>
---
 include/linux/freezer.h |  3 +--
 kernel/cgroup_freezer.c |  7 +++----
 kernel/freezer.c        | 31 ++++++++++++-------------------
 kernel/power/process.c  |  2 +-
 mm/oom_kill.c           |  2 +-
 5 files changed, 18 insertions(+), 27 deletions(-)

(limited to 'kernel')

diff --git a/include/linux/freezer.h b/include/linux/freezer.h
index d02b78448b0f..ba4f512d2938 100644
--- a/include/linux/freezer.h
+++ b/include/linux/freezer.h
@@ -45,7 +45,7 @@ static inline bool should_send_signal(struct task_struct *p)
 }
 
 /* Takes and releases task alloc lock using task_lock() */
-extern int thaw_process(struct task_struct *p);
+extern void __thaw_task(struct task_struct *t);
 
 extern bool __refrigerator(bool check_kthr_stop);
 extern int freeze_processes(void);
@@ -178,7 +178,6 @@ static inline int frozen(struct task_struct *p) { return 0; }
 static inline int freezing(struct task_struct *p) { return 0; }
 static inline void set_freeze_flag(struct task_struct *p) {}
 static inline void clear_freeze_flag(struct task_struct *p) {}
-static inline int thaw_process(struct task_struct *p) { return 1; }
 
 static inline bool __refrigerator(bool check_kthr_stop) { return false; }
 static inline int freeze_processes(void) { return -ENOSYS; }
diff --git a/kernel/cgroup_freezer.c b/kernel/cgroup_freezer.c
index 5e828a2ca8e6..a6d405a86ee0 100644
--- a/kernel/cgroup_freezer.c
+++ b/kernel/cgroup_freezer.c
@@ -130,7 +130,7 @@ struct cgroup_subsys freezer_subsys;
  *   write_lock css_set_lock (cgroup iterator start)
  *    task->alloc_lock
  *   read_lock css_set_lock (cgroup iterator start)
- *    task->alloc_lock (inside thaw_process(), prevents race with refrigerator())
+ *    task->alloc_lock (inside __thaw_task(), prevents race with refrigerator())
  *     sighand->siglock
  */
 static struct cgroup_subsys_state *freezer_create(struct cgroup_subsys *ss,
@@ -300,9 +300,8 @@ static void unfreeze_cgroup(struct cgroup *cgroup, struct freezer *freezer)
 	struct task_struct *task;
 
 	cgroup_iter_start(cgroup, &it);
-	while ((task = cgroup_iter_next(cgroup, &it))) {
-		thaw_process(task);
-	}
+	while ((task = cgroup_iter_next(cgroup, &it)))
+		__thaw_task(task);
 	cgroup_iter_end(cgroup, &it);
 
 	freezer->state = CGROUP_THAWED;
diff --git a/kernel/freezer.c b/kernel/freezer.c
index b83c30e9483a..c851d588e29f 100644
--- a/kernel/freezer.c
+++ b/kernel/freezer.c
@@ -145,18 +145,8 @@ void cancel_freezing(struct task_struct *p)
 	}
 }
 
-static int __thaw_process(struct task_struct *p)
-{
-	if (frozen(p)) {
-		p->flags &= ~PF_FROZEN;
-		return 1;
-	}
-	clear_freeze_flag(p);
-	return 0;
-}
-
 /*
- * Wake up a frozen process
+ * Wake up a frozen task
  *
  * task_lock() is needed to prevent the race with refrigerator() which may
  * occur if the freezing of tasks fails.  Namely, without the lock, if the
@@ -164,15 +154,18 @@ static int __thaw_process(struct task_struct *p)
  * refrigerator() could call frozen_process(), in which case the task would be
  * frozen and no one would thaw it.
  */
-int thaw_process(struct task_struct *p)
+void __thaw_task(struct task_struct *p)
 {
+	bool was_frozen;
+
 	task_lock(p);
-	if (__thaw_process(p) == 1) {
-		task_unlock(p);
-		wake_up_process(p);
-		return 1;
-	}
+	was_frozen = frozen(p);
+	if (was_frozen)
+		p->flags &= ~PF_FROZEN;
+	else
+		clear_freeze_flag(p);
 	task_unlock(p);
-	return 0;
+
+	if (was_frozen)
+		wake_up_process(p);
 }
-EXPORT_SYMBOL(thaw_process);
diff --git a/kernel/power/process.c b/kernel/power/process.c
index addbbe5531bc..fe2787207f00 100644
--- a/kernel/power/process.c
+++ b/kernel/power/process.c
@@ -186,7 +186,7 @@ static void thaw_tasks(bool nosig_only)
 		if (cgroup_freezing_or_frozen(p))
 			continue;
 
-		thaw_process(p);
+		__thaw_task(p);
 	} while_each_thread(g, p);
 	read_unlock(&tasklist_lock);
 }
diff --git a/mm/oom_kill.c b/mm/oom_kill.c
index 76f2c5ae908e..3134ee2fb2e8 100644
--- a/mm/oom_kill.c
+++ b/mm/oom_kill.c
@@ -328,7 +328,7 @@ static struct task_struct *select_bad_process(unsigned int *ppoints,
 		 */
 		if (test_tsk_thread_flag(p, TIF_MEMDIE)) {
 			if (unlikely(frozen(p)))
-				thaw_process(p);
+				__thaw_task(p);
 			return ERR_PTR(-1UL);
 		}
 		if (!p->mm)
-- 
cgit v1.2.3-71-gd317


From a585042f7b933539a0b6bc63650c2d49ffb2e55d Mon Sep 17 00:00:00 2001
From: Tejun Heo <tj@kernel.org>
Date: Mon, 21 Nov 2011 12:32:23 -0800
Subject: freezer: remove racy clear_freeze_flag() and set PF_NOFREEZE on dead
 tasks

clear_freeze_flag() in exit_mm() is racy.  Freezing can start
afterwards.  Remove it.  Skipping freezer for exiting task will be
properly implemented later.

Also, freezable() was testing exit_state directly to make system
freezer ignore dead tasks.  Let the exiting task set PF_NOFREEZE after
entering TASK_DEAD instead.

Signed-off-by: Tejun Heo <tj@kernel.org>
Cc: Oleg Nesterov <oleg@redhat.com>
---
 kernel/exit.c          | 3 +--
 kernel/power/process.c | 3 +--
 2 files changed, 2 insertions(+), 4 deletions(-)

(limited to 'kernel')

diff --git a/kernel/exit.c b/kernel/exit.c
index d0b7d988f873..95a4141d07e7 100644
--- a/kernel/exit.c
+++ b/kernel/exit.c
@@ -679,8 +679,6 @@ static void exit_mm(struct task_struct * tsk)
 	tsk->mm = NULL;
 	up_read(&mm->mmap_sem);
 	enter_lazy_tlb(mm, current);
-	/* We don't want this task to be frozen prematurely */
-	clear_freeze_flag(tsk);
 	task_unlock(tsk);
 	mm_update_next_owner(mm);
 	mmput(mm);
@@ -1040,6 +1038,7 @@ NORET_TYPE void do_exit(long code)
 	exit_rcu();
 	/* causes final put_task_struct in finish_task_switch(). */
 	tsk->state = TASK_DEAD;
+	tsk->flags |= PF_NOFREEZE;	/* tell freezer to ignore us */
 	schedule();
 	BUG();
 	/* Avoid "noreturn function does return".  */
diff --git a/kernel/power/process.c b/kernel/power/process.c
index fe2787207f00..23822dc14b6c 100644
--- a/kernel/power/process.c
+++ b/kernel/power/process.c
@@ -25,8 +25,7 @@
 static inline int freezable(struct task_struct * p)
 {
 	if ((p == current) ||
-	    (p->flags & PF_NOFREEZE) ||
-	    (p->exit_state != 0))
+	    (p->flags & PF_NOFREEZE))
 		return 0;
 	return 1;
 }
-- 
cgit v1.2.3-71-gd317


From 6cd8dedcdd8e8de01391a7cf25f0b2afeb24f8f4 Mon Sep 17 00:00:00 2001
From: Tejun Heo <tj@kernel.org>
Date: Mon, 21 Nov 2011 12:32:23 -0800
Subject: freezer: don't distinguish nosig tasks on thaw

There's no point in thawing nosig tasks before others.  There's no
ordering requirement between the two groups on thaw, which the staged
thawing can't guarantee anyway.  Simplify thaw_processes() by removing
the distinction and collapsing thaw_tasks() into thaw_processes().
This will help further updates to freezer.

Signed-off-by: Tejun Heo <tj@kernel.org>
---
 kernel/power/process.c | 20 +++++++-------------
 1 file changed, 7 insertions(+), 13 deletions(-)

(limited to 'kernel')

diff --git a/kernel/power/process.c b/kernel/power/process.c
index 23822dc14b6c..9db048fb0d70 100644
--- a/kernel/power/process.c
+++ b/kernel/power/process.c
@@ -170,34 +170,28 @@ int freeze_kernel_threads(void)
 	return error;
 }
 
-static void thaw_tasks(bool nosig_only)
+void thaw_processes(void)
 {
 	struct task_struct *g, *p;
 
+	oom_killer_enable();
+
+	printk("Restarting tasks ... ");
+
+	thaw_workqueues();
+
 	read_lock(&tasklist_lock);
 	do_each_thread(g, p) {
 		if (!freezable(p))
 			continue;
 
-		if (nosig_only && should_send_signal(p))
-			continue;
-
 		if (cgroup_freezing_or_frozen(p))
 			continue;
 
 		__thaw_task(p);
 	} while_each_thread(g, p);
 	read_unlock(&tasklist_lock);
-}
-
-void thaw_processes(void)
-{
-	oom_killer_enable();
 
-	printk("Restarting tasks ... ");
-	thaw_workqueues();
-	thaw_tasks(true);
-	thaw_tasks(false);
 	schedule();
 	printk("done.\n");
 }
-- 
cgit v1.2.3-71-gd317


From 0c9af09262864a2744091ee94c98c4a8fd60c98b Mon Sep 17 00:00:00 2001
From: Tejun Heo <tj@kernel.org>
Date: Mon, 21 Nov 2011 12:32:24 -0800
Subject: freezer: use dedicated lock instead of task_lock() + memory barrier

Freezer synchronization is needlessly complicated - it's by no means a
hot path and the priority is staying unintrusive and safe.  This patch
makes it simply use a dedicated lock instead of piggy-backing on
task_lock() and playing with memory barriers.

On the failure path of try_to_freeze_tasks(), locking is moved from it
to cancel_freezing().  This makes the frozen() test racy but the race
here is a non-issue as the warning is printed for tasks which failed
to enter frozen for 20 seconds and race on PF_FROZEN at the last
moment doesn't change anything.

This simplifies freezer implementation and eases further changes
including some race fixes.

Signed-off-by: Tejun Heo <tj@kernel.org>
---
 kernel/freezer.c       | 84 ++++++++++++++++++++++----------------------------
 kernel/power/process.c |  2 --
 2 files changed, 37 insertions(+), 49 deletions(-)

(limited to 'kernel')

diff --git a/kernel/freezer.c b/kernel/freezer.c
index c851d588e29f..4130e48649bb 100644
--- a/kernel/freezer.c
+++ b/kernel/freezer.c
@@ -11,17 +11,8 @@
 #include <linux/freezer.h>
 #include <linux/kthread.h>
 
-/*
- * freezing is complete, mark current process as frozen
- */
-static inline void frozen_process(void)
-{
-	if (!unlikely(current->flags & PF_NOFREEZE)) {
-		current->flags |= PF_FROZEN;
-		smp_wmb();
-	}
-	clear_freeze_flag(current);
-}
+/* protects freezing and frozen transitions */
+static DEFINE_SPINLOCK(freezer_lock);
 
 /* Refrigerator is place where frozen processes are stored :-). */
 bool __refrigerator(bool check_kthr_stop)
@@ -31,14 +22,16 @@ bool __refrigerator(bool check_kthr_stop)
 	bool was_frozen = false;
 	long save;
 
-	task_lock(current);
-	if (freezing(current)) {
-		frozen_process();
-		task_unlock(current);
-	} else {
-		task_unlock(current);
+	spin_lock_irq(&freezer_lock);
+	if (!freezing(current)) {
+		spin_unlock_irq(&freezer_lock);
 		return was_frozen;
 	}
+	if (!(current->flags & PF_NOFREEZE))
+		current->flags |= PF_FROZEN;
+	clear_freeze_flag(current);
+	spin_unlock_irq(&freezer_lock);
+
 	save = current->state;
 	pr_debug("%s entered refrigerator\n", current->comm);
 
@@ -99,21 +92,18 @@ static void fake_signal_wake_up(struct task_struct *p)
  */
 bool freeze_task(struct task_struct *p, bool sig_only)
 {
-	/*
-	 * We first check if the task is freezing and next if it has already
-	 * been frozen to avoid the race with frozen_process() which first marks
-	 * the task as frozen and next clears its TIF_FREEZE.
-	 */
-	if (!freezing(p)) {
-		smp_rmb();
-		if (frozen(p))
-			return false;
-
-		if (!sig_only || should_send_signal(p))
-			set_freeze_flag(p);
-		else
-			return false;
-	}
+	unsigned long flags;
+	bool ret = false;
+
+	spin_lock_irqsave(&freezer_lock, flags);
+
+	if (sig_only && !should_send_signal(p))
+		goto out_unlock;
+
+	if (frozen(p))
+		goto out_unlock;
+
+	set_freeze_flag(p);
 
 	if (should_send_signal(p)) {
 		fake_signal_wake_up(p);
@@ -123,26 +113,28 @@ bool freeze_task(struct task_struct *p, bool sig_only)
 		 * TASK_RUNNING transition can't race with task state
 		 * testing in try_to_freeze_tasks().
 		 */
-	} else if (sig_only) {
-		return false;
 	} else {
 		wake_up_state(p, TASK_INTERRUPTIBLE);
 	}
-
-	return true;
+	ret = true;
+out_unlock:
+	spin_unlock_irqrestore(&freezer_lock, flags);
+	return ret;
 }
 
 void cancel_freezing(struct task_struct *p)
 {
 	unsigned long flags;
 
+	spin_lock_irqsave(&freezer_lock, flags);
 	if (freezing(p)) {
 		pr_debug("  clean up: %s\n", p->comm);
 		clear_freeze_flag(p);
-		spin_lock_irqsave(&p->sighand->siglock, flags);
+		spin_lock(&p->sighand->siglock);
 		recalc_sigpending_and_wake(p);
-		spin_unlock_irqrestore(&p->sighand->siglock, flags);
+		spin_unlock(&p->sighand->siglock);
 	}
+	spin_unlock_irqrestore(&freezer_lock, flags);
 }
 
 /*
@@ -156,16 +148,14 @@ void cancel_freezing(struct task_struct *p)
  */
 void __thaw_task(struct task_struct *p)
 {
-	bool was_frozen;
+	unsigned long flags;
 
-	task_lock(p);
-	was_frozen = frozen(p);
-	if (was_frozen)
+	spin_lock_irqsave(&freezer_lock, flags);
+	if (frozen(p)) {
 		p->flags &= ~PF_FROZEN;
-	else
-		clear_freeze_flag(p);
-	task_unlock(p);
-
-	if (was_frozen)
 		wake_up_process(p);
+	} else {
+		clear_freeze_flag(p);
+	}
+	spin_unlock_irqrestore(&freezer_lock, flags);
 }
diff --git a/kernel/power/process.c b/kernel/power/process.c
index 9db048fb0d70..bd420ca48261 100644
--- a/kernel/power/process.c
+++ b/kernel/power/process.c
@@ -118,11 +118,9 @@ static int try_to_freeze_tasks(bool sig_only)
 
 		read_lock(&tasklist_lock);
 		do_each_thread(g, p) {
-			task_lock(p);
 			if (!wakeup && freezing(p) && !freezer_should_skip(p))
 				sched_show_task(p);
 			cancel_freezing(p);
-			task_unlock(p);
 		} while_each_thread(g, p);
 		read_unlock(&tasklist_lock);
 	} else {
-- 
cgit v1.2.3-71-gd317


From 6907483b4e803a20f0b48cc9afa3817420ce61c5 Mon Sep 17 00:00:00 2001
From: Tejun Heo <tj@kernel.org>
Date: Mon, 21 Nov 2011 12:32:24 -0800
Subject: freezer: make freezing indicate freeze condition in effect

Currently freezing (TIF_FREEZE) and frozen (PF_FROZEN) states are
interlocked - freezing is set to request freeze and when the task
actually freezes, it clears freezing and sets frozen.

This interlocking makes things more complex than necessary - freezing
doesn't mean there's freezing condition in effect and frozen doesn't
match the task actually entering and leaving frozen state (it's
cleared by the thawing task).

This patch makes freezing indicate that freeze condition is in effect.
A task enters and stays frozen if freezing.  This makes PF_FROZEN
manipulation done only by the task itself and prevents wakeup from
__thaw_task() leaking outside of refrigerator.

The only place which needs to tell freezing && !frozen is
try_to_freeze_task() to whine about tasks which don't enter frozen.
It's updated to test the condition explicitly.

With the change, frozen() state my linger after __thaw_task() until
the task wakes up and exits fridge.  This can trigger BUG_ON() in
update_if_frozen().  Work it around by testing freezing() && frozen()
instead of frozen().

-v2: Oleg pointed out missing re-check of freezing() when trying to
     clear FROZEN and possible spurious BUG_ON() trigger in
     update_if_frozen().  Both fixed.

Signed-off-by: Tejun Heo <tj@kernel.org>
Cc: Oleg Nesterov <oleg@redhat.com>
Cc: Paul Menage <paul@paulmenage.org>
---
 kernel/cgroup_freezer.c |  2 +-
 kernel/freezer.c        | 42 ++++++++++++++++++++++++------------------
 kernel/power/process.c  |  3 ++-
 3 files changed, 27 insertions(+), 20 deletions(-)

(limited to 'kernel')

diff --git a/kernel/cgroup_freezer.c b/kernel/cgroup_freezer.c
index a6d405a86ee0..cd27b0825560 100644
--- a/kernel/cgroup_freezer.c
+++ b/kernel/cgroup_freezer.c
@@ -231,7 +231,7 @@ static void update_if_frozen(struct cgroup *cgroup,
 	cgroup_iter_start(cgroup, &it);
 	while ((task = cgroup_iter_next(cgroup, &it))) {
 		ntotal++;
-		if (frozen(task))
+		if (freezing(task) && frozen(task))
 			nfrozen++;
 	}
 
diff --git a/kernel/freezer.c b/kernel/freezer.c
index 4130e48649bb..a8822be43da0 100644
--- a/kernel/freezer.c
+++ b/kernel/freezer.c
@@ -22,14 +22,19 @@ bool __refrigerator(bool check_kthr_stop)
 	bool was_frozen = false;
 	long save;
 
+	/*
+	 * Enter FROZEN.  If NOFREEZE, schedule immediate thawing by
+	 * clearing freezing.
+	 */
 	spin_lock_irq(&freezer_lock);
+repeat:
 	if (!freezing(current)) {
 		spin_unlock_irq(&freezer_lock);
 		return was_frozen;
 	}
-	if (!(current->flags & PF_NOFREEZE))
-		current->flags |= PF_FROZEN;
-	clear_freeze_flag(current);
+	if (current->flags & PF_NOFREEZE)
+		clear_freeze_flag(current);
+	current->flags |= PF_FROZEN;
 	spin_unlock_irq(&freezer_lock);
 
 	save = current->state;
@@ -44,7 +49,7 @@ bool __refrigerator(bool check_kthr_stop)
 
 	for (;;) {
 		set_current_state(TASK_UNINTERRUPTIBLE);
-		if (!frozen(current) ||
+		if (!freezing(current) ||
 		    (check_kthr_stop && kthread_should_stop()))
 			break;
 		was_frozen = true;
@@ -54,6 +59,13 @@ bool __refrigerator(bool check_kthr_stop)
 	/* Remove the accounting blocker */
 	current->flags &= ~PF_FREEZING;
 
+	/* leave FROZEN */
+	spin_lock_irq(&freezer_lock);
+	if (freezing(current))
+		goto repeat;
+	current->flags &= ~PF_FROZEN;
+	spin_unlock_irq(&freezer_lock);
+
 	pr_debug("%s left refrigerator\n", current->comm);
 
 	/*
@@ -137,25 +149,19 @@ void cancel_freezing(struct task_struct *p)
 	spin_unlock_irqrestore(&freezer_lock, flags);
 }
 
-/*
- * Wake up a frozen task
- *
- * task_lock() is needed to prevent the race with refrigerator() which may
- * occur if the freezing of tasks fails.  Namely, without the lock, if the
- * freezing of tasks failed, thaw_tasks() might have run before a task in
- * refrigerator() could call frozen_process(), in which case the task would be
- * frozen and no one would thaw it.
- */
 void __thaw_task(struct task_struct *p)
 {
 	unsigned long flags;
 
+	/*
+	 * Clear freezing and kick @p if FROZEN.  Clearing is guaranteed to
+	 * be visible to @p as waking up implies wmb.  Waking up inside
+	 * freezer_lock also prevents wakeups from leaking outside
+	 * refrigerator.
+	 */
 	spin_lock_irqsave(&freezer_lock, flags);
-	if (frozen(p)) {
-		p->flags &= ~PF_FROZEN;
+	clear_freeze_flag(p);
+	if (frozen(p))
 		wake_up_process(p);
-	} else {
-		clear_freeze_flag(p);
-	}
 	spin_unlock_irqrestore(&freezer_lock, flags);
 }
diff --git a/kernel/power/process.c b/kernel/power/process.c
index bd420ca48261..e6e2739190b5 100644
--- a/kernel/power/process.c
+++ b/kernel/power/process.c
@@ -118,7 +118,8 @@ static int try_to_freeze_tasks(bool sig_only)
 
 		read_lock(&tasklist_lock);
 		do_each_thread(g, p) {
-			if (!wakeup && freezing(p) && !freezer_should_skip(p))
+			if (!wakeup && !freezer_should_skip(p) &&
+			    freezing(p) && !frozen(p))
 				sched_show_task(p);
 			cancel_freezing(p);
 		} while_each_thread(g, p);
-- 
cgit v1.2.3-71-gd317


From 85f1d476653f52c97ca75466b2494e67c1cbd25d Mon Sep 17 00:00:00 2001
From: Tejun Heo <tj@kernel.org>
Date: Mon, 21 Nov 2011 12:32:24 -0800
Subject: freezer: test freezable conditions while holding freezer_lock

try_to_freeze_tasks() and thaw_processes() use freezable() and
frozen() as preliminary tests before initiating operations on a task.
These are done without any synchronization and hinder with
synchronization cleanup without any real performance benefits.

In try_to_freeze_tasks(), open code self test and move PF_NOFREEZE and
frozen() tests inside freezer_lock in freeze_task().

thaw_processes() can simply drop freezable() test as frozen() test in
__thaw_task() is enough.

Note: This used to be a part of larger patch to fix set_freezable()
      race.  Separated out to satisfy ordering among dependent fixes.

Signed-off-by: Tejun Heo <tj@kernel.org>
Cc: Oleg Nesterov <oleg@redhat.com>
---
 kernel/freezer.c       |  3 ++-
 kernel/power/process.c | 16 +---------------
 2 files changed, 3 insertions(+), 16 deletions(-)

(limited to 'kernel')

diff --git a/kernel/freezer.c b/kernel/freezer.c
index a8822be43da0..a257ecd37c48 100644
--- a/kernel/freezer.c
+++ b/kernel/freezer.c
@@ -109,7 +109,8 @@ bool freeze_task(struct task_struct *p, bool sig_only)
 
 	spin_lock_irqsave(&freezer_lock, flags);
 
-	if (sig_only && !should_send_signal(p))
+	if ((p->flags & PF_NOFREEZE) ||
+	    (sig_only && !should_send_signal(p)))
 		goto out_unlock;
 
 	if (frozen(p))
diff --git a/kernel/power/process.c b/kernel/power/process.c
index e6e2739190b5..e59676f5811d 100644
--- a/kernel/power/process.c
+++ b/kernel/power/process.c
@@ -22,14 +22,6 @@
  */
 #define TIMEOUT	(20 * HZ)
 
-static inline int freezable(struct task_struct * p)
-{
-	if ((p == current) ||
-	    (p->flags & PF_NOFREEZE))
-		return 0;
-	return 1;
-}
-
 static int try_to_freeze_tasks(bool sig_only)
 {
 	struct task_struct *g, *p;
@@ -52,10 +44,7 @@ static int try_to_freeze_tasks(bool sig_only)
 		todo = 0;
 		read_lock(&tasklist_lock);
 		do_each_thread(g, p) {
-			if (frozen(p) || !freezable(p))
-				continue;
-
-			if (!freeze_task(p, sig_only))
+			if (p == current || !freeze_task(p, sig_only))
 				continue;
 
 			/*
@@ -181,9 +170,6 @@ void thaw_processes(void)
 
 	read_lock(&tasklist_lock);
 	do_each_thread(g, p) {
-		if (!freezable(p))
-			continue;
-
 		if (cgroup_freezing_or_frozen(p))
 			continue;
 
-- 
cgit v1.2.3-71-gd317


From 376fede80e74d98b49d1ba9ac18f23c9fd026ddd Mon Sep 17 00:00:00 2001
From: Tejun Heo <tj@kernel.org>
Date: Mon, 21 Nov 2011 12:32:24 -0800
Subject: freezer: kill PF_FREEZING

With the previous changes, there's no meaningful difference between
PF_FREEZING and PF_FROZEN.  Remove PF_FREEZING and use PF_FROZEN
instead in task_contributes_to_load().

Signed-off-by: Tejun Heo <tj@kernel.org>
---
 include/linux/sched.h | 3 +--
 kernel/freezer.c      | 6 ------
 2 files changed, 1 insertion(+), 8 deletions(-)

(limited to 'kernel')

diff --git a/include/linux/sched.h b/include/linux/sched.h
index 68daf4f27e2c..d12bd03b688f 100644
--- a/include/linux/sched.h
+++ b/include/linux/sched.h
@@ -220,7 +220,7 @@ extern char ___assert_task_state[1 - 2*!!(
 			((task->state & (__TASK_STOPPED | __TASK_TRACED)) != 0)
 #define task_contributes_to_load(task)	\
 				((task->state & TASK_UNINTERRUPTIBLE) != 0 && \
-				 (task->flags & PF_FREEZING) == 0)
+				 (task->flags & PF_FROZEN) == 0)
 
 #define __set_task_state(tsk, state_value)		\
 	do { (tsk)->state = (state_value); } while (0)
@@ -1773,7 +1773,6 @@ extern void thread_group_times(struct task_struct *p, cputime_t *ut, cputime_t *
 #define PF_MEMALLOC	0x00000800	/* Allocating memory */
 #define PF_NPROC_EXCEEDED 0x00001000	/* set_user noticed that RLIMIT_NPROC was exceeded */
 #define PF_USED_MATH	0x00002000	/* if unset the fpu must be initialized before use */
-#define PF_FREEZING	0x00004000	/* freeze in progress. do not account to load */
 #define PF_NOFREEZE	0x00008000	/* this thread should not be frozen */
 #define PF_FROZEN	0x00010000	/* frozen for system suspend */
 #define PF_FSTRANS	0x00020000	/* inside a filesystem transaction */
diff --git a/kernel/freezer.c b/kernel/freezer.c
index a257ecd37c48..b8b562124ba9 100644
--- a/kernel/freezer.c
+++ b/kernel/freezer.c
@@ -44,9 +44,6 @@ repeat:
 	recalc_sigpending(); /* We sent fake signal, clean it up */
 	spin_unlock_irq(&current->sighand->siglock);
 
-	/* prevent accounting of that task to load */
-	current->flags |= PF_FREEZING;
-
 	for (;;) {
 		set_current_state(TASK_UNINTERRUPTIBLE);
 		if (!freezing(current) ||
@@ -56,9 +53,6 @@ repeat:
 		schedule();
 	}
 
-	/* Remove the accounting blocker */
-	current->flags &= ~PF_FREEZING;
-
 	/* leave FROZEN */
 	spin_lock_irq(&freezer_lock);
 	if (freezing(current))
-- 
cgit v1.2.3-71-gd317


From 03afed8bc296fa70186ba832c1126228bb992465 Mon Sep 17 00:00:00 2001
From: Tejun Heo <tj@kernel.org>
Date: Mon, 21 Nov 2011 12:32:24 -0800
Subject: freezer: clean up freeze_processes() failure path

freeze_processes() failure path is rather messy.  Freezing is canceled
for workqueues and tasks which aren't frozen yet but frozen tasks are
left alone and should be thawed by the caller and of course some
callers (xen and kexec) didn't do it.

This patch updates __thaw_task() to handle cancelation correctly and
makes freeze_processes() and freeze_kernel_threads() call
thaw_processes() on failure instead so that the system is fully thawed
on failure.  Unnecessary [suspend_]thaw_processes() calls are removed
from kernel/power/hibernate.c, suspend.c and user.c.

While at it, restructure error checking if clause in suspend_prepare()
to be less weird.

-v2: Srivatsa spotted missing removal of suspend_thaw_processes() in
     suspend_prepare() and error in commit message.  Updated.

Signed-off-by: Tejun Heo <tj@kernel.org>
Acked-by: Srivatsa S. Bhat <srivatsa.bhat@linux.vnet.ibm.com>
---
 include/linux/freezer.h  |  1 -
 kernel/freezer.c         | 25 +++++++++----------------
 kernel/power/hibernate.c | 15 ++-------------
 kernel/power/process.c   | 16 ++++++++--------
 kernel/power/suspend.c   |  8 +++-----
 kernel/power/user.c      |  4 +---
 6 files changed, 23 insertions(+), 46 deletions(-)

(limited to 'kernel')

diff --git a/include/linux/freezer.h b/include/linux/freezer.h
index ba4f512d2938..93f411a52872 100644
--- a/include/linux/freezer.h
+++ b/include/linux/freezer.h
@@ -61,7 +61,6 @@ static inline bool try_to_freeze(void)
 }
 
 extern bool freeze_task(struct task_struct *p, bool sig_only);
-extern void cancel_freezing(struct task_struct *p);
 
 #ifdef CONFIG_CGROUP_FREEZER
 extern int cgroup_freezing_or_frozen(struct task_struct *task);
diff --git a/kernel/freezer.c b/kernel/freezer.c
index b8b562124ba9..11e32d419dec 100644
--- a/kernel/freezer.c
+++ b/kernel/freezer.c
@@ -129,21 +129,6 @@ out_unlock:
 	return ret;
 }
 
-void cancel_freezing(struct task_struct *p)
-{
-	unsigned long flags;
-
-	spin_lock_irqsave(&freezer_lock, flags);
-	if (freezing(p)) {
-		pr_debug("  clean up: %s\n", p->comm);
-		clear_freeze_flag(p);
-		spin_lock(&p->sighand->siglock);
-		recalc_sigpending_and_wake(p);
-		spin_unlock(&p->sighand->siglock);
-	}
-	spin_unlock_irqrestore(&freezer_lock, flags);
-}
-
 void __thaw_task(struct task_struct *p)
 {
 	unsigned long flags;
@@ -153,10 +138,18 @@ void __thaw_task(struct task_struct *p)
 	 * be visible to @p as waking up implies wmb.  Waking up inside
 	 * freezer_lock also prevents wakeups from leaking outside
 	 * refrigerator.
+	 *
+	 * If !FROZEN, @p hasn't reached refrigerator, recalc sigpending to
+	 * avoid leaving dangling TIF_SIGPENDING behind.
 	 */
 	spin_lock_irqsave(&freezer_lock, flags);
 	clear_freeze_flag(p);
-	if (frozen(p))
+	if (frozen(p)) {
 		wake_up_process(p);
+	} else {
+		spin_lock(&p->sighand->siglock);
+		recalc_sigpending_and_wake(p);
+		spin_unlock(&p->sighand->siglock);
+	}
 	spin_unlock_irqrestore(&freezer_lock, flags);
 }
diff --git a/kernel/power/hibernate.c b/kernel/power/hibernate.c
index 196c01268ebd..ba2319ffc860 100644
--- a/kernel/power/hibernate.c
+++ b/kernel/power/hibernate.c
@@ -607,17 +607,6 @@ static void power_down(void)
 	while(1);
 }
 
-static int prepare_processes(void)
-{
-	int error = 0;
-
-	if (freeze_processes()) {
-		error = -EBUSY;
-		thaw_processes();
-	}
-	return error;
-}
-
 /**
  * hibernate - Carry out system hibernation, including saving the image.
  */
@@ -650,7 +639,7 @@ int hibernate(void)
 	sys_sync();
 	printk("done.\n");
 
-	error = prepare_processes();
+	error = freeze_processes();
 	if (error)
 		goto Finish;
 
@@ -811,7 +800,7 @@ static int software_resume(void)
 		goto close_finish;
 
 	pr_debug("PM: Preparing processes for restore.\n");
-	error = prepare_processes();
+	error = freeze_processes();
 	if (error) {
 		swsusp_close(FMODE_READ);
 		goto Done;
diff --git a/kernel/power/process.c b/kernel/power/process.c
index e59676f5811d..ce643838a00c 100644
--- a/kernel/power/process.c
+++ b/kernel/power/process.c
@@ -91,11 +91,6 @@ static int try_to_freeze_tasks(bool sig_only)
 	elapsed_csecs = elapsed_csecs64;
 
 	if (todo) {
-		/* This does not unfreeze processes that are already frozen
-		 * (we have slightly ugly calling convention in that respect,
-		 * and caller must call thaw_processes() if something fails),
-		 * but it cleans up leftover PF_FREEZE requests.
-		 */
 		printk("\n");
 		printk(KERN_ERR "Freezing of tasks %s after %d.%02d seconds "
 		       "(%d tasks refusing to freeze, wq_busy=%d):\n",
@@ -103,14 +98,11 @@ static int try_to_freeze_tasks(bool sig_only)
 		       elapsed_csecs / 100, elapsed_csecs % 100,
 		       todo - wq_busy, wq_busy);
 
-		thaw_workqueues();
-
 		read_lock(&tasklist_lock);
 		do_each_thread(g, p) {
 			if (!wakeup && !freezer_should_skip(p) &&
 			    freezing(p) && !frozen(p))
 				sched_show_task(p);
-			cancel_freezing(p);
 		} while_each_thread(g, p);
 		read_unlock(&tasklist_lock);
 	} else {
@@ -123,6 +115,8 @@ static int try_to_freeze_tasks(bool sig_only)
 
 /**
  * freeze_processes - Signal user space processes to enter the refrigerator.
+ *
+ * On success, returns 0.  On failure, -errno and system is fully thawed.
  */
 int freeze_processes(void)
 {
@@ -137,11 +131,15 @@ int freeze_processes(void)
 	printk("\n");
 	BUG_ON(in_atomic());
 
+	if (error)
+		thaw_processes();
 	return error;
 }
 
 /**
  * freeze_kernel_threads - Make freezable kernel threads go to the refrigerator.
+ *
+ * On success, returns 0.  On failure, -errno and system is fully thawed.
  */
 int freeze_kernel_threads(void)
 {
@@ -155,6 +153,8 @@ int freeze_kernel_threads(void)
 	printk("\n");
 	BUG_ON(in_atomic());
 
+	if (error)
+		thaw_processes();
 	return error;
 }
 
diff --git a/kernel/power/suspend.c b/kernel/power/suspend.c
index 4953dc054c53..d336b27d1104 100644
--- a/kernel/power/suspend.c
+++ b/kernel/power/suspend.c
@@ -106,13 +106,11 @@ static int suspend_prepare(void)
 		goto Finish;
 
 	error = suspend_freeze_processes();
-	if (error) {
-		suspend_stats.failed_freeze++;
-		dpm_save_failed_step(SUSPEND_FREEZE);
-	} else
+	if (!error)
 		return 0;
 
-	suspend_thaw_processes();
+	suspend_stats.failed_freeze++;
+	dpm_save_failed_step(SUSPEND_FREEZE);
 	usermodehelper_enable();
  Finish:
 	pm_notifier_call_chain(PM_POST_SUSPEND);
diff --git a/kernel/power/user.c b/kernel/power/user.c
index 6d8f535c2b88..7cc3f5bc5c24 100644
--- a/kernel/power/user.c
+++ b/kernel/power/user.c
@@ -257,10 +257,8 @@ static long snapshot_ioctl(struct file *filp, unsigned int cmd,
 			break;
 
 		error = freeze_processes();
-		if (error) {
-			thaw_processes();
+		if (error)
 			usermodehelper_enable();
-		}
 		if (!error)
 			data->frozen = 1;
 		break;
-- 
cgit v1.2.3-71-gd317


From 22b4e111fa01a1147aa562ceaf18a752a928ef4e Mon Sep 17 00:00:00 2001
From: Tejun Heo <tj@kernel.org>
Date: Mon, 21 Nov 2011 12:32:25 -0800
Subject: cgroup_freezer: prepare for removal of TIF_FREEZE

TIF_FREEZE will be removed soon and freezing() will directly test
whether any freezing condition is in effect.  Make the following
changes in preparation.

* Rename cgroup_freezing_or_frozen() to cgroup_freezing() and make it
  return bool.

* Make cgroup_freezing() access task_freezer() under rcu read lock
  instead of task_lock().  This makes the state dereferencing racy
  against task moving to another cgroup; however, it was already racy
  without this change as ->state dereference wasn't synchronized.
  This will be later dealt with using attach hooks.

* freezer->state is now set before trying to push tasks into the
  target state.

-v2: Oleg pointed out that freeze_change_state() was setting
     freeze->state incorrectly to CGROUP_FROZEN instead of
     CGROUP_FREEZING.  Fixed.

-v3: Matt pointed out that setting CGROUP_FROZEN used to always invoke
     try_to_freeze_cgroup() regardless of the current state.  Patch
     updated such that the actual freeze/thaw operations are always
     performed on invocation.  This shouldn't make any difference
     unless something is broken.

Signed-off-by: Tejun Heo <tj@kernel.org>
Acked-by: Paul Menage <paul@paulmenage.org>
Cc: Li Zefan <lizf@cn.fujitsu.com>
Cc: Oleg Nesterov <oleg@redhat.com>
---
 include/linux/freezer.h |  6 +++---
 kernel/cgroup_freezer.c | 40 +++++++++++++---------------------------
 kernel/power/process.c  |  2 +-
 3 files changed, 17 insertions(+), 31 deletions(-)

(limited to 'kernel')

diff --git a/include/linux/freezer.h b/include/linux/freezer.h
index 93f411a52872..b2b4abc5a739 100644
--- a/include/linux/freezer.h
+++ b/include/linux/freezer.h
@@ -63,11 +63,11 @@ static inline bool try_to_freeze(void)
 extern bool freeze_task(struct task_struct *p, bool sig_only);
 
 #ifdef CONFIG_CGROUP_FREEZER
-extern int cgroup_freezing_or_frozen(struct task_struct *task);
+extern bool cgroup_freezing(struct task_struct *task);
 #else /* !CONFIG_CGROUP_FREEZER */
-static inline int cgroup_freezing_or_frozen(struct task_struct *task)
+static inline bool cgroup_freezing(struct task_struct *task)
 {
-	return 0;
+	return false;
 }
 #endif /* !CONFIG_CGROUP_FREEZER */
 
diff --git a/kernel/cgroup_freezer.c b/kernel/cgroup_freezer.c
index cd27b0825560..e6a1b8d1b8bc 100644
--- a/kernel/cgroup_freezer.c
+++ b/kernel/cgroup_freezer.c
@@ -48,19 +48,17 @@ static inline struct freezer *task_freezer(struct task_struct *task)
 			    struct freezer, css);
 }
 
-static inline int __cgroup_freezing_or_frozen(struct task_struct *task)
+bool cgroup_freezing(struct task_struct *task)
 {
-	enum freezer_state state = task_freezer(task)->state;
-	return (state == CGROUP_FREEZING) || (state == CGROUP_FROZEN);
-}
+	enum freezer_state state;
+	bool ret;
 
-int cgroup_freezing_or_frozen(struct task_struct *task)
-{
-	int result;
-	task_lock(task);
-	result = __cgroup_freezing_or_frozen(task);
-	task_unlock(task);
-	return result;
+	rcu_read_lock();
+	state = task_freezer(task)->state;
+	ret = state == CGROUP_FREEZING || state == CGROUP_FROZEN;
+	rcu_read_unlock();
+
+	return ret;
 }
 
 /*
@@ -102,9 +100,6 @@ struct cgroup_subsys freezer_subsys;
  * freezer_can_attach():
  * cgroup_mutex (held by caller of can_attach)
  *
- * cgroup_freezing_or_frozen():
- * task->alloc_lock (to get task's cgroup)
- *
  * freezer_fork() (preserving fork() performance means can't take cgroup_mutex):
  * freezer->lock
  *  sighand->siglock (if the cgroup is freezing)
@@ -177,13 +172,7 @@ static int freezer_can_attach(struct cgroup_subsys *ss,
 
 static int freezer_can_attach_task(struct cgroup *cgrp, struct task_struct *tsk)
 {
-	rcu_read_lock();
-	if (__cgroup_freezing_or_frozen(tsk)) {
-		rcu_read_unlock();
-		return -EBUSY;
-	}
-	rcu_read_unlock();
-	return 0;
+	return cgroup_freezing(tsk) ? -EBUSY : 0;
 }
 
 static void freezer_fork(struct cgroup_subsys *ss, struct task_struct *task)
@@ -279,7 +268,6 @@ static int try_to_freeze_cgroup(struct cgroup *cgroup, struct freezer *freezer)
 	struct task_struct *task;
 	unsigned int num_cant_freeze_now = 0;
 
-	freezer->state = CGROUP_FREEZING;
 	cgroup_iter_start(cgroup, &it);
 	while ((task = cgroup_iter_next(cgroup, &it))) {
 		if (!freeze_task(task, true))
@@ -303,8 +291,6 @@ static void unfreeze_cgroup(struct cgroup *cgroup, struct freezer *freezer)
 	while ((task = cgroup_iter_next(cgroup, &it)))
 		__thaw_task(task);
 	cgroup_iter_end(cgroup, &it);
-
-	freezer->state = CGROUP_THAWED;
 }
 
 static int freezer_change_state(struct cgroup *cgroup,
@@ -318,20 +304,20 @@ static int freezer_change_state(struct cgroup *cgroup,
 	spin_lock_irq(&freezer->lock);
 
 	update_if_frozen(cgroup, freezer);
-	if (goal_state == freezer->state)
-		goto out;
 
 	switch (goal_state) {
 	case CGROUP_THAWED:
+		freezer->state = CGROUP_THAWED;
 		unfreeze_cgroup(cgroup, freezer);
 		break;
 	case CGROUP_FROZEN:
+		freezer->state = CGROUP_FREEZING;
 		retval = try_to_freeze_cgroup(cgroup, freezer);
 		break;
 	default:
 		BUG();
 	}
-out:
+
 	spin_unlock_irq(&freezer->lock);
 
 	return retval;
diff --git a/kernel/power/process.c b/kernel/power/process.c
index ce643838a00c..9f6f5c755cfa 100644
--- a/kernel/power/process.c
+++ b/kernel/power/process.c
@@ -170,7 +170,7 @@ void thaw_processes(void)
 
 	read_lock(&tasklist_lock);
 	do_each_thread(g, p) {
-		if (cgroup_freezing_or_frozen(p))
+		if (cgroup_freezing(p))
 			continue;
 
 		__thaw_task(p);
-- 
cgit v1.2.3-71-gd317


From a3201227f803ad7fd43180c5195dbe5a2bf998aa Mon Sep 17 00:00:00 2001
From: Tejun Heo <tj@kernel.org>
Date: Mon, 21 Nov 2011 12:32:25 -0800
Subject: freezer: make freezing() test freeze conditions in effect instead of
 TIF_FREEZE

Using TIF_FREEZE for freezing worked when there was only single
freezing condition (the PM one); however, now there is also the
cgroup_freezer and single bit flag is getting clumsy.
thaw_processes() is already testing whether cgroup freezing in in
effect to avoid thawing tasks which were frozen by both PM and cgroup
freezers.

This is racy (nothing prevents race against cgroup freezing) and
fragile.  A much simpler way is to test actual freeze conditions from
freezing() - ie. directly test whether PM or cgroup freezing is in
effect.

This patch adds variables to indicate whether and what type of
freezing conditions are in effect and reimplements freezing() such
that it directly tests whether any of the two freezing conditions is
active and the task should freeze.  On fast path, freezing() is still
very cheap - it only tests system_freezing_cnt.

This makes the clumsy dancing aroung TIF_FREEZE unnecessary and
freeze/thaw operations more usual - updating state variables for the
new state and nudging target tasks so that they notice the new state
and comply.  As long as the nudging happens after state update, it's
race-free.

* This allows use of freezing() in freeze_task().  Replace the open
  coded tests with freezing().

* p != current test is added to warning printing conditions in
  try_to_freeze_tasks() failure path.  This is necessary as freezing()
  is now true for the task which initiated freezing too.

-v2: Oleg pointed out that re-freezing FROZEN cgroup could increment
     system_freezing_cnt.  Fixed.

Signed-off-by: Tejun Heo <tj@kernel.org>
Acked-by: Paul Menage <paul@paulmenage.org>  (for the cgroup portions)
---
 include/linux/freezer.h | 33 ++++++++++----------------
 kernel/cgroup_freezer.c | 10 +++++++-
 kernel/fork.c           |  1 -
 kernel/freezer.c        | 62 +++++++++++++++++++++++++++++++------------------
 kernel/power/process.c  | 15 ++++++++----
 5 files changed, 72 insertions(+), 49 deletions(-)

(limited to 'kernel')

diff --git a/include/linux/freezer.h b/include/linux/freezer.h
index b2b4abc5a739..8e29f2b7ce11 100644
--- a/include/linux/freezer.h
+++ b/include/linux/freezer.h
@@ -5,8 +5,13 @@
 
 #include <linux/sched.h>
 #include <linux/wait.h>
+#include <linux/atomic.h>
 
 #ifdef CONFIG_FREEZER
+extern atomic_t system_freezing_cnt;	/* nr of freezing conds in effect */
+extern bool pm_freezing;		/* PM freezing in effect */
+extern bool pm_nosig_freezing;		/* PM nosig freezing in effect */
+
 /*
  * Check if a process has been frozen
  */
@@ -15,28 +20,16 @@ static inline int frozen(struct task_struct *p)
 	return p->flags & PF_FROZEN;
 }
 
-/*
- * Check if there is a request to freeze a process
- */
-static inline int freezing(struct task_struct *p)
-{
-	return test_tsk_thread_flag(p, TIF_FREEZE);
-}
+extern bool freezing_slow_path(struct task_struct *p);
 
 /*
- * Request that a process be frozen
- */
-static inline void set_freeze_flag(struct task_struct *p)
-{
-	set_tsk_thread_flag(p, TIF_FREEZE);
-}
-
-/*
- * Sometimes we may need to cancel the previous 'freeze' request
+ * Check if there is a request to freeze a process
  */
-static inline void clear_freeze_flag(struct task_struct *p)
+static inline bool freezing(struct task_struct *p)
 {
-	clear_tsk_thread_flag(p, TIF_FREEZE);
+	if (likely(!atomic_read(&system_freezing_cnt)))
+		return false;
+	return freezing_slow_path(p);
 }
 
 static inline bool should_send_signal(struct task_struct *p)
@@ -174,9 +167,7 @@ static inline void set_freezable_with_signal(void)
 })
 #else /* !CONFIG_FREEZER */
 static inline int frozen(struct task_struct *p) { return 0; }
-static inline int freezing(struct task_struct *p) { return 0; }
-static inline void set_freeze_flag(struct task_struct *p) {}
-static inline void clear_freeze_flag(struct task_struct *p) {}
+static inline bool freezing(struct task_struct *p) { return false; }
 
 static inline bool __refrigerator(bool check_kthr_stop) { return false; }
 static inline int freeze_processes(void) { return -ENOSYS; }
diff --git a/kernel/cgroup_freezer.c b/kernel/cgroup_freezer.c
index e6a1b8d1b8bc..2327ad11725f 100644
--- a/kernel/cgroup_freezer.c
+++ b/kernel/cgroup_freezer.c
@@ -145,7 +145,11 @@ static struct cgroup_subsys_state *freezer_create(struct cgroup_subsys *ss,
 static void freezer_destroy(struct cgroup_subsys *ss,
 			    struct cgroup *cgroup)
 {
-	kfree(cgroup_freezer(cgroup));
+	struct freezer *freezer = cgroup_freezer(cgroup);
+
+	if (freezer->state != CGROUP_THAWED)
+		atomic_dec(&system_freezing_cnt);
+	kfree(freezer);
 }
 
 /*
@@ -307,10 +311,14 @@ static int freezer_change_state(struct cgroup *cgroup,
 
 	switch (goal_state) {
 	case CGROUP_THAWED:
+		if (freezer->state != CGROUP_THAWED)
+			atomic_dec(&system_freezing_cnt);
 		freezer->state = CGROUP_THAWED;
 		unfreeze_cgroup(cgroup, freezer);
 		break;
 	case CGROUP_FROZEN:
+		if (freezer->state == CGROUP_THAWED)
+			atomic_inc(&system_freezing_cnt);
 		freezer->state = CGROUP_FREEZING;
 		retval = try_to_freeze_cgroup(cgroup, freezer);
 		break;
diff --git a/kernel/fork.c b/kernel/fork.c
index ba0d17261329..d53316e88d9d 100644
--- a/kernel/fork.c
+++ b/kernel/fork.c
@@ -997,7 +997,6 @@ static void copy_flags(unsigned long clone_flags, struct task_struct *p)
 	new_flags |= PF_FORKNOEXEC;
 	new_flags |= PF_STARTING;
 	p->flags = new_flags;
-	clear_freeze_flag(p);
 }
 
 SYSCALL_DEFINE1(set_tid_address, int __user *, tidptr)
diff --git a/kernel/freezer.c b/kernel/freezer.c
index 11e32d419dec..f53cd5aa5b2e 100644
--- a/kernel/freezer.c
+++ b/kernel/freezer.c
@@ -11,9 +11,41 @@
 #include <linux/freezer.h>
 #include <linux/kthread.h>
 
+/* total number of freezing conditions in effect */
+atomic_t system_freezing_cnt = ATOMIC_INIT(0);
+EXPORT_SYMBOL(system_freezing_cnt);
+
+/* indicate whether PM freezing is in effect, protected by pm_mutex */
+bool pm_freezing;
+bool pm_nosig_freezing;
+
 /* protects freezing and frozen transitions */
 static DEFINE_SPINLOCK(freezer_lock);
 
+/**
+ * freezing_slow_path - slow path for testing whether a task needs to be frozen
+ * @p: task to be tested
+ *
+ * This function is called by freezing() if system_freezing_cnt isn't zero
+ * and tests whether @p needs to enter and stay in frozen state.  Can be
+ * called under any context.  The freezers are responsible for ensuring the
+ * target tasks see the updated state.
+ */
+bool freezing_slow_path(struct task_struct *p)
+{
+	if (p->flags & PF_NOFREEZE)
+		return false;
+
+	if (pm_nosig_freezing || cgroup_freezing(p))
+		return true;
+
+	if (pm_freezing && !(p->flags & PF_FREEZER_NOSIG))
+		return true;
+
+	return false;
+}
+EXPORT_SYMBOL(freezing_slow_path);
+
 /* Refrigerator is place where frozen processes are stored :-). */
 bool __refrigerator(bool check_kthr_stop)
 {
@@ -23,17 +55,11 @@ bool __refrigerator(bool check_kthr_stop)
 	long save;
 
 	/*
-	 * Enter FROZEN.  If NOFREEZE, schedule immediate thawing by
-	 * clearing freezing.
+	 * No point in checking freezing() again - the caller already did.
+	 * Proceed to enter FROZEN.
 	 */
 	spin_lock_irq(&freezer_lock);
 repeat:
-	if (!freezing(current)) {
-		spin_unlock_irq(&freezer_lock);
-		return was_frozen;
-	}
-	if (current->flags & PF_NOFREEZE)
-		clear_freeze_flag(current);
 	current->flags |= PF_FROZEN;
 	spin_unlock_irq(&freezer_lock);
 
@@ -99,18 +125,12 @@ static void fake_signal_wake_up(struct task_struct *p)
 bool freeze_task(struct task_struct *p, bool sig_only)
 {
 	unsigned long flags;
-	bool ret = false;
 
 	spin_lock_irqsave(&freezer_lock, flags);
-
-	if ((p->flags & PF_NOFREEZE) ||
-	    (sig_only && !should_send_signal(p)))
-		goto out_unlock;
-
-	if (frozen(p))
-		goto out_unlock;
-
-	set_freeze_flag(p);
+	if (!freezing(p) || frozen(p)) {
+		spin_unlock_irqrestore(&freezer_lock, flags);
+		return false;
+	}
 
 	if (should_send_signal(p)) {
 		fake_signal_wake_up(p);
@@ -123,10 +143,9 @@ bool freeze_task(struct task_struct *p, bool sig_only)
 	} else {
 		wake_up_state(p, TASK_INTERRUPTIBLE);
 	}
-	ret = true;
-out_unlock:
+
 	spin_unlock_irqrestore(&freezer_lock, flags);
-	return ret;
+	return true;
 }
 
 void __thaw_task(struct task_struct *p)
@@ -143,7 +162,6 @@ void __thaw_task(struct task_struct *p)
 	 * avoid leaving dangling TIF_SIGPENDING behind.
 	 */
 	spin_lock_irqsave(&freezer_lock, flags);
-	clear_freeze_flag(p);
 	if (frozen(p)) {
 		wake_up_process(p);
 	} else {
diff --git a/kernel/power/process.c b/kernel/power/process.c
index 9f6f5c755cfa..0beb51e1dec9 100644
--- a/kernel/power/process.c
+++ b/kernel/power/process.c
@@ -101,7 +101,7 @@ static int try_to_freeze_tasks(bool sig_only)
 		read_lock(&tasklist_lock);
 		do_each_thread(g, p) {
 			if (!wakeup && !freezer_should_skip(p) &&
-			    freezing(p) && !frozen(p))
+			    p != current && freezing(p) && !frozen(p))
 				sched_show_task(p);
 		} while_each_thread(g, p);
 		read_unlock(&tasklist_lock);
@@ -122,7 +122,11 @@ int freeze_processes(void)
 {
 	int error;
 
+	if (!pm_freezing)
+		atomic_inc(&system_freezing_cnt);
+
 	printk("Freezing user space processes ... ");
+	pm_freezing = true;
 	error = try_to_freeze_tasks(true);
 	if (!error) {
 		printk("done.");
@@ -146,6 +150,7 @@ int freeze_kernel_threads(void)
 	int error;
 
 	printk("Freezing remaining freezable tasks ... ");
+	pm_nosig_freezing = true;
 	error = try_to_freeze_tasks(false);
 	if (!error)
 		printk("done.");
@@ -162,6 +167,11 @@ void thaw_processes(void)
 {
 	struct task_struct *g, *p;
 
+	if (pm_freezing)
+		atomic_dec(&system_freezing_cnt);
+	pm_freezing = false;
+	pm_nosig_freezing = false;
+
 	oom_killer_enable();
 
 	printk("Restarting tasks ... ");
@@ -170,9 +180,6 @@ void thaw_processes(void)
 
 	read_lock(&tasklist_lock);
 	do_each_thread(g, p) {
-		if (cgroup_freezing(p))
-			continue;
-
 		__thaw_task(p);
 	} while_each_thread(g, p);
 	read_unlock(&tasklist_lock);
-- 
cgit v1.2.3-71-gd317


From 948246f70a811c872b9d93bb4a8ab5823c4c79e0 Mon Sep 17 00:00:00 2001
From: Tejun Heo <tj@kernel.org>
Date: Mon, 21 Nov 2011 12:32:25 -0800
Subject: freezer: remove should_send_signal() and update frozen()

should_send_signal() is only used in freezer.c.  Exporting them only
increases chance of abuse.  Open code the two users and remove it.

Update frozen() to return bool.

Signed-off-by: Tejun Heo <tj@kernel.org>
---
 include/linux/freezer.h | 9 ++-------
 kernel/freezer.c        | 2 +-
 2 files changed, 3 insertions(+), 8 deletions(-)

(limited to 'kernel')

diff --git a/include/linux/freezer.h b/include/linux/freezer.h
index 8e29f2b7ce11..3d50913d39d0 100644
--- a/include/linux/freezer.h
+++ b/include/linux/freezer.h
@@ -15,7 +15,7 @@ extern bool pm_nosig_freezing;		/* PM nosig freezing in effect */
 /*
  * Check if a process has been frozen
  */
-static inline int frozen(struct task_struct *p)
+static inline bool frozen(struct task_struct *p)
 {
 	return p->flags & PF_FROZEN;
 }
@@ -32,11 +32,6 @@ static inline bool freezing(struct task_struct *p)
 	return freezing_slow_path(p);
 }
 
-static inline bool should_send_signal(struct task_struct *p)
-{
-	return !(p->flags & PF_FREEZER_NOSIG);
-}
-
 /* Takes and releases task alloc lock using task_lock() */
 extern void __thaw_task(struct task_struct *t);
 
@@ -166,7 +161,7 @@ static inline void set_freezable_with_signal(void)
 	__retval;							\
 })
 #else /* !CONFIG_FREEZER */
-static inline int frozen(struct task_struct *p) { return 0; }
+static inline bool frozen(struct task_struct *p) { return false; }
 static inline bool freezing(struct task_struct *p) { return false; }
 
 static inline bool __refrigerator(bool check_kthr_stop) { return false; }
diff --git a/kernel/freezer.c b/kernel/freezer.c
index f53cd5aa5b2e..95a123844241 100644
--- a/kernel/freezer.c
+++ b/kernel/freezer.c
@@ -132,7 +132,7 @@ bool freeze_task(struct task_struct *p, bool sig_only)
 		return false;
 	}
 
-	if (should_send_signal(p)) {
+	if (!(p->flags & PF_FREEZER_NOSIG)) {
 		fake_signal_wake_up(p);
 		/*
 		 * fake_signal_wake_up() goes through p's scheduler
-- 
cgit v1.2.3-71-gd317


From 96ee6d8539c9fc6742908d85eb9723abb5c91854 Mon Sep 17 00:00:00 2001
From: Tejun Heo <tj@kernel.org>
Date: Mon, 21 Nov 2011 12:32:25 -0800
Subject: freezer: fix set_freezable[_with_signal]() race

A kthread doing set_freezable*() may race with on-going PM freeze and
the freezer might think all tasks are frozen while the new freezable
kthread is merrily proceeding to execute code paths which aren't
supposed to be executing during PM freeze.

Reimplement set_freezable[_with_signal]() using __set_freezable() such
that freezable PF flags are modified under freezer_lock and
try_to_freeze() is called afterwards.  This eliminates race condition
against freezing.

Note: Separated out from larger patch to resolve fix order dependency
      Oleg pointed out.

Signed-off-by: Tejun Heo <tj@kernel.org>
Cc: Oleg Nesterov <oleg@redhat.com>
---
 include/linux/freezer.h |  9 +++++----
 kernel/freezer.c        | 25 +++++++++++++++++++++++++
 2 files changed, 30 insertions(+), 4 deletions(-)

(limited to 'kernel')

diff --git a/include/linux/freezer.h b/include/linux/freezer.h
index 3d50913d39d0..a0f1b3a3604f 100644
--- a/include/linux/freezer.h
+++ b/include/linux/freezer.h
@@ -49,6 +49,7 @@ static inline bool try_to_freeze(void)
 }
 
 extern bool freeze_task(struct task_struct *p, bool sig_only);
+extern bool __set_freezable(bool with_signal);
 
 #ifdef CONFIG_CGROUP_FREEZER
 extern bool cgroup_freezing(struct task_struct *task);
@@ -106,18 +107,18 @@ static inline int freezer_should_skip(struct task_struct *p)
 /*
  * Tell the freezer that the current task should be frozen by it
  */
-static inline void set_freezable(void)
+static inline bool set_freezable(void)
 {
-	current->flags &= ~PF_NOFREEZE;
+	return __set_freezable(false);
 }
 
 /*
  * Tell the freezer that the current task should be frozen by it and that it
  * should send a fake signal to the task to freeze it.
  */
-static inline void set_freezable_with_signal(void)
+static inline bool set_freezable_with_signal(void)
 {
-	current->flags &= ~(PF_NOFREEZE | PF_FREEZER_NOSIG);
+	return __set_freezable(true);
 }
 
 /*
diff --git a/kernel/freezer.c b/kernel/freezer.c
index 95a123844241..b1e7a7b3d2cd 100644
--- a/kernel/freezer.c
+++ b/kernel/freezer.c
@@ -171,3 +171,28 @@ void __thaw_task(struct task_struct *p)
 	}
 	spin_unlock_irqrestore(&freezer_lock, flags);
 }
+
+/**
+ * __set_freezable - make %current freezable
+ * @with_signal: do we want %TIF_SIGPENDING for notification too?
+ *
+ * Mark %current freezable and enter refrigerator if necessary.
+ */
+bool __set_freezable(bool with_signal)
+{
+	might_sleep();
+
+	/*
+	 * Modify flags while holding freezer_lock.  This ensures the
+	 * freezer notices that we aren't frozen yet or the freezing
+	 * condition is visible to try_to_freeze() below.
+	 */
+	spin_lock_irq(&freezer_lock);
+	current->flags &= ~PF_NOFREEZE;
+	if (with_signal)
+		current->flags &= ~PF_FREEZER_NOSIG;
+	spin_unlock_irq(&freezer_lock);
+
+	return try_to_freeze();
+}
+EXPORT_SYMBOL(__set_freezable);
-- 
cgit v1.2.3-71-gd317


From 5ece3eae4cdb968c269e0bc7e2c0e2b223552025 Mon Sep 17 00:00:00 2001
From: Tejun Heo <tj@kernel.org>
Date: Mon, 21 Nov 2011 12:32:26 -0800
Subject: freezer: restructure __refrigerator()

If another freeze happens before all tasks leave FROZEN state after
being thawed, the freezer can see the existing FROZEN and consider the
tasks to be frozen but they can clear FROZEN without checking the new
freezing().

Oleg suggested restructuring __refrigerator() such that there's single
condition check section inside freezer_lock and sigpending is cleared
afterwards, which fixes the problem and simplifies the code.
Restructure accordingly.

-v2: Frozen loop exited without releasing freezer_lock.  Fixed.

Signed-off-by: Tejun Heo <tj@kernel.org>
Reported-by: Oleg Nesterov <oleg@redhat.com>
Acked-by: Oleg Nesterov <oleg@redhat.com>
Cc: "Rafael J. Wysocki" <rjw@sisk.pl>
---
 kernel/freezer.c | 32 +++++++++++---------------------
 1 file changed, 11 insertions(+), 21 deletions(-)

(limited to 'kernel')

diff --git a/kernel/freezer.c b/kernel/freezer.c
index b1e7a7b3d2cd..c3496c6a91b2 100644
--- a/kernel/freezer.c
+++ b/kernel/freezer.c
@@ -52,39 +52,29 @@ bool __refrigerator(bool check_kthr_stop)
 	/* Hmm, should we be allowed to suspend when there are realtime
 	   processes around? */
 	bool was_frozen = false;
-	long save;
+	long save = current->state;
 
-	/*
-	 * No point in checking freezing() again - the caller already did.
-	 * Proceed to enter FROZEN.
-	 */
-	spin_lock_irq(&freezer_lock);
-repeat:
-	current->flags |= PF_FROZEN;
-	spin_unlock_irq(&freezer_lock);
-
-	save = current->state;
 	pr_debug("%s entered refrigerator\n", current->comm);
 
-	spin_lock_irq(&current->sighand->siglock);
-	recalc_sigpending(); /* We sent fake signal, clean it up */
-	spin_unlock_irq(&current->sighand->siglock);
-
 	for (;;) {
 		set_current_state(TASK_UNINTERRUPTIBLE);
+
+		spin_lock_irq(&freezer_lock);
+		current->flags |= PF_FROZEN;
 		if (!freezing(current) ||
 		    (check_kthr_stop && kthread_should_stop()))
+			current->flags &= ~PF_FROZEN;
+		spin_unlock_irq(&freezer_lock);
+
+		if (!(current->flags & PF_FROZEN))
 			break;
 		was_frozen = true;
 		schedule();
 	}
 
-	/* leave FROZEN */
-	spin_lock_irq(&freezer_lock);
-	if (freezing(current))
-		goto repeat;
-	current->flags &= ~PF_FROZEN;
-	spin_unlock_irq(&freezer_lock);
+	spin_lock_irq(&current->sighand->siglock);
+	recalc_sigpending(); /* We sent fake signal, clean it up */
+	spin_unlock_irq(&current->sighand->siglock);
 
 	pr_debug("%s left refrigerator\n", current->comm);
 
-- 
cgit v1.2.3-71-gd317


From 37ad8aca94a1da2112a7c56151390914e80d1113 Mon Sep 17 00:00:00 2001
From: Tejun Heo <tj@kernel.org>
Date: Mon, 21 Nov 2011 12:32:26 -0800
Subject: freezer: use lock_task_sighand() in fake_signal_wake_up()

cgroup_freezer calls freeze_task() without holding tasklist_lock and,
if the task is exiting, its ->sighand may be gone by the time
fake_signal_wake_up() is called.  Use lock_task_sighand() instead of
accessing ->sighand directly.

Signed-off-by: Tejun Heo <tj@kernel.org>
Reported-by: Oleg Nesterov <oleg@redhat.com>
Acked-by: Oleg Nesterov <oleg@redhat.com>
Cc: "Rafael J. Wysocki" <rjw@sisk.pl>
Cc: Paul Menage <paul@paulmenage.org>
---
 kernel/freezer.c | 7 ++++---
 1 file changed, 4 insertions(+), 3 deletions(-)

(limited to 'kernel')

diff --git a/kernel/freezer.c b/kernel/freezer.c
index c3496c6a91b2..389549f0a94e 100644
--- a/kernel/freezer.c
+++ b/kernel/freezer.c
@@ -93,9 +93,10 @@ static void fake_signal_wake_up(struct task_struct *p)
 {
 	unsigned long flags;
 
-	spin_lock_irqsave(&p->sighand->siglock, flags);
-	signal_wake_up(p, 0);
-	spin_unlock_irqrestore(&p->sighand->siglock, flags);
+	if (lock_task_sighand(p, &flags)) {
+		signal_wake_up(p, 0);
+		unlock_task_sighand(p, &flags);
+	}
 }
 
 /**
-- 
cgit v1.2.3-71-gd317


From 839e3407d90a810318d17c17ceb3d5928a910704 Mon Sep 17 00:00:00 2001
From: Tejun Heo <tj@kernel.org>
Date: Mon, 21 Nov 2011 12:32:26 -0800
Subject: freezer: remove unused @sig_only from freeze_task()

After "freezer: make freezing() test freeze conditions in effect
instead of TIF_FREEZE", freezing() returns authoritative answer on
whether the current task should freeze or not and freeze_task()
doesn't need or use @sig_only.  Remove it.

While at it, rewrite function comment for freeze_task() and rename
@sig_only to @user_only in try_to_freeze_tasks().

This patch doesn't cause any functional change.

Signed-off-by: Tejun Heo <tj@kernel.org>
Acked-by: Oleg Nesterov <oleg@redhat.com>
---
 include/linux/freezer.h |  2 +-
 kernel/cgroup_freezer.c |  4 ++--
 kernel/freezer.c        | 21 +++++++++------------
 kernel/power/process.c  |  8 ++++----
 4 files changed, 16 insertions(+), 19 deletions(-)

(limited to 'kernel')

diff --git a/include/linux/freezer.h b/include/linux/freezer.h
index a0f1b3a3604f..a28842e588f4 100644
--- a/include/linux/freezer.h
+++ b/include/linux/freezer.h
@@ -48,7 +48,7 @@ static inline bool try_to_freeze(void)
 	return __refrigerator(false);
 }
 
-extern bool freeze_task(struct task_struct *p, bool sig_only);
+extern bool freeze_task(struct task_struct *p);
 extern bool __set_freezable(bool with_signal);
 
 #ifdef CONFIG_CGROUP_FREEZER
diff --git a/kernel/cgroup_freezer.c b/kernel/cgroup_freezer.c
index 2327ad11725f..e411a60cc2c8 100644
--- a/kernel/cgroup_freezer.c
+++ b/kernel/cgroup_freezer.c
@@ -206,7 +206,7 @@ static void freezer_fork(struct cgroup_subsys *ss, struct task_struct *task)
 
 	/* Locking avoids race with FREEZING -> THAWED transitions. */
 	if (freezer->state == CGROUP_FREEZING)
-		freeze_task(task, true);
+		freeze_task(task);
 	spin_unlock_irq(&freezer->lock);
 }
 
@@ -274,7 +274,7 @@ static int try_to_freeze_cgroup(struct cgroup *cgroup, struct freezer *freezer)
 
 	cgroup_iter_start(cgroup, &it);
 	while ((task = cgroup_iter_next(cgroup, &it))) {
-		if (!freeze_task(task, true))
+		if (!freeze_task(task))
 			continue;
 		if (frozen(task))
 			continue;
diff --git a/kernel/freezer.c b/kernel/freezer.c
index 389549f0a94e..2589a61de44c 100644
--- a/kernel/freezer.c
+++ b/kernel/freezer.c
@@ -100,20 +100,17 @@ static void fake_signal_wake_up(struct task_struct *p)
 }
 
 /**
- *	freeze_task - send a freeze request to given task
- *	@p: task to send the request to
- *	@sig_only: if set, the request will only be sent if the task has the
- *		PF_FREEZER_NOSIG flag unset
- *	Return value: 'false', if @sig_only is set and the task has
- *		PF_FREEZER_NOSIG set or the task is frozen, 'true', otherwise
+ * freeze_task - send a freeze request to given task
+ * @p: task to send the request to
  *
- *	The freeze request is sent by setting the tasks's TIF_FREEZE flag and
- *	either sending a fake signal to it or waking it up, depending on whether
- *	or not it has PF_FREEZER_NOSIG set.  If @sig_only is set and the task
- *	has PF_FREEZER_NOSIG set (ie. it is a typical kernel thread), its
- *	TIF_FREEZE flag will not be set.
+ * If @p is freezing, the freeze request is sent by setting %TIF_FREEZE
+ * flag and either sending a fake signal to it or waking it up, depending
+ * on whether it has %PF_FREEZER_NOSIG set.
+ *
+ * RETURNS:
+ * %false, if @p is not freezing or already frozen; %true, otherwise
  */
-bool freeze_task(struct task_struct *p, bool sig_only)
+bool freeze_task(struct task_struct *p)
 {
 	unsigned long flags;
 
diff --git a/kernel/power/process.c b/kernel/power/process.c
index 0beb51e1dec9..77274c9ba2f1 100644
--- a/kernel/power/process.c
+++ b/kernel/power/process.c
@@ -22,7 +22,7 @@
  */
 #define TIMEOUT	(20 * HZ)
 
-static int try_to_freeze_tasks(bool sig_only)
+static int try_to_freeze_tasks(bool user_only)
 {
 	struct task_struct *g, *p;
 	unsigned long end_time;
@@ -37,14 +37,14 @@ static int try_to_freeze_tasks(bool sig_only)
 
 	end_time = jiffies + TIMEOUT;
 
-	if (!sig_only)
+	if (!user_only)
 		freeze_workqueues_begin();
 
 	while (true) {
 		todo = 0;
 		read_lock(&tasklist_lock);
 		do_each_thread(g, p) {
-			if (p == current || !freeze_task(p, sig_only))
+			if (p == current || !freeze_task(p))
 				continue;
 
 			/*
@@ -65,7 +65,7 @@ static int try_to_freeze_tasks(bool sig_only)
 		} while_each_thread(g, p);
 		read_unlock(&tasklist_lock);
 
-		if (!sig_only) {
+		if (!user_only) {
 			wq_busy = freeze_workqueues_busy();
 			todo += wq_busy;
 		}
-- 
cgit v1.2.3-71-gd317


From 34b087e48367c252e343c2f8de65676a78af1e4a Mon Sep 17 00:00:00 2001
From: Tejun Heo <tj@kernel.org>
Date: Wed, 23 Nov 2011 09:28:17 -0800
Subject: freezer: kill unused set_freezable_with_signal()

There's no in-kernel user of set_freezable_with_signal() left.  Mixing
TIF_SIGPENDING with kernel threads can lead to nasty corner cases as
kernel threads never travel signal delivery path on their own.

e.g. the current implementation is buggy in the cancelation path of
__thaw_task().  It calls recalc_sigpending_and_wake() in an attempt to
clear TIF_SIGPENDING but the function never clears it regardless of
sigpending state.  This means that signallable freezable kthreads may
continue executing with !freezing() && stuck TIF_SIGPENDING, which can
be troublesome.

This patch removes set_freezable_with_signal() along with
PF_FREEZER_NOSIG and recalc_sigpending*() calls in freezer.  User
tasks get TIF_SIGPENDING, kernel tasks get woken up and the spurious
sigpending is dealt with in the usual signal delivery path.

Signed-off-by: Tejun Heo <tj@kernel.org>
Acked-by: Oleg Nesterov <oleg@redhat.com>
---
 include/linux/freezer.h | 20 +-------------------
 include/linux/sched.h   |  1 -
 kernel/freezer.c        | 27 ++++++---------------------
 kernel/kthread.c        |  2 +-
 4 files changed, 8 insertions(+), 42 deletions(-)

(limited to 'kernel')

diff --git a/include/linux/freezer.h b/include/linux/freezer.h
index a28842e588f4..a33550fc05c5 100644
--- a/include/linux/freezer.h
+++ b/include/linux/freezer.h
@@ -49,7 +49,7 @@ static inline bool try_to_freeze(void)
 }
 
 extern bool freeze_task(struct task_struct *p);
-extern bool __set_freezable(bool with_signal);
+extern bool set_freezable(void);
 
 #ifdef CONFIG_CGROUP_FREEZER
 extern bool cgroup_freezing(struct task_struct *task);
@@ -104,23 +104,6 @@ static inline int freezer_should_skip(struct task_struct *p)
 	return !!(p->flags & PF_FREEZER_SKIP);
 }
 
-/*
- * Tell the freezer that the current task should be frozen by it
- */
-static inline bool set_freezable(void)
-{
-	return __set_freezable(false);
-}
-
-/*
- * Tell the freezer that the current task should be frozen by it and that it
- * should send a fake signal to the task to freeze it.
- */
-static inline bool set_freezable_with_signal(void)
-{
-	return __set_freezable(true);
-}
-
 /*
  * Freezer-friendly wrappers around wait_event_interruptible(),
  * wait_event_killable() and wait_event_interruptible_timeout(), originally
@@ -176,7 +159,6 @@ static inline void freezer_do_not_count(void) {}
 static inline void freezer_count(void) {}
 static inline int freezer_should_skip(struct task_struct *p) { return 0; }
 static inline void set_freezable(void) {}
-static inline void set_freezable_with_signal(void) {}
 
 #define wait_event_freezable(wq, condition)				\
 		wait_event_interruptible(wq, condition)
diff --git a/include/linux/sched.h b/include/linux/sched.h
index d12bd03b688f..2f90470ad843 100644
--- a/include/linux/sched.h
+++ b/include/linux/sched.h
@@ -1788,7 +1788,6 @@ extern void thread_group_times(struct task_struct *p, cputime_t *ut, cputime_t *
 #define PF_MEMPOLICY	0x10000000	/* Non-default NUMA mempolicy */
 #define PF_MUTEX_TESTER	0x20000000	/* Thread belongs to the rt mutex tester */
 #define PF_FREEZER_SKIP	0x40000000	/* Freezer should not count it as freezable */
-#define PF_FREEZER_NOSIG 0x80000000	/* Freezer won't send signals to it */
 
 /*
  * Only the _current_ task can read/write to tsk->flags, but other
diff --git a/kernel/freezer.c b/kernel/freezer.c
index 2589a61de44c..9815b8d1eed5 100644
--- a/kernel/freezer.c
+++ b/kernel/freezer.c
@@ -39,7 +39,7 @@ bool freezing_slow_path(struct task_struct *p)
 	if (pm_nosig_freezing || cgroup_freezing(p))
 		return true;
 
-	if (pm_freezing && !(p->flags & PF_FREEZER_NOSIG))
+	if (pm_freezing && !(p->flags & PF_KTHREAD))
 		return true;
 
 	return false;
@@ -72,10 +72,6 @@ bool __refrigerator(bool check_kthr_stop)
 		schedule();
 	}
 
-	spin_lock_irq(&current->sighand->siglock);
-	recalc_sigpending(); /* We sent fake signal, clean it up */
-	spin_unlock_irq(&current->sighand->siglock);
-
 	pr_debug("%s left refrigerator\n", current->comm);
 
 	/*
@@ -120,7 +116,7 @@ bool freeze_task(struct task_struct *p)
 		return false;
 	}
 
-	if (!(p->flags & PF_FREEZER_NOSIG)) {
+	if (!(p->flags & PF_KTHREAD)) {
 		fake_signal_wake_up(p);
 		/*
 		 * fake_signal_wake_up() goes through p's scheduler
@@ -145,28 +141,19 @@ void __thaw_task(struct task_struct *p)
 	 * be visible to @p as waking up implies wmb.  Waking up inside
 	 * freezer_lock also prevents wakeups from leaking outside
 	 * refrigerator.
-	 *
-	 * If !FROZEN, @p hasn't reached refrigerator, recalc sigpending to
-	 * avoid leaving dangling TIF_SIGPENDING behind.
 	 */
 	spin_lock_irqsave(&freezer_lock, flags);
-	if (frozen(p)) {
+	if (frozen(p))
 		wake_up_process(p);
-	} else {
-		spin_lock(&p->sighand->siglock);
-		recalc_sigpending_and_wake(p);
-		spin_unlock(&p->sighand->siglock);
-	}
 	spin_unlock_irqrestore(&freezer_lock, flags);
 }
 
 /**
- * __set_freezable - make %current freezable
- * @with_signal: do we want %TIF_SIGPENDING for notification too?
+ * set_freezable - make %current freezable
  *
  * Mark %current freezable and enter refrigerator if necessary.
  */
-bool __set_freezable(bool with_signal)
+bool set_freezable(void)
 {
 	might_sleep();
 
@@ -177,10 +164,8 @@ bool __set_freezable(bool with_signal)
 	 */
 	spin_lock_irq(&freezer_lock);
 	current->flags &= ~PF_NOFREEZE;
-	if (with_signal)
-		current->flags &= ~PF_FREEZER_NOSIG;
 	spin_unlock_irq(&freezer_lock);
 
 	return try_to_freeze();
 }
-EXPORT_SYMBOL(__set_freezable);
+EXPORT_SYMBOL(set_freezable);
diff --git a/kernel/kthread.c b/kernel/kthread.c
index 1c36deaae2f1..3d3de633702e 100644
--- a/kernel/kthread.c
+++ b/kernel/kthread.c
@@ -282,7 +282,7 @@ int kthreadd(void *unused)
 	set_cpus_allowed_ptr(tsk, cpu_all_mask);
 	set_mems_allowed(node_states[N_HIGH_MEMORY]);
 
-	current->flags |= PF_NOFREEZE | PF_FREEZER_NOSIG;
+	current->flags |= PF_NOFREEZE;
 
 	for (;;) {
 		set_current_state(TASK_INTERRUPTIBLE);
-- 
cgit v1.2.3-71-gd317


From 341d4166175e9b7911444f5a33b1c9efb8f15c85 Mon Sep 17 00:00:00 2001
From: "Srivatsa S. Bhat" <srivatsa.bhat@linux.vnet.ibm.com>
Date: Sat, 19 Nov 2011 14:39:01 +0100
Subject: PM: Fix indentation and remove extraneous whitespaces in
 kernel/power/main.c

Lack of proper indentation of the goto statement decreases the readability
of code significantly. In fact, this made me look twice at the code to check
whether it really does what it should be doing. Fix this.

And in the same file, there are some extra whitespaces. Get rid of them too.

Signed-off-by: Srivatsa S. Bhat <srivatsa.bhat@linux.vnet.ibm.com>
Signed-off-by: Rafael J. Wysocki <rjw@sisk.pl>
---
 kernel/power/main.c | 6 +++---
 1 file changed, 3 insertions(+), 3 deletions(-)

(limited to 'kernel')

diff --git a/kernel/power/main.c b/kernel/power/main.c
index 36e0f0903c32..7d36fb31e4c4 100644
--- a/kernel/power/main.c
+++ b/kernel/power/main.c
@@ -3,7 +3,7 @@
  *
  * Copyright (c) 2003 Patrick Mochel
  * Copyright (c) 2003 Open Source Development Lab
- * 
+ *
  * This file is released under the GPLv2
  *
  */
@@ -240,7 +240,7 @@ struct kobject *power_kobj;
  *	'standby' (Power-On Suspend), 'mem' (Suspend-to-RAM), and
  *	'disk' (Suspend-to-Disk).
  *
- *	store() accepts one of those strings, translates it into the 
+ *	store() accepts one of those strings, translates it into the
  *	proper enumerated value, and initiates a suspend transition.
  */
 static ssize_t state_show(struct kobject *kobj, struct kobj_attribute *attr,
@@ -282,7 +282,7 @@ static ssize_t state_store(struct kobject *kobj, struct kobj_attribute *attr,
 	/* First, check if we are requested to hibernate */
 	if (len == 4 && !strncmp(buf, "disk", len)) {
 		error = hibernate();
-  goto Exit;
+		goto Exit;
 	}
 
 #ifdef CONFIG_SUSPEND
-- 
cgit v1.2.3-71-gd317


From 953a206393b1533ceb0e7d725cc5a8c8d7ed97dd Mon Sep 17 00:00:00 2001
From: "Srivatsa S. Bhat" <srivatsa.bhat@linux.vnet.ibm.com>
Date: Tue, 22 Nov 2011 23:20:31 +0100
Subject: PM / Hibernate: Refactor and simplify hibernation_snapshot() code

The goto statements in hibernation_snapshot() are a bit complex.
Refactor the code to remove some of them, thereby simplifying the
implementation.

Signed-off-by: Srivatsa S. Bhat <srivatsa.bhat@linux.vnet.ibm.com>
Signed-off-by: Rafael J. Wysocki <rjw@sisk.pl>
---
 kernel/power/hibernate.c | 22 +++++++++-------------
 1 file changed, 9 insertions(+), 13 deletions(-)

(limited to 'kernel')

diff --git a/kernel/power/hibernate.c b/kernel/power/hibernate.c
index a6b0503574ee..ebf62c3bc9f7 100644
--- a/kernel/power/hibernate.c
+++ b/kernel/power/hibernate.c
@@ -333,7 +333,7 @@ static int create_image(int platform_mode)
  */
 int hibernation_snapshot(int platform_mode)
 {
-	pm_message_t msg = PMSG_RECOVER;
+	pm_message_t msg;
 	int error;
 
 	error = platform_begin(platform_mode);
@@ -362,26 +362,26 @@ int hibernation_snapshot(int platform_mode)
 
 	error = dpm_prepare(PMSG_FREEZE);
 	if (error) {
-		dpm_complete(msg);
+		dpm_complete(PMSG_RECOVER);
 		goto Cleanup;
 	}
 
 	suspend_console();
 	pm_restrict_gfp_mask();
+
 	error = dpm_suspend(PMSG_FREEZE);
-	if (error)
-		goto Recover_platform;
 
-	if (hibernation_test(TEST_DEVICES))
-		goto Recover_platform;
+	if (error || hibernation_test(TEST_DEVICES))
+		platform_recover(platform_mode);
+	else
+		error = create_image(platform_mode);
 
-	error = create_image(platform_mode);
 	/*
-	 * Control returns here (1) after the image has been created or the
+	 * In the case that we call create_image() above, the control
+	 * returns here (1) after the image has been created or the
 	 * image creation has failed and (2) after a successful restore.
 	 */
 
- Resume_devices:
 	/* We may need to release the preallocated image pages here. */
 	if (error || !in_suspend)
 		swsusp_free();
@@ -399,10 +399,6 @@ int hibernation_snapshot(int platform_mode)
 	platform_end(platform_mode);
 	return error;
 
- Recover_platform:
-	platform_recover(platform_mode);
-	goto Resume_devices;
-
  Cleanup:
 	swsusp_free();
 	goto Close;
-- 
cgit v1.2.3-71-gd317


From 5307427a31c50041cc4d18c49e9cce1a9303ea04 Mon Sep 17 00:00:00 2001
From: "Srivatsa S. Bhat" <srivatsa.bhat@linux.vnet.ibm.com>
Date: Tue, 15 Nov 2011 21:59:21 +0100
Subject: PM / Usermodehelper: Cleanup remnants of usermodehelper_pm_callback()

usermodehelper_pm_callback() no longer exists in the kernel. There are 2
comments in kernel/kmod.c that still refer to it.

Also, the patch that introduced usermodehelper_pm_callback(), #included
two header files: <linux/notifier.h> and <linux/suspend.h>. But these are
no longer necessary.

This patch updates the comments as appropriate and removes the unnecessary
header file inclusions.

Signed-off-by: Srivatsa S. Bhat <srivatsa.bhat@linux.vnet.ibm.com>
Signed-off-by: Rafael J. Wysocki <rjw@sisk.pl>
---
 kernel/kmod.c | 6 ++----
 1 file changed, 2 insertions(+), 4 deletions(-)

(limited to 'kernel')

diff --git a/kernel/kmod.c b/kernel/kmod.c
index a4bea97c75b6..2142687094d3 100644
--- a/kernel/kmod.c
+++ b/kernel/kmod.c
@@ -34,8 +34,6 @@
 #include <linux/kernel.h>
 #include <linux/init.h>
 #include <linux/resource.h>
-#include <linux/notifier.h>
-#include <linux/suspend.h>
 #include <asm/uaccess.h>
 
 #include <trace/events/module.h>
@@ -282,14 +280,14 @@ static int usermodehelper_disabled = 1;
 static atomic_t running_helpers = ATOMIC_INIT(0);
 
 /*
- * Wait queue head used by usermodehelper_pm_callback() to wait for all running
+ * Wait queue head used by usermodehelper_disable() to wait for all running
  * helpers to finish.
  */
 static DECLARE_WAIT_QUEUE_HEAD(running_helpers_waitq);
 
 /*
  * Time to wait for running_helpers to become zero before the setting of
- * usermodehelper_disabled in usermodehelper_pm_callback() fails
+ * usermodehelper_disabled in usermodehelper_disable() fails
  */
 #define RUNNING_HELPERS_TIMEOUT	(5 * HZ)
 
-- 
cgit v1.2.3-71-gd317


From d7268a31c8aabc5a29ccc7f76de84383e5f38737 Mon Sep 17 00:00:00 2001
From: Fenghua Yu <fenghua.yu@intel.com>
Date: Tue, 15 Nov 2011 21:59:31 +0100
Subject: CPU: Add right qualifiers for alloc_frozen_cpus() and
 cpu_hotplug_pm_sync_init()

Add __init for functions alloc_frozen_cpus() and cpu_hotplug_pm_sync_init()
because they are only called during boot time.

Add static for function cpu_hotplug_pm_sync_init() because its scope is limited
in this file only.

Signed-off-by: Fenghua Yu <fenghua.yu@intel.com>
Acked-by: Srivatsa S. Bhat <srivatsa.bhat@linux.vnet.ibm.com>
Signed-off-by: Rafael J. Wysocki <rjw@sisk.pl>
---
 kernel/cpu.c | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

(limited to 'kernel')

diff --git a/kernel/cpu.c b/kernel/cpu.c
index 563f13609470..cf915b86a5fb 100644
--- a/kernel/cpu.c
+++ b/kernel/cpu.c
@@ -470,7 +470,7 @@ out:
 	cpu_maps_update_done();
 }
 
-static int alloc_frozen_cpus(void)
+static int __init alloc_frozen_cpus(void)
 {
 	if (!alloc_cpumask_var(&frozen_cpus, GFP_KERNEL|__GFP_ZERO))
 		return -ENOMEM;
@@ -543,7 +543,7 @@ cpu_hotplug_pm_callback(struct notifier_block *nb,
 }
 
 
-int cpu_hotplug_pm_sync_init(void)
+static int __init cpu_hotplug_pm_sync_init(void)
 {
 	pm_notifier(cpu_hotplug_pm_callback, 0);
 	return 0;
-- 
cgit v1.2.3-71-gd317


From a13b032776379fa6e2bfccf798969ca51e5fb052 Mon Sep 17 00:00:00 2001
From: Paul Bolle <pebolle@tiscali.nl>
Date: Thu, 24 Nov 2011 12:27:26 +0100
Subject: clockevents: drop unknown Kconfig symbol GENERIC_CLOCKEVENTS_MIGR

There's no Kconfig symbol GENERIC_CLOCKEVENTS_MIGR, so the check for it
will always fail.

Signed-off-by: Paul Bolle <pebolle@tiscali.nl>
Signed-off-by: Jiri Kosina <jkosina@suse.cz>
---
 kernel/time/Kconfig | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

(limited to 'kernel')

diff --git a/kernel/time/Kconfig b/kernel/time/Kconfig
index b26c2228fe92..2cf9cc7aa103 100644
--- a/kernel/time/Kconfig
+++ b/kernel/time/Kconfig
@@ -25,7 +25,7 @@ config HIGH_RES_TIMERS
 config GENERIC_CLOCKEVENTS_BUILD
 	bool
 	default y
-	depends on GENERIC_CLOCKEVENTS || GENERIC_CLOCKEVENTS_MIGR
+	depends on GENERIC_CLOCKEVENTS
 
 config GENERIC_CLOCKEVENTS_MIN_ADJUST
 	bool
-- 
cgit v1.2.3-71-gd317


From 55af77969fbd7a841838220ea2287432e0da8ae5 Mon Sep 17 00:00:00 2001
From: Mitsuo Hayasaka <mitsuo.hayasaka.hu@hitachi.com>
Date: Tue, 29 Nov 2011 15:08:36 +0900
Subject: x86: Panic on detection of stack overflow

Currently, messages are just output on the detection of stack
overflow, which is not sufficient for systems that need a
high reliability. This is because in general the overflow may
corrupt data, and the additional corruption may occur due to
reading them unless systems stop.

This patch adds the sysctl parameter
kernel.panic_on_stackoverflow and causes a panic when detecting
the overflows of kernel, IRQ and exception stacks except user
stack according to the parameter. It is disabled by default.

Signed-off-by: Mitsuo Hayasaka <mitsuo.hayasaka.hu@hitachi.com>
Cc: yrl.pp-manager.tt@hitachi.com
Cc: Randy Dunlap <rdunlap@xenotime.net>
Cc: "H. Peter Anvin" <hpa@zytor.com>
Link: http://lkml.kernel.org/r/20111129060836.11076.12323.stgit@ltc219.sdl.hitachi.co.jp
Signed-off-by: Ingo Molnar <mingo@elte.hu>
---
 Documentation/sysctl/kernel.txt | 14 ++++++++++++++
 arch/x86/kernel/irq_32.c        |  2 ++
 arch/x86/kernel/irq_64.c        |  5 +++++
 include/linux/kernel.h          |  1 +
 kernel/sysctl.c                 |  9 +++++++++
 5 files changed, 31 insertions(+)

(limited to 'kernel')

diff --git a/Documentation/sysctl/kernel.txt b/Documentation/sysctl/kernel.txt
index 1f2463671a1a..6d8cd8b2c30d 100644
--- a/Documentation/sysctl/kernel.txt
+++ b/Documentation/sysctl/kernel.txt
@@ -49,6 +49,7 @@ show up in /proc/sys/kernel:
 - panic
 - panic_on_oops
 - panic_on_unrecovered_nmi
+- panic_on_stackoverflow
 - pid_max
 - powersave-nap               [ PPC only ]
 - printk
@@ -393,6 +394,19 @@ Controls the kernel's behaviour when an oops or BUG is encountered.
 
 ==============================================================
 
+panic_on_stackoverflow:
+
+Controls the kernel's behavior when detecting the overflows of
+kernel, IRQ and exception stacks except a user stack.
+This file shows up if CONFIG_DEBUG_STACKOVERFLOW is enabled.
+
+0: try to continue operation.
+
+1: panic immediately.
+
+==============================================================
+
+
 pid_max:
 
 PID allocation wrap value.  When the kernel's next PID value
diff --git a/arch/x86/kernel/irq_32.c b/arch/x86/kernel/irq_32.c
index 72090705a656..e16e99ebd7ad 100644
--- a/arch/x86/kernel/irq_32.c
+++ b/arch/x86/kernel/irq_32.c
@@ -43,6 +43,8 @@ static void print_stack_overflow(void)
 {
 	printk(KERN_WARNING "low stack detected by irq handler\n");
 	dump_stack();
+	if (sysctl_panic_on_stackoverflow)
+		panic("low stack detected by irq handler - check messages\n");
 }
 
 #else
diff --git a/arch/x86/kernel/irq_64.c b/arch/x86/kernel/irq_64.c
index 928a7e909619..42552b0dce6a 100644
--- a/arch/x86/kernel/irq_64.c
+++ b/arch/x86/kernel/irq_64.c
@@ -26,6 +26,8 @@ EXPORT_PER_CPU_SYMBOL(irq_stat);
 DEFINE_PER_CPU(struct pt_regs *, irq_regs);
 EXPORT_PER_CPU_SYMBOL(irq_regs);
 
+int sysctl_panic_on_stackoverflow;
+
 /*
  * Probabilistic stack overflow check:
  *
@@ -65,6 +67,9 @@ static inline void stack_overflow_check(struct pt_regs *regs)
 		current->comm, curbase, regs->sp,
 		irq_stack_top, irq_stack_bottom,
 		estack_top, estack_bottom);
+
+	if (sysctl_panic_on_stackoverflow)
+		panic("low stack detected by irq handler - check messages\n");
 #endif
 }
 
diff --git a/include/linux/kernel.h b/include/linux/kernel.h
index e8b1597b5cf2..ff83683c0b9d 100644
--- a/include/linux/kernel.h
+++ b/include/linux/kernel.h
@@ -341,6 +341,7 @@ extern int panic_timeout;
 extern int panic_on_oops;
 extern int panic_on_unrecovered_nmi;
 extern int panic_on_io_nmi;
+extern int sysctl_panic_on_stackoverflow;
 extern const char *print_tainted(void);
 extern void add_taint(unsigned flag);
 extern int test_taint(unsigned flag);
diff --git a/kernel/sysctl.c b/kernel/sysctl.c
index ae2719643854..f487f257e05e 100644
--- a/kernel/sysctl.c
+++ b/kernel/sysctl.c
@@ -803,6 +803,15 @@ static struct ctl_table kern_table[] = {
 		.mode		= 0644,
 		.proc_handler	= proc_dointvec,
 	},
+#ifdef CONFIG_DEBUG_STACKOVERFLOW
+	{
+		.procname	= "panic_on_stackoverflow",
+		.data		= &sysctl_panic_on_stackoverflow,
+		.maxlen		= sizeof(int),
+		.mode		= 0644,
+		.proc_handler	= proc_dointvec,
+	},
+#endif
 	{
 		.procname	= "bootloader_type",
 		.data		= &bootloader_type,
-- 
cgit v1.2.3-71-gd317


From 0118521cc7acb3ccbc1a01d6144ac32be9d56a4c Mon Sep 17 00:00:00 2001
From: "Srivatsa S. Bhat" <srivatsa.bhat@linux.vnet.ibm.com>
Date: Thu, 1 Dec 2011 22:32:43 +0100
Subject: PM / Hibernate: Enable usermodehelpers in software_resume() error
 path

In the software_resume() function defined in kernel/power/hibernate.c,
if the call to create_basic_memory_bitmaps() fails, the usermodehelpers
are not enabled (which had been disabled in the previous step). Fix it.

Signed-off-by: Srivatsa S. Bhat <srivatsa.bhat@linux.vnet.ibm.com>
Signed-off-by: Rafael J. Wysocki <rjw@sisk.pl>
---
 kernel/power/hibernate.c | 4 +++-
 1 file changed, 3 insertions(+), 1 deletion(-)

(limited to 'kernel')

diff --git a/kernel/power/hibernate.c b/kernel/power/hibernate.c
index ebf62c3bc9f7..1fcf9de4506d 100644
--- a/kernel/power/hibernate.c
+++ b/kernel/power/hibernate.c
@@ -807,8 +807,10 @@ static int software_resume(void)
 		goto close_finish;
 
 	error = create_basic_memory_bitmaps();
-	if (error)
+	if (error) {
+		usermodehelper_enable();
 		goto close_finish;
+	}
 
 	pr_debug("PM: Preparing processes for restore.\n");
 	error = prepare_processes();
-- 
cgit v1.2.3-71-gd317


From 97819a26224f019e73d88bb2fd4eb5a614860461 Mon Sep 17 00:00:00 2001
From: "Srivatsa S. Bhat" <srivatsa.bhat@linux.vnet.ibm.com>
Date: Thu, 1 Dec 2011 22:33:10 +0100
Subject: PM / Hibernate: Thaw processes in SNAPSHOT_CREATE_IMAGE ioctl test
 path

Commit 2aede851ddf08666f68ffc17be446420e9d2a056 (PM / Hibernate: Freeze
kernel threads after preallocating memory) moved the freezing of kernel
threads to hibernation_snapshot() function.

So now, if the call to hibernation_snapshot() returns early due to a
successful hibernation test, the caller has to thaw processes to ensure
that the system gets back to its original state.

But in SNAPSHOT_CREATE_IMAGE hibernation ioctl, the caller does not thaw
processes in case hibernation_snapshot() returned due to a successful
freezer test. Fix this issue. But note we still send the value of 'in_suspend'
(which is now 0) to userspace, because we are not in an error path per-se,
and moreover, the value of in_suspend correctly depicts the situation here.

Signed-off-by: Srivatsa S. Bhat <srivatsa.bhat@linux.vnet.ibm.com>
Signed-off-by: Rafael J. Wysocki <rjw@sisk.pl>
---
 kernel/power/hibernate.c |  2 +-
 kernel/power/power.h     |  2 ++
 kernel/power/user.c      | 11 ++++++++---
 3 files changed, 11 insertions(+), 4 deletions(-)

(limited to 'kernel')

diff --git a/kernel/power/hibernate.c b/kernel/power/hibernate.c
index 1fcf9de4506d..c10cb0f916e2 100644
--- a/kernel/power/hibernate.c
+++ b/kernel/power/hibernate.c
@@ -55,7 +55,7 @@ enum {
 
 static int hibernation_mode = HIBERNATION_SHUTDOWN;
 
-static bool freezer_test_done;
+bool freezer_test_done;
 
 static const struct platform_hibernation_ops *hibernation_ops;
 
diff --git a/kernel/power/power.h b/kernel/power/power.h
index 23a2db1ec442..0c4defe6d3b8 100644
--- a/kernel/power/power.h
+++ b/kernel/power/power.h
@@ -50,6 +50,8 @@ static inline char *check_image_kernel(struct swsusp_info *info)
 #define SPARE_PAGES	((1024 * 1024) >> PAGE_SHIFT)
 
 /* kernel/power/hibernate.c */
+extern bool freezer_test_done;
+
 extern int hibernation_snapshot(int platform_mode);
 extern int hibernation_restore(int platform_mode);
 extern int hibernation_platform_enter(void);
diff --git a/kernel/power/user.c b/kernel/power/user.c
index 6d8f535c2b88..e2aff0fc2697 100644
--- a/kernel/power/user.c
+++ b/kernel/power/user.c
@@ -283,10 +283,15 @@ static long snapshot_ioctl(struct file *filp, unsigned int cmd,
 		}
 		pm_restore_gfp_mask();
 		error = hibernation_snapshot(data->platform_support);
-		if (!error)
+		if (!error) {
 			error = put_user(in_suspend, (int __user *)arg);
-		if (!error)
-			data->ready = 1;
+			if (!error && !freezer_test_done)
+				data->ready = 1;
+			if (freezer_test_done) {
+				freezer_test_done = false;
+				thaw_processes();
+			}
+		}
 		break;
 
 	case SNAPSHOT_ATOMIC_RESTORE:
-- 
cgit v1.2.3-71-gd317


From 48580ab8729865c81e148d59159fbe2aa7865511 Mon Sep 17 00:00:00 2001
From: "Srivatsa S. Bhat" <srivatsa.bhat@linux.vnet.ibm.com>
Date: Thu, 1 Dec 2011 22:33:20 +0100
Subject: PM / Hibernate: Remove deprecated hibernation test modes

The hibernation test modes 'test' and 'testproc' are deprecated, because
the 'pm_test' framework offers much more fine-grained control for debugging
suspend and hibernation related problems.

So, remove the deprecated 'test' and 'testproc' hibernation test modes.

Suggested-by: Rafael J. Wysocki <rjw@sisk.pl>
Signed-off-by: Srivatsa S. Bhat <srivatsa.bhat@linux.vnet.ibm.com>
Signed-off-by: Rafael J. Wysocki <rjw@sisk.pl>
---
 kernel/power/hibernate.c | 33 ++++-----------------------------
 1 file changed, 4 insertions(+), 29 deletions(-)

(limited to 'kernel')

diff --git a/kernel/power/hibernate.c b/kernel/power/hibernate.c
index c10cb0f916e2..5314a94a92c1 100644
--- a/kernel/power/hibernate.c
+++ b/kernel/power/hibernate.c
@@ -43,8 +43,6 @@ int in_suspend __nosavedata;
 enum {
 	HIBERNATION_INVALID,
 	HIBERNATION_PLATFORM,
-	HIBERNATION_TEST,
-	HIBERNATION_TESTPROC,
 	HIBERNATION_SHUTDOWN,
 	HIBERNATION_REBOOT,
 	/* keep last */
@@ -96,15 +94,6 @@ static void hibernation_debug_sleep(void)
 	mdelay(5000);
 }
 
-static int hibernation_testmode(int mode)
-{
-	if (hibernation_mode == mode) {
-		hibernation_debug_sleep();
-		return 1;
-	}
-	return 0;
-}
-
 static int hibernation_test(int level)
 {
 	if (pm_test_level == level) {
@@ -114,7 +103,6 @@ static int hibernation_test(int level)
 	return 0;
 }
 #else /* !CONFIG_PM_DEBUG */
-static int hibernation_testmode(int mode) { return 0; }
 static int hibernation_test(int level) { return 0; }
 #endif /* !CONFIG_PM_DEBUG */
 
@@ -278,8 +266,7 @@ static int create_image(int platform_mode)
 		goto Platform_finish;
 
 	error = disable_nonboot_cpus();
-	if (error || hibernation_test(TEST_CPUS)
-	    || hibernation_testmode(HIBERNATION_TEST))
+	if (error || hibernation_test(TEST_CPUS))
 		goto Enable_cpus;
 
 	local_irq_disable();
@@ -349,8 +336,7 @@ int hibernation_snapshot(int platform_mode)
 	if (error)
 		goto Cleanup;
 
-	if (hibernation_test(TEST_FREEZER) ||
-		hibernation_testmode(HIBERNATION_TESTPROC)) {
+	if (hibernation_test(TEST_FREEZER)) {
 
 		/*
 		 * Indicate to the caller that we are returning due to a
@@ -586,9 +572,6 @@ int hibernation_platform_enter(void)
 static void power_down(void)
 {
 	switch (hibernation_mode) {
-	case HIBERNATION_TEST:
-	case HIBERNATION_TESTPROC:
-		break;
 	case HIBERNATION_REBOOT:
 		kernel_restart(NULL);
 		break;
@@ -853,8 +836,6 @@ static const char * const hibernation_modes[] = {
 	[HIBERNATION_PLATFORM]	= "platform",
 	[HIBERNATION_SHUTDOWN]	= "shutdown",
 	[HIBERNATION_REBOOT]	= "reboot",
-	[HIBERNATION_TEST]	= "test",
-	[HIBERNATION_TESTPROC]	= "testproc",
 };
 
 /*
@@ -863,17 +844,15 @@ static const char * const hibernation_modes[] = {
  * Hibernation can be handled in several ways.  There are a few different ways
  * to put the system into the sleep state: using the platform driver (e.g. ACPI
  * or other hibernation_ops), powering it off or rebooting it (for testing
- * mostly), or using one of the two available test modes.
+ * mostly).
  *
  * The sysfs file /sys/power/disk provides an interface for selecting the
  * hibernation mode to use.  Reading from this file causes the available modes
- * to be printed.  There are 5 modes that can be supported:
+ * to be printed.  There are 3 modes that can be supported:
  *
  *	'platform'
  *	'shutdown'
  *	'reboot'
- *	'test'
- *	'testproc'
  *
  * If a platform hibernation driver is in use, 'platform' will be supported
  * and will be used by default.  Otherwise, 'shutdown' will be used by default.
@@ -897,8 +876,6 @@ static ssize_t disk_show(struct kobject *kobj, struct kobj_attribute *attr,
 		switch (i) {
 		case HIBERNATION_SHUTDOWN:
 		case HIBERNATION_REBOOT:
-		case HIBERNATION_TEST:
-		case HIBERNATION_TESTPROC:
 			break;
 		case HIBERNATION_PLATFORM:
 			if (hibernation_ops)
@@ -939,8 +916,6 @@ static ssize_t disk_store(struct kobject *kobj, struct kobj_attribute *attr,
 		switch (mode) {
 		case HIBERNATION_SHUTDOWN:
 		case HIBERNATION_REBOOT:
-		case HIBERNATION_TEST:
-		case HIBERNATION_TESTPROC:
 			hibernation_mode = mode;
 			break;
 		case HIBERNATION_PLATFORM:
-- 
cgit v1.2.3-71-gd317


From e5b16746f0f2d6883c226af52d90904ce0f7eee8 Mon Sep 17 00:00:00 2001
From: "Srivatsa S. Bhat" <srivatsa.bhat@linux.vnet.ibm.com>
Date: Sat, 3 Dec 2011 00:20:30 +0100
Subject: PM / Hibernate: Replace unintuitive 'if' condition in
 kernel/power/user.c with 'else'

In the snapshot_ioctl() function, under SNAPSHOT_FREEZE, the code below
freeze_processes() is a bit unintuitive. Improve it by replacing the
second 'if' condition with an 'else' clause.

Signed-off-by: Srivatsa S. Bhat <srivatsa.bhat@linux.vnet.ibm.com>
Signed-off-by: Rafael J. Wysocki <rjw@sisk.pl>
---
 kernel/power/user.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

(limited to 'kernel')

diff --git a/kernel/power/user.c b/kernel/power/user.c
index c202e2e1a2d5..06ea33df8560 100644
--- a/kernel/power/user.c
+++ b/kernel/power/user.c
@@ -259,7 +259,7 @@ static long snapshot_ioctl(struct file *filp, unsigned int cmd,
 		error = freeze_processes();
 		if (error)
 			usermodehelper_enable();
-		if (!error)
+		else
 			data->frozen = 1;
 		break;
 
-- 
cgit v1.2.3-71-gd317


From bcda53faf5814c0c6025a0bd47108adfcbe9f199 Mon Sep 17 00:00:00 2001
From: "Srivatsa S. Bhat" <srivatsa.bhat@linux.vnet.ibm.com>
Date: Wed, 7 Dec 2011 22:29:54 +0100
Subject: PM / Sleep: Replace mutex_[un]lock(&pm_mutex) with
 [un]lock_system_sleep()

Using [un]lock_system_sleep() is safer than directly using mutex_[un]lock()
on 'pm_mutex', since the latter could lead to freezing failures. Hence convert
all the present users of mutex_[un]lock(&pm_mutex) to use these safe APIs
instead.

Suggested-by: Tejun Heo <tj@kernel.org>
Signed-off-by: Srivatsa S. Bhat <srivatsa.bhat@linux.vnet.ibm.com>
Reviewed-by: Simon Horman <horms@verge.net.au>
Signed-off-by: Rafael J. Wysocki <rjw@sisk.pl>
---
 kernel/kexec.c           |  4 ++--
 kernel/power/hibernate.c | 16 ++++++++--------
 kernel/power/main.c      |  4 ++--
 kernel/power/suspend.c   |  4 ++--
 kernel/power/user.c      | 16 ++++++++--------
 5 files changed, 22 insertions(+), 22 deletions(-)

(limited to 'kernel')

diff --git a/kernel/kexec.c b/kernel/kexec.c
index dc7bc0829286..090ee10d9604 100644
--- a/kernel/kexec.c
+++ b/kernel/kexec.c
@@ -1523,7 +1523,7 @@ int kernel_kexec(void)
 
 #ifdef CONFIG_KEXEC_JUMP
 	if (kexec_image->preserve_context) {
-		mutex_lock(&pm_mutex);
+		lock_system_sleep();
 		pm_prepare_console();
 		error = freeze_processes();
 		if (error) {
@@ -1576,7 +1576,7 @@ int kernel_kexec(void)
 		thaw_processes();
  Restore_console:
 		pm_restore_console();
-		mutex_unlock(&pm_mutex);
+		unlock_system_sleep();
 	}
 #endif
 
diff --git a/kernel/power/hibernate.c b/kernel/power/hibernate.c
index 605149a6d219..6d6d28870335 100644
--- a/kernel/power/hibernate.c
+++ b/kernel/power/hibernate.c
@@ -69,14 +69,14 @@ void hibernation_set_ops(const struct platform_hibernation_ops *ops)
 		WARN_ON(1);
 		return;
 	}
-	mutex_lock(&pm_mutex);
+	lock_system_sleep();
 	hibernation_ops = ops;
 	if (ops)
 		hibernation_mode = HIBERNATION_PLATFORM;
 	else if (hibernation_mode == HIBERNATION_PLATFORM)
 		hibernation_mode = HIBERNATION_SHUTDOWN;
 
-	mutex_unlock(&pm_mutex);
+	unlock_system_sleep();
 }
 
 static bool entering_platform_hibernation;
@@ -597,7 +597,7 @@ int hibernate(void)
 {
 	int error;
 
-	mutex_lock(&pm_mutex);
+	lock_system_sleep();
 	/* The snapshot device should not be opened while we're running */
 	if (!atomic_add_unless(&snapshot_device_available, -1, 0)) {
 		error = -EBUSY;
@@ -665,7 +665,7 @@ int hibernate(void)
 	pm_restore_console();
 	atomic_inc(&snapshot_device_available);
  Unlock:
-	mutex_unlock(&pm_mutex);
+	unlock_system_sleep();
 	return error;
 }
 
@@ -893,7 +893,7 @@ static ssize_t disk_store(struct kobject *kobj, struct kobj_attribute *attr,
 	p = memchr(buf, '\n', n);
 	len = p ? p - buf : n;
 
-	mutex_lock(&pm_mutex);
+	lock_system_sleep();
 	for (i = HIBERNATION_FIRST; i <= HIBERNATION_MAX; i++) {
 		if (len == strlen(hibernation_modes[i])
 		    && !strncmp(buf, hibernation_modes[i], len)) {
@@ -919,7 +919,7 @@ static ssize_t disk_store(struct kobject *kobj, struct kobj_attribute *attr,
 	if (!error)
 		pr_debug("PM: Hibernation mode set to '%s'\n",
 			 hibernation_modes[mode]);
-	mutex_unlock(&pm_mutex);
+	unlock_system_sleep();
 	return error ? error : n;
 }
 
@@ -946,9 +946,9 @@ static ssize_t resume_store(struct kobject *kobj, struct kobj_attribute *attr,
 	if (maj != MAJOR(res) || min != MINOR(res))
 		goto out;
 
-	mutex_lock(&pm_mutex);
+	lock_system_sleep();
 	swsusp_resume_device = res;
-	mutex_unlock(&pm_mutex);
+	unlock_system_sleep();
 	printk(KERN_INFO "PM: Starting manual resume from disk\n");
 	noresume = 0;
 	software_resume();
diff --git a/kernel/power/main.c b/kernel/power/main.c
index 7d36fb31e4c4..9824b41e5a18 100644
--- a/kernel/power/main.c
+++ b/kernel/power/main.c
@@ -116,7 +116,7 @@ static ssize_t pm_test_store(struct kobject *kobj, struct kobj_attribute *attr,
 	p = memchr(buf, '\n', n);
 	len = p ? p - buf : n;
 
-	mutex_lock(&pm_mutex);
+	lock_system_sleep();
 
 	level = TEST_FIRST;
 	for (s = &pm_tests[level]; level <= TEST_MAX; s++, level++)
@@ -126,7 +126,7 @@ static ssize_t pm_test_store(struct kobject *kobj, struct kobj_attribute *attr,
 			break;
 		}
 
-	mutex_unlock(&pm_mutex);
+	unlock_system_sleep();
 
 	return error ? error : n;
 }
diff --git a/kernel/power/suspend.c b/kernel/power/suspend.c
index d336b27d1104..4fd51beed879 100644
--- a/kernel/power/suspend.c
+++ b/kernel/power/suspend.c
@@ -42,9 +42,9 @@ static const struct platform_suspend_ops *suspend_ops;
  */
 void suspend_set_ops(const struct platform_suspend_ops *ops)
 {
-	mutex_lock(&pm_mutex);
+	lock_system_sleep();
 	suspend_ops = ops;
-	mutex_unlock(&pm_mutex);
+	unlock_system_sleep();
 }
 EXPORT_SYMBOL_GPL(suspend_set_ops);
 
diff --git a/kernel/power/user.c b/kernel/power/user.c
index 06ea33df8560..98ade217da6c 100644
--- a/kernel/power/user.c
+++ b/kernel/power/user.c
@@ -71,7 +71,7 @@ static int snapshot_open(struct inode *inode, struct file *filp)
 	struct snapshot_data *data;
 	int error;
 
-	mutex_lock(&pm_mutex);
+	lock_system_sleep();
 
 	if (!atomic_add_unless(&snapshot_device_available, -1, 0)) {
 		error = -EBUSY;
@@ -123,7 +123,7 @@ static int snapshot_open(struct inode *inode, struct file *filp)
 	data->platform_support = 0;
 
  Unlock:
-	mutex_unlock(&pm_mutex);
+	unlock_system_sleep();
 
 	return error;
 }
@@ -132,7 +132,7 @@ static int snapshot_release(struct inode *inode, struct file *filp)
 {
 	struct snapshot_data *data;
 
-	mutex_lock(&pm_mutex);
+	lock_system_sleep();
 
 	swsusp_free();
 	free_basic_memory_bitmaps();
@@ -146,7 +146,7 @@ static int snapshot_release(struct inode *inode, struct file *filp)
 			PM_POST_HIBERNATION : PM_POST_RESTORE);
 	atomic_inc(&snapshot_device_available);
 
-	mutex_unlock(&pm_mutex);
+	unlock_system_sleep();
 
 	return 0;
 }
@@ -158,7 +158,7 @@ static ssize_t snapshot_read(struct file *filp, char __user *buf,
 	ssize_t res;
 	loff_t pg_offp = *offp & ~PAGE_MASK;
 
-	mutex_lock(&pm_mutex);
+	lock_system_sleep();
 
 	data = filp->private_data;
 	if (!data->ready) {
@@ -179,7 +179,7 @@ static ssize_t snapshot_read(struct file *filp, char __user *buf,
 		*offp += res;
 
  Unlock:
-	mutex_unlock(&pm_mutex);
+	unlock_system_sleep();
 
 	return res;
 }
@@ -191,7 +191,7 @@ static ssize_t snapshot_write(struct file *filp, const char __user *buf,
 	ssize_t res;
 	loff_t pg_offp = *offp & ~PAGE_MASK;
 
-	mutex_lock(&pm_mutex);
+	lock_system_sleep();
 
 	data = filp->private_data;
 
@@ -208,7 +208,7 @@ static ssize_t snapshot_write(struct file *filp, const char __user *buf,
 	if (res > 0)
 		*offp += res;
 unlock:
-	mutex_unlock(&pm_mutex);
+	unlock_system_sleep();
 
 	return res;
 }
-- 
cgit v1.2.3-71-gd317


From b298d289c79211508f11cb50749b0d1d54eb244a Mon Sep 17 00:00:00 2001
From: "Srivatsa S. Bhat" <srivatsa.bhat@linux.vnet.ibm.com>
Date: Fri, 9 Dec 2011 23:36:36 +0100
Subject: PM / Sleep: Fix freezer failures due to racy
 usermodehelper_is_disabled()

Commit a144c6a (PM: Print a warning if firmware is requested when tasks
are frozen) introduced usermodehelper_is_disabled() to warn and exit
immediately if firmware is requested when usermodehelpers are disabled.

However, it is racy. Consider the following scenario, currently used in
drivers/base/firmware_class.c:

...
if (usermodehelper_is_disabled())
        goto out;

/* Do actual work */
...

out:
        return err;

Nothing prevents someone from disabling usermodehelpers just after the check
in the 'if' condition, which means that it is quite possible to try doing the
"actual work" with usermodehelpers disabled, leading to undesirable
consequences.

In particular, this race condition in _request_firmware() causes task freezing
failures whenever suspend/hibernation is in progress because, it wrongly waits
to get the firmware/microcode image from userspace when actually the
usermodehelpers are disabled or userspace has been frozen.
Some of the example scenarios that cause freezing failures due to this race
are those that depend on userspace via request_firmware(), such as x86
microcode module initialization and microcode image reload.

Previous discussions about this issue can be found at:
http://thread.gmane.org/gmane.linux.kernel/1198291/focus=1200591

This patch adds proper synchronization to fix this issue.

It is to be noted that this patchset fixes the freezing failures but doesn't
remove the warnings. IOW, it does not attempt to add explicit synchronization
to x86 microcode driver to avoid requesting microcode image at inopportune
moments. Because, the warnings were introduced to highlight such cases, in the
first place. And we need not silence the warnings, since we take care of the
*real* problem (freezing failure) and hence, after that, the warnings are
pretty harmless anyway.

Signed-off-by: Srivatsa S. Bhat <srivatsa.bhat@linux.vnet.ibm.com>
Signed-off-by: Rafael J. Wysocki <rjw@sisk.pl>
---
 drivers/base/firmware_class.c |  4 ++++
 include/linux/kmod.h          |  2 ++
 kernel/kmod.c                 | 23 ++++++++++++++++++++++-
 3 files changed, 28 insertions(+), 1 deletion(-)

(limited to 'kernel')

diff --git a/drivers/base/firmware_class.c b/drivers/base/firmware_class.c
index 06ed6b4e7df5..d5585da14c8a 100644
--- a/drivers/base/firmware_class.c
+++ b/drivers/base/firmware_class.c
@@ -534,6 +534,8 @@ static int _request_firmware(const struct firmware **firmware_p,
 		return 0;
 	}
 
+	read_lock_usermodehelper();
+
 	if (WARN_ON(usermodehelper_is_disabled())) {
 		dev_err(device, "firmware: %s will not be loaded\n", name);
 		retval = -EBUSY;
@@ -572,6 +574,8 @@ static int _request_firmware(const struct firmware **firmware_p,
 	fw_destroy_instance(fw_priv);
 
 out:
+	read_unlock_usermodehelper();
+
 	if (retval) {
 		release_firmware(firmware);
 		*firmware_p = NULL;
diff --git a/include/linux/kmod.h b/include/linux/kmod.h
index b16f65390734..722f477c4ef7 100644
--- a/include/linux/kmod.h
+++ b/include/linux/kmod.h
@@ -117,5 +117,7 @@ extern void usermodehelper_init(void);
 extern int usermodehelper_disable(void);
 extern void usermodehelper_enable(void);
 extern bool usermodehelper_is_disabled(void);
+extern void read_lock_usermodehelper(void);
+extern void read_unlock_usermodehelper(void);
 
 #endif /* __LINUX_KMOD_H__ */
diff --git a/kernel/kmod.c b/kernel/kmod.c
index a4bea97c75b6..81b4a27261b2 100644
--- a/kernel/kmod.c
+++ b/kernel/kmod.c
@@ -36,6 +36,7 @@
 #include <linux/resource.h>
 #include <linux/notifier.h>
 #include <linux/suspend.h>
+#include <linux/rwsem.h>
 #include <asm/uaccess.h>
 
 #include <trace/events/module.h>
@@ -50,6 +51,7 @@ static struct workqueue_struct *khelper_wq;
 static kernel_cap_t usermodehelper_bset = CAP_FULL_SET;
 static kernel_cap_t usermodehelper_inheritable = CAP_FULL_SET;
 static DEFINE_SPINLOCK(umh_sysctl_lock);
+static DECLARE_RWSEM(umhelper_sem);
 
 #ifdef CONFIG_MODULES
 
@@ -275,6 +277,7 @@ static void __call_usermodehelper(struct work_struct *work)
  * If set, call_usermodehelper_exec() will exit immediately returning -EBUSY
  * (used for preventing user land processes from being created after the user
  * land has been frozen during a system-wide hibernation or suspend operation).
+ * Should always be manipulated under umhelper_sem acquired for write.
  */
 static int usermodehelper_disabled = 1;
 
@@ -293,6 +296,18 @@ static DECLARE_WAIT_QUEUE_HEAD(running_helpers_waitq);
  */
 #define RUNNING_HELPERS_TIMEOUT	(5 * HZ)
 
+void read_lock_usermodehelper(void)
+{
+	down_read(&umhelper_sem);
+}
+EXPORT_SYMBOL_GPL(read_lock_usermodehelper);
+
+void read_unlock_usermodehelper(void)
+{
+	up_read(&umhelper_sem);
+}
+EXPORT_SYMBOL_GPL(read_unlock_usermodehelper);
+
 /**
  * usermodehelper_disable - prevent new helpers from being started
  */
@@ -300,8 +315,10 @@ int usermodehelper_disable(void)
 {
 	long retval;
 
+	down_write(&umhelper_sem);
 	usermodehelper_disabled = 1;
-	smp_mb();
+	up_write(&umhelper_sem);
+
 	/*
 	 * From now on call_usermodehelper_exec() won't start any new
 	 * helpers, so it is sufficient if running_helpers turns out to
@@ -314,7 +331,9 @@ int usermodehelper_disable(void)
 	if (retval)
 		return 0;
 
+	down_write(&umhelper_sem);
 	usermodehelper_disabled = 0;
+	up_write(&umhelper_sem);
 	return -EAGAIN;
 }
 
@@ -323,7 +342,9 @@ int usermodehelper_disable(void)
  */
 void usermodehelper_enable(void)
 {
+	down_write(&umhelper_sem);
 	usermodehelper_disabled = 0;
+	up_write(&umhelper_sem);
 }
 
 /**
-- 
cgit v1.2.3-71-gd317


From cf007e3526a785a95a738d5a8fba44f1f4fe33e0 Mon Sep 17 00:00:00 2001
From: "Srivatsa S. Bhat" <srivatsa.bhat@linux.vnet.ibm.com>
Date: Thu, 8 Dec 2011 23:42:53 +0100
Subject: PM / Hibernate: Remove deprecated hibernation snapshot ioctls

Several snapshot ioctls were marked for removal quite some time ago,
since they were deprecated. Remove them.

Suggested-by: Rafael J. Wysocki <rjw@sisk.pl>
Signed-off-by: Srivatsa S. Bhat <srivatsa.bhat@linux.vnet.ibm.com>
Signed-off-by: Rafael J. Wysocki <rjw@sisk.pl>
---
 Documentation/feature-removal-schedule.txt | 11 ----
 kernel/power/user.c                        | 87 ------------------------------
 2 files changed, 98 deletions(-)

(limited to 'kernel')

diff --git a/Documentation/feature-removal-schedule.txt b/Documentation/feature-removal-schedule.txt
index 3d849122b5b1..9f51fc439a81 100644
--- a/Documentation/feature-removal-schedule.txt
+++ b/Documentation/feature-removal-schedule.txt
@@ -85,17 +85,6 @@ Who:	Robin Getz <rgetz@blackfin.uclinux.org> & Matt Mackall <mpm@selenic.com>
 
 ---------------------------
 
-What:	Deprecated snapshot ioctls
-When:	2.6.36
-
-Why:	The ioctls in kernel/power/user.c were marked as deprecated long time
-	ago. Now they notify users about that so that they need to replace
-	their userspace. After some more time, remove them completely.
-
-Who:	Jiri Slaby <jirislaby@gmail.com>
-
----------------------------
-
 What:	The ieee80211_regdom module parameter
 When:	March 2010 / desktop catchup
 
diff --git a/kernel/power/user.c b/kernel/power/user.c
index 98ade217da6c..78bdb4404aab 100644
--- a/kernel/power/user.c
+++ b/kernel/power/user.c
@@ -30,28 +30,6 @@
 
 #include "power.h"
 
-/*
- * NOTE: The SNAPSHOT_SET_SWAP_FILE and SNAPSHOT_PMOPS ioctls are obsolete and
- * will be removed in the future.  They are only preserved here for
- * compatibility with existing userland utilities.
- */
-#define SNAPSHOT_SET_SWAP_FILE	_IOW(SNAPSHOT_IOC_MAGIC, 10, unsigned int)
-#define SNAPSHOT_PMOPS		_IOW(SNAPSHOT_IOC_MAGIC, 12, unsigned int)
-
-#define PMOPS_PREPARE	1
-#define PMOPS_ENTER	2
-#define PMOPS_FINISH	3
-
-/*
- * NOTE: The following ioctl definitions are wrong and have been replaced with
- * correct ones.  They are only preserved here for compatibility with existing
- * userland utilities and will be removed in the future.
- */
-#define SNAPSHOT_ATOMIC_SNAPSHOT	_IOW(SNAPSHOT_IOC_MAGIC, 3, void *)
-#define SNAPSHOT_SET_IMAGE_SIZE		_IOW(SNAPSHOT_IOC_MAGIC, 6, unsigned long)
-#define SNAPSHOT_AVAIL_SWAP		_IOR(SNAPSHOT_IOC_MAGIC, 7, void *)
-#define SNAPSHOT_GET_SWAP_PAGE		_IOR(SNAPSHOT_IOC_MAGIC, 8, void *)
-
 
 #define SNAPSHOT_MINOR	231
 
@@ -213,15 +191,6 @@ unlock:
 	return res;
 }
 
-static void snapshot_deprecated_ioctl(unsigned int cmd)
-{
-	if (printk_ratelimit())
-		printk(KERN_NOTICE "%pf: ioctl '%.8x' is deprecated and will "
-				"be removed soon, update your suspend-to-disk "
-				"utilities\n",
-				__builtin_return_address(0), cmd);
-}
-
 static long snapshot_ioctl(struct file *filp, unsigned int cmd,
 							unsigned long arg)
 {
@@ -272,8 +241,6 @@ static long snapshot_ioctl(struct file *filp, unsigned int cmd,
 		data->frozen = 0;
 		break;
 
-	case SNAPSHOT_ATOMIC_SNAPSHOT:
-		snapshot_deprecated_ioctl(cmd);
 	case SNAPSHOT_CREATE_IMAGE:
 		if (data->mode != O_RDONLY || !data->frozen  || data->ready) {
 			error = -EPERM;
@@ -308,8 +275,6 @@ static long snapshot_ioctl(struct file *filp, unsigned int cmd,
 		data->ready = 0;
 		break;
 
-	case SNAPSHOT_SET_IMAGE_SIZE:
-		snapshot_deprecated_ioctl(cmd);
 	case SNAPSHOT_PREF_IMAGE_SIZE:
 		image_size = arg;
 		break;
@@ -324,16 +289,12 @@ static long snapshot_ioctl(struct file *filp, unsigned int cmd,
 		error = put_user(size, (loff_t __user *)arg);
 		break;
 
-	case SNAPSHOT_AVAIL_SWAP:
-		snapshot_deprecated_ioctl(cmd);
 	case SNAPSHOT_AVAIL_SWAP_SIZE:
 		size = count_swap_pages(data->swap, 1);
 		size <<= PAGE_SHIFT;
 		error = put_user(size, (loff_t __user *)arg);
 		break;
 
-	case SNAPSHOT_GET_SWAP_PAGE:
-		snapshot_deprecated_ioctl(cmd);
 	case SNAPSHOT_ALLOC_SWAP_PAGE:
 		if (data->swap < 0 || data->swap >= MAX_SWAPFILES) {
 			error = -ENODEV;
@@ -356,27 +317,6 @@ static long snapshot_ioctl(struct file *filp, unsigned int cmd,
 		free_all_swap_pages(data->swap);
 		break;
 
-	case SNAPSHOT_SET_SWAP_FILE: /* This ioctl is deprecated */
-		snapshot_deprecated_ioctl(cmd);
-		if (!swsusp_swap_in_use()) {
-			/*
-			 * User space encodes device types as two-byte values,
-			 * so we need to recode them
-			 */
-			if (old_decode_dev(arg)) {
-				data->swap = swap_type_of(old_decode_dev(arg),
-							0, NULL);
-				if (data->swap < 0)
-					error = -ENODEV;
-			} else {
-				data->swap = -1;
-				error = -EINVAL;
-			}
-		} else {
-			error = -EPERM;
-		}
-		break;
-
 	case SNAPSHOT_S2RAM:
 		if (!data->frozen) {
 			error = -EPERM;
@@ -399,33 +339,6 @@ static long snapshot_ioctl(struct file *filp, unsigned int cmd,
 			error = hibernation_platform_enter();
 		break;
 
-	case SNAPSHOT_PMOPS: /* This ioctl is deprecated */
-		snapshot_deprecated_ioctl(cmd);
-		error = -EINVAL;
-
-		switch (arg) {
-
-		case PMOPS_PREPARE:
-			data->platform_support = 1;
-			error = 0;
-			break;
-
-		case PMOPS_ENTER:
-			if (data->platform_support)
-				error = hibernation_platform_enter();
-			break;
-
-		case PMOPS_FINISH:
-			if (data->platform_support)
-				error = 0;
-			break;
-
-		default:
-			printk(KERN_ERR "SNAPSHOT_PMOPS: invalid argument %ld\n", arg);
-
-		}
-		break;
-
 	case SNAPSHOT_SET_SWAP_AREA:
 		if (swsusp_swap_in_use()) {
 			error = -EPERM;
-- 
cgit v1.2.3-71-gd317


From e25e2cbb4c6679bed5f52fb0f2cc381688297901 Mon Sep 17 00:00:00 2001
From: Tejun Heo <tj@kernel.org>
Date: Mon, 12 Dec 2011 18:12:21 -0800
Subject: cgroup: add cgroup_root_mutex

cgroup wants to make threadgroup stable while modifying cgroup
hierarchies which will introduce locking dependency on
cred_guard_mutex from cgroup_mutex.  This unfortunately completes
circular dependency.

 A. cgroup_mutex -> cred_guard_mutex -> s_type->i_mutex_key -> namespace_sem
 B. namespace_sem -> cgroup_mutex

B is from cgroup_show_options() and this patch breaks it by
introducing another mutex cgroup_root_mutex which nests inside
cgroup_mutex and protects cgroupfs_root.

Signed-off-by: Tejun Heo <tj@kernel.org>
Reviewed-by: KAMEZAWA Hiroyuki <kamezawa.hiroyu@jp.fujitsu.com>
Acked-by: Li Zefan <lizf@cn.fujitsu.com>
Cc: Oleg Nesterov <oleg@redhat.com>
---
 kernel/cgroup.c | 64 +++++++++++++++++++++++++++++++++++++--------------------
 1 file changed, 42 insertions(+), 22 deletions(-)

(limited to 'kernel')

diff --git a/kernel/cgroup.c b/kernel/cgroup.c
index d9d5648f3cdc..6545fd61b10d 100644
--- a/kernel/cgroup.c
+++ b/kernel/cgroup.c
@@ -63,7 +63,24 @@
 
 #include <linux/atomic.h>
 
+/*
+ * cgroup_mutex is the master lock.  Any modification to cgroup or its
+ * hierarchy must be performed while holding it.
+ *
+ * cgroup_root_mutex nests inside cgroup_mutex and should be held to modify
+ * cgroupfs_root of any cgroup hierarchy - subsys list, flags,
+ * release_agent_path and so on.  Modifying requires both cgroup_mutex and
+ * cgroup_root_mutex.  Readers can acquire either of the two.  This is to
+ * break the following locking order cycle.
+ *
+ *  A. cgroup_mutex -> cred_guard_mutex -> s_type->i_mutex_key -> namespace_sem
+ *  B. namespace_sem -> cgroup_mutex
+ *
+ * B happens only through cgroup_show_options() and using cgroup_root_mutex
+ * breaks it.
+ */
 static DEFINE_MUTEX(cgroup_mutex);
+static DEFINE_MUTEX(cgroup_root_mutex);
 
 /*
  * Generate an array of cgroup subsystem pointers. At boot time, this is
@@ -953,6 +970,7 @@ static int rebind_subsystems(struct cgroupfs_root *root,
 	int i;
 
 	BUG_ON(!mutex_is_locked(&cgroup_mutex));
+	BUG_ON(!mutex_is_locked(&cgroup_root_mutex));
 
 	removed_bits = root->actual_subsys_bits & ~final_bits;
 	added_bits = final_bits & ~root->actual_subsys_bits;
@@ -1043,7 +1061,7 @@ static int cgroup_show_options(struct seq_file *seq, struct vfsmount *vfs)
 	struct cgroupfs_root *root = vfs->mnt_sb->s_fs_info;
 	struct cgroup_subsys *ss;
 
-	mutex_lock(&cgroup_mutex);
+	mutex_lock(&cgroup_root_mutex);
 	for_each_subsys(root, ss)
 		seq_printf(seq, ",%s", ss->name);
 	if (test_bit(ROOT_NOPREFIX, &root->flags))
@@ -1054,7 +1072,7 @@ static int cgroup_show_options(struct seq_file *seq, struct vfsmount *vfs)
 		seq_puts(seq, ",clone_children");
 	if (strlen(root->name))
 		seq_printf(seq, ",name=%s", root->name);
-	mutex_unlock(&cgroup_mutex);
+	mutex_unlock(&cgroup_root_mutex);
 	return 0;
 }
 
@@ -1269,6 +1287,7 @@ static int cgroup_remount(struct super_block *sb, int *flags, char *data)
 
 	mutex_lock(&cgrp->dentry->d_inode->i_mutex);
 	mutex_lock(&cgroup_mutex);
+	mutex_lock(&cgroup_root_mutex);
 
 	/* See what subsystems are wanted */
 	ret = parse_cgroupfs_options(data, &opts);
@@ -1297,6 +1316,7 @@ static int cgroup_remount(struct super_block *sb, int *flags, char *data)
  out_unlock:
 	kfree(opts.release_agent);
 	kfree(opts.name);
+	mutex_unlock(&cgroup_root_mutex);
 	mutex_unlock(&cgroup_mutex);
 	mutex_unlock(&cgrp->dentry->d_inode->i_mutex);
 	return ret;
@@ -1481,6 +1501,7 @@ static struct dentry *cgroup_mount(struct file_system_type *fs_type,
 	int ret = 0;
 	struct super_block *sb;
 	struct cgroupfs_root *new_root;
+	struct inode *inode;
 
 	/* First find the desired set of subsystems */
 	mutex_lock(&cgroup_mutex);
@@ -1514,7 +1535,6 @@ static struct dentry *cgroup_mount(struct file_system_type *fs_type,
 		/* We used the new root structure, so this is a new hierarchy */
 		struct list_head tmp_cg_links;
 		struct cgroup *root_cgrp = &root->top_cgroup;
-		struct inode *inode;
 		struct cgroupfs_root *existing_root;
 		const struct cred *cred;
 		int i;
@@ -1528,18 +1548,14 @@ static struct dentry *cgroup_mount(struct file_system_type *fs_type,
 
 		mutex_lock(&inode->i_mutex);
 		mutex_lock(&cgroup_mutex);
+		mutex_lock(&cgroup_root_mutex);
 
-		if (strlen(root->name)) {
-			/* Check for name clashes with existing mounts */
-			for_each_active_root(existing_root) {
-				if (!strcmp(existing_root->name, root->name)) {
-					ret = -EBUSY;
-					mutex_unlock(&cgroup_mutex);
-					mutex_unlock(&inode->i_mutex);
-					goto drop_new_super;
-				}
-			}
-		}
+		/* Check for name clashes with existing mounts */
+		ret = -EBUSY;
+		if (strlen(root->name))
+			for_each_active_root(existing_root)
+				if (!strcmp(existing_root->name, root->name))
+					goto unlock_drop;
 
 		/*
 		 * We're accessing css_set_count without locking
@@ -1549,18 +1565,13 @@ static struct dentry *cgroup_mount(struct file_system_type *fs_type,
 		 * have some link structures left over
 		 */
 		ret = allocate_cg_links(css_set_count, &tmp_cg_links);
-		if (ret) {
-			mutex_unlock(&cgroup_mutex);
-			mutex_unlock(&inode->i_mutex);
-			goto drop_new_super;
-		}
+		if (ret)
+			goto unlock_drop;
 
 		ret = rebind_subsystems(root, root->subsys_bits);
 		if (ret == -EBUSY) {
-			mutex_unlock(&cgroup_mutex);
-			mutex_unlock(&inode->i_mutex);
 			free_cg_links(&tmp_cg_links);
-			goto drop_new_super;
+			goto unlock_drop;
 		}
 		/*
 		 * There must be no failure case after here, since rebinding
@@ -1599,6 +1610,7 @@ static struct dentry *cgroup_mount(struct file_system_type *fs_type,
 		cred = override_creds(&init_cred);
 		cgroup_populate_dir(root_cgrp);
 		revert_creds(cred);
+		mutex_unlock(&cgroup_root_mutex);
 		mutex_unlock(&cgroup_mutex);
 		mutex_unlock(&inode->i_mutex);
 	} else {
@@ -1615,6 +1627,10 @@ static struct dentry *cgroup_mount(struct file_system_type *fs_type,
 	kfree(opts.name);
 	return dget(sb->s_root);
 
+ unlock_drop:
+	mutex_unlock(&cgroup_root_mutex);
+	mutex_unlock(&cgroup_mutex);
+	mutex_unlock(&inode->i_mutex);
  drop_new_super:
 	deactivate_locked_super(sb);
  drop_modules:
@@ -1639,6 +1655,7 @@ static void cgroup_kill_sb(struct super_block *sb) {
 	BUG_ON(!list_empty(&cgrp->sibling));
 
 	mutex_lock(&cgroup_mutex);
+	mutex_lock(&cgroup_root_mutex);
 
 	/* Rebind all subsystems back to the default hierarchy */
 	ret = rebind_subsystems(root, 0);
@@ -1664,6 +1681,7 @@ static void cgroup_kill_sb(struct super_block *sb) {
 		root_count--;
 	}
 
+	mutex_unlock(&cgroup_root_mutex);
 	mutex_unlock(&cgroup_mutex);
 
 	kill_litter_super(sb);
@@ -2311,7 +2329,9 @@ static int cgroup_release_agent_write(struct cgroup *cgrp, struct cftype *cft,
 		return -EINVAL;
 	if (!cgroup_lock_live_group(cgrp))
 		return -ENODEV;
+	mutex_lock(&cgroup_root_mutex);
 	strcpy(cgrp->root->release_agent_path, buffer);
+	mutex_unlock(&cgroup_root_mutex);
 	cgroup_unlock();
 	return 0;
 }
-- 
cgit v1.2.3-71-gd317


From 257058ae2b971646b96ab3a15605ac69186e562a Mon Sep 17 00:00:00 2001
From: Tejun Heo <tj@kernel.org>
Date: Mon, 12 Dec 2011 18:12:21 -0800
Subject: threadgroup: rename signal->threadgroup_fork_lock to ->group_rwsem

Make the following renames to prepare for extension of threadgroup
locking.

* s/signal->threadgroup_fork_lock/signal->group_rwsem/
* s/threadgroup_fork_read_lock()/threadgroup_change_begin()/
* s/threadgroup_fork_read_unlock()/threadgroup_change_end()/
* s/threadgroup_fork_write_lock()/threadgroup_lock()/
* s/threadgroup_fork_write_unlock()/threadgroup_unlock()/

This patch doesn't cause any behavior change.

-v2: Rename threadgroup_change_done() to threadgroup_change_end() per
     KAMEZAWA's suggestion.

Signed-off-by: Tejun Heo <tj@kernel.org>
Reviewed-by: KAMEZAWA Hiroyuki <kamezawa.hiroyu@jp.fujitsu.com>
Acked-by: Li Zefan <lizf@cn.fujitsu.com>
Cc: Oleg Nesterov <oleg@redhat.com>
Cc: Andrew Morton <akpm@linux-foundation.org>
Cc: Paul Menage <paul@paulmenage.org>
---
 include/linux/init_task.h |  9 ++++-----
 include/linux/sched.h     | 30 +++++++++++++++---------------
 kernel/cgroup.c           | 13 ++++++-------
 kernel/fork.c             |  8 ++++----
 4 files changed, 29 insertions(+), 31 deletions(-)

(limited to 'kernel')

diff --git a/include/linux/init_task.h b/include/linux/init_task.h
index 94b1e356c02a..f4544b99efe4 100644
--- a/include/linux/init_task.h
+++ b/include/linux/init_task.h
@@ -23,11 +23,10 @@ extern struct files_struct init_files;
 extern struct fs_struct init_fs;
 
 #ifdef CONFIG_CGROUPS
-#define INIT_THREADGROUP_FORK_LOCK(sig)					\
-	.threadgroup_fork_lock =					\
-		__RWSEM_INITIALIZER(sig.threadgroup_fork_lock),
+#define INIT_GROUP_RWSEM(sig)						\
+	.group_rwsem = __RWSEM_INITIALIZER(sig.group_rwsem),
 #else
-#define INIT_THREADGROUP_FORK_LOCK(sig)
+#define INIT_GROUP_RWSEM(sig)
 #endif
 
 #define INIT_SIGNALS(sig) {						\
@@ -46,7 +45,7 @@ extern struct fs_struct init_fs;
 	},								\
 	.cred_guard_mutex =						\
 		 __MUTEX_INITIALIZER(sig.cred_guard_mutex),		\
-	INIT_THREADGROUP_FORK_LOCK(sig)					\
+	INIT_GROUP_RWSEM(sig)						\
 }
 
 extern struct nsproxy init_nsproxy;
diff --git a/include/linux/sched.h b/include/linux/sched.h
index d81cce933869..8cd523202a3b 100644
--- a/include/linux/sched.h
+++ b/include/linux/sched.h
@@ -635,13 +635,13 @@ struct signal_struct {
 #endif
 #ifdef CONFIG_CGROUPS
 	/*
-	 * The threadgroup_fork_lock prevents threads from forking with
+	 * The group_rwsem prevents threads from forking with
 	 * CLONE_THREAD while held for writing. Use this for fork-sensitive
 	 * threadgroup-wide operations. It's taken for reading in fork.c in
 	 * copy_process().
 	 * Currently only needed write-side by cgroups.
 	 */
-	struct rw_semaphore threadgroup_fork_lock;
+	struct rw_semaphore group_rwsem;
 #endif
 
 	int oom_adj;		/* OOM kill score adjustment (bit shift) */
@@ -2371,29 +2371,29 @@ static inline void unlock_task_sighand(struct task_struct *tsk,
 	spin_unlock_irqrestore(&tsk->sighand->siglock, *flags);
 }
 
-/* See the declaration of threadgroup_fork_lock in signal_struct. */
+/* See the declaration of group_rwsem in signal_struct. */
 #ifdef CONFIG_CGROUPS
-static inline void threadgroup_fork_read_lock(struct task_struct *tsk)
+static inline void threadgroup_change_begin(struct task_struct *tsk)
 {
-	down_read(&tsk->signal->threadgroup_fork_lock);
+	down_read(&tsk->signal->group_rwsem);
 }
-static inline void threadgroup_fork_read_unlock(struct task_struct *tsk)
+static inline void threadgroup_change_end(struct task_struct *tsk)
 {
-	up_read(&tsk->signal->threadgroup_fork_lock);
+	up_read(&tsk->signal->group_rwsem);
 }
-static inline void threadgroup_fork_write_lock(struct task_struct *tsk)
+static inline void threadgroup_lock(struct task_struct *tsk)
 {
-	down_write(&tsk->signal->threadgroup_fork_lock);
+	down_write(&tsk->signal->group_rwsem);
 }
-static inline void threadgroup_fork_write_unlock(struct task_struct *tsk)
+static inline void threadgroup_unlock(struct task_struct *tsk)
 {
-	up_write(&tsk->signal->threadgroup_fork_lock);
+	up_write(&tsk->signal->group_rwsem);
 }
 #else
-static inline void threadgroup_fork_read_lock(struct task_struct *tsk) {}
-static inline void threadgroup_fork_read_unlock(struct task_struct *tsk) {}
-static inline void threadgroup_fork_write_lock(struct task_struct *tsk) {}
-static inline void threadgroup_fork_write_unlock(struct task_struct *tsk) {}
+static inline void threadgroup_change_begin(struct task_struct *tsk) {}
+static inline void threadgroup_change_end(struct task_struct *tsk) {}
+static inline void threadgroup_lock(struct task_struct *tsk) {}
+static inline void threadgroup_unlock(struct task_struct *tsk) {}
 #endif
 
 #ifndef __HAVE_THREAD_FUNCTIONS
diff --git a/kernel/cgroup.c b/kernel/cgroup.c
index 6545fd61b10d..b409df3b2e9d 100644
--- a/kernel/cgroup.c
+++ b/kernel/cgroup.c
@@ -2003,8 +2003,8 @@ static int css_set_prefetch(struct cgroup *cgrp, struct css_set *cg,
  * @cgrp: the cgroup to attach to
  * @leader: the threadgroup leader task_struct of the group to be attached
  *
- * Call holding cgroup_mutex and the threadgroup_fork_lock of the leader. Will
- * take task_lock of each thread in leader's threadgroup individually in turn.
+ * Call holding cgroup_mutex and the group_rwsem of the leader. Will take
+ * task_lock of each thread in leader's threadgroup individually in turn.
  */
 int cgroup_attach_proc(struct cgroup *cgrp, struct task_struct *leader)
 {
@@ -2030,8 +2030,8 @@ int cgroup_attach_proc(struct cgroup *cgrp, struct task_struct *leader)
 	 * step 0: in order to do expensive, possibly blocking operations for
 	 * every thread, we cannot iterate the thread group list, since it needs
 	 * rcu or tasklist locked. instead, build an array of all threads in the
-	 * group - threadgroup_fork_lock prevents new threads from appearing,
-	 * and if threads exit, this will just be an over-estimate.
+	 * group - group_rwsem prevents new threads from appearing, and if
+	 * threads exit, this will just be an over-estimate.
 	 */
 	group_size = get_nr_threads(leader);
 	/* flex_array supports very large thread-groups better than kmalloc. */
@@ -2249,7 +2249,6 @@ static int attach_task_by_pid(struct cgroup *cgrp, u64 pid, bool threadgroup)
 			cgroup_unlock();
 			return -ESRCH;
 		}
-
 		/*
 		 * even if we're attaching all tasks in the thread group, we
 		 * only need to check permissions on one of them.
@@ -2273,9 +2272,9 @@ static int attach_task_by_pid(struct cgroup *cgrp, u64 pid, bool threadgroup)
 	}
 
 	if (threadgroup) {
-		threadgroup_fork_write_lock(tsk);
+		threadgroup_lock(tsk);
 		ret = cgroup_attach_proc(cgrp, tsk);
-		threadgroup_fork_write_unlock(tsk);
+		threadgroup_unlock(tsk);
 	} else {
 		ret = cgroup_attach_task(cgrp, tsk);
 	}
diff --git a/kernel/fork.c b/kernel/fork.c
index 827808613847..d4ac9e3e0075 100644
--- a/kernel/fork.c
+++ b/kernel/fork.c
@@ -972,7 +972,7 @@ static int copy_signal(unsigned long clone_flags, struct task_struct *tsk)
 	sched_autogroup_fork(sig);
 
 #ifdef CONFIG_CGROUPS
-	init_rwsem(&sig->threadgroup_fork_lock);
+	init_rwsem(&sig->group_rwsem);
 #endif
 
 	sig->oom_adj = current->signal->oom_adj;
@@ -1157,7 +1157,7 @@ static struct task_struct *copy_process(unsigned long clone_flags,
 	p->io_context = NULL;
 	p->audit_context = NULL;
 	if (clone_flags & CLONE_THREAD)
-		threadgroup_fork_read_lock(current);
+		threadgroup_change_begin(current);
 	cgroup_fork(p);
 #ifdef CONFIG_NUMA
 	p->mempolicy = mpol_dup(p->mempolicy);
@@ -1372,7 +1372,7 @@ static struct task_struct *copy_process(unsigned long clone_flags,
 	proc_fork_connector(p);
 	cgroup_post_fork(p);
 	if (clone_flags & CLONE_THREAD)
-		threadgroup_fork_read_unlock(current);
+		threadgroup_change_end(current);
 	perf_event_fork(p);
 	return p;
 
@@ -1407,7 +1407,7 @@ bad_fork_cleanup_policy:
 bad_fork_cleanup_cgroup:
 #endif
 	if (clone_flags & CLONE_THREAD)
-		threadgroup_fork_read_unlock(current);
+		threadgroup_change_end(current);
 	cgroup_exit(p, cgroup_callbacks_done);
 	delayacct_tsk_free(p);
 	module_put(task_thread_info(p)->exec_domain->module);
-- 
cgit v1.2.3-71-gd317


From 77e4ef99d1c596a31747668e5fd837f77b6349b6 Mon Sep 17 00:00:00 2001
From: Tejun Heo <tj@kernel.org>
Date: Mon, 12 Dec 2011 18:12:21 -0800
Subject: threadgroup: extend threadgroup_lock() to cover exit and exec

threadgroup_lock() protected only protected against new addition to
the threadgroup, which was inherently somewhat incomplete and
problematic for its only user cgroup.  On-going migration could race
against exec and exit leading to interesting problems - the symmetry
between various attach methods, task exiting during method execution,
->exit() racing against attach methods, migrating task switching basic
properties during exec and so on.

This patch extends threadgroup_lock() such that it protects against
all three threadgroup altering operations - fork, exit and exec.  For
exit, threadgroup_change_begin/end() calls are added to exit_signals
around assertion of PF_EXITING.  For exec, threadgroup_[un]lock() are
updated to also grab and release cred_guard_mutex.

With this change, threadgroup_lock() guarantees that the target
threadgroup will remain stable - no new task will be added, no new
PF_EXITING will be set and exec won't happen.

The next patch will update cgroup so that it can take full advantage
of this change.

-v2: beefed up comment as suggested by Frederic.

-v3: narrowed scope of protection in exit path as suggested by
     Frederic.

Signed-off-by: Tejun Heo <tj@kernel.org>
Reviewed-by: KAMEZAWA Hiroyuki <kamezawa.hiroyu@jp.fujitsu.com>
Acked-by: Li Zefan <lizf@cn.fujitsu.com>
Acked-by: Frederic Weisbecker <fweisbec@gmail.com>
Cc: Oleg Nesterov <oleg@redhat.com>
Cc: Andrew Morton <akpm@linux-foundation.org>
Cc: Paul Menage <paul@paulmenage.org>
Cc: Linus Torvalds <torvalds@linux-foundation.org>
---
 include/linux/sched.h | 47 +++++++++++++++++++++++++++++++++++++++++------
 kernel/signal.c       | 10 ++++++++++
 2 files changed, 51 insertions(+), 6 deletions(-)

(limited to 'kernel')

diff --git a/include/linux/sched.h b/include/linux/sched.h
index 8cd523202a3b..c0c5876c52c0 100644
--- a/include/linux/sched.h
+++ b/include/linux/sched.h
@@ -635,11 +635,13 @@ struct signal_struct {
 #endif
 #ifdef CONFIG_CGROUPS
 	/*
-	 * The group_rwsem prevents threads from forking with
-	 * CLONE_THREAD while held for writing. Use this for fork-sensitive
-	 * threadgroup-wide operations. It's taken for reading in fork.c in
-	 * copy_process().
-	 * Currently only needed write-side by cgroups.
+	 * group_rwsem prevents new tasks from entering the threadgroup and
+	 * member tasks from exiting,a more specifically, setting of
+	 * PF_EXITING.  fork and exit paths are protected with this rwsem
+	 * using threadgroup_change_begin/end().  Users which require
+	 * threadgroup to remain stable should use threadgroup_[un]lock()
+	 * which also takes care of exec path.  Currently, cgroup is the
+	 * only user.
 	 */
 	struct rw_semaphore group_rwsem;
 #endif
@@ -2371,7 +2373,6 @@ static inline void unlock_task_sighand(struct task_struct *tsk,
 	spin_unlock_irqrestore(&tsk->sighand->siglock, *flags);
 }
 
-/* See the declaration of group_rwsem in signal_struct. */
 #ifdef CONFIG_CGROUPS
 static inline void threadgroup_change_begin(struct task_struct *tsk)
 {
@@ -2381,13 +2382,47 @@ static inline void threadgroup_change_end(struct task_struct *tsk)
 {
 	up_read(&tsk->signal->group_rwsem);
 }
+
+/**
+ * threadgroup_lock - lock threadgroup
+ * @tsk: member task of the threadgroup to lock
+ *
+ * Lock the threadgroup @tsk belongs to.  No new task is allowed to enter
+ * and member tasks aren't allowed to exit (as indicated by PF_EXITING) or
+ * perform exec.  This is useful for cases where the threadgroup needs to
+ * stay stable across blockable operations.
+ *
+ * fork and exit paths explicitly call threadgroup_change_{begin|end}() for
+ * synchronization.  While held, no new task will be added to threadgroup
+ * and no existing live task will have its PF_EXITING set.
+ *
+ * During exec, a task goes and puts its thread group through unusual
+ * changes.  After de-threading, exclusive access is assumed to resources
+ * which are usually shared by tasks in the same group - e.g. sighand may
+ * be replaced with a new one.  Also, the exec'ing task takes over group
+ * leader role including its pid.  Exclude these changes while locked by
+ * grabbing cred_guard_mutex which is used to synchronize exec path.
+ */
 static inline void threadgroup_lock(struct task_struct *tsk)
 {
+	/*
+	 * exec uses exit for de-threading nesting group_rwsem inside
+	 * cred_guard_mutex. Grab cred_guard_mutex first.
+	 */
+	mutex_lock(&tsk->signal->cred_guard_mutex);
 	down_write(&tsk->signal->group_rwsem);
 }
+
+/**
+ * threadgroup_unlock - unlock threadgroup
+ * @tsk: member task of the threadgroup to unlock
+ *
+ * Reverse threadgroup_lock().
+ */
 static inline void threadgroup_unlock(struct task_struct *tsk)
 {
 	up_write(&tsk->signal->group_rwsem);
+	mutex_unlock(&tsk->signal->cred_guard_mutex);
 }
 #else
 static inline void threadgroup_change_begin(struct task_struct *tsk) {}
diff --git a/kernel/signal.c b/kernel/signal.c
index b3f78d09a105..399c184bf0ae 100644
--- a/kernel/signal.c
+++ b/kernel/signal.c
@@ -2359,8 +2359,15 @@ void exit_signals(struct task_struct *tsk)
 	int group_stop = 0;
 	sigset_t unblocked;
 
+	/*
+	 * @tsk is about to have PF_EXITING set - lock out users which
+	 * expect stable threadgroup.
+	 */
+	threadgroup_change_begin(tsk);
+
 	if (thread_group_empty(tsk) || signal_group_exit(tsk->signal)) {
 		tsk->flags |= PF_EXITING;
+		threadgroup_change_end(tsk);
 		return;
 	}
 
@@ -2370,6 +2377,9 @@ void exit_signals(struct task_struct *tsk)
 	 * see wants_signal(), do_signal_stop().
 	 */
 	tsk->flags |= PF_EXITING;
+
+	threadgroup_change_end(tsk);
+
 	if (!signal_pending(tsk))
 		goto out;
 
-- 
cgit v1.2.3-71-gd317


From cd3d095275374220921fcf0d4e0c16584b26ddbc Mon Sep 17 00:00:00 2001
From: Tejun Heo <tj@kernel.org>
Date: Mon, 12 Dec 2011 18:12:21 -0800
Subject: cgroup: always lock threadgroup during migration

Update cgroup to take advantage of the fack that threadgroup_lock()
guarantees stable threadgroup.

* Lock threadgroup even if the target is a single task.  This
  guarantees that when the target tasks stay stable during migration
  regardless of the target type.

* Remove PF_EXITING early exit optimization from attach_task_by_pid()
  and check it in cgroup_task_migrate() instead.  The optimization was
  for rather cold path to begin with and PF_EXITING state can be
  trusted throughout migration by checking it after locking
  threadgroup.

* Don't add PF_EXITING tasks to target task array in
  cgroup_attach_proc().  This ensures that task migration is performed
  only for live tasks.

* Remove -ESRCH failure path from cgroup_task_migrate().  With the
  above changes, it's guaranteed to be called only for live tasks.

After the changes, only live tasks are migrated and they're guaranteed
to stay alive until migration is complete.  This removes problems
caused by exec and exit racing against cgroup migration including
symmetry among cgroup attach methods and different cgroup methods
racing each other.

v2: Oleg pointed out that one more PF_EXITING check can be removed
    from cgroup_attach_proc().  Removed.

Signed-off-by: Tejun Heo <tj@kernel.org>
Reviewed-by: KAMEZAWA Hiroyuki <kamezawa.hiroyu@jp.fujitsu.com>
Reviewed-by: Frederic Weisbecker <fweisbec@gmail.com>
Acked-by: Li Zefan <lizf@cn.fujitsu.com>
Cc: Oleg Nesterov <oleg@redhat.com>
Cc: Andrew Morton <akpm@linux-foundation.org>
Cc: Paul Menage <paul@paulmenage.org>
---
 kernel/cgroup.c | 62 +++++++++++++++++++++++++--------------------------------
 1 file changed, 27 insertions(+), 35 deletions(-)

(limited to 'kernel')

diff --git a/kernel/cgroup.c b/kernel/cgroup.c
index b409df3b2e9d..d71e012e81be 100644
--- a/kernel/cgroup.c
+++ b/kernel/cgroup.c
@@ -1762,7 +1762,7 @@ EXPORT_SYMBOL_GPL(cgroup_path);
  *
  * 'guarantee' is set if the caller promises that a new css_set for the task
  * will already exist. If not set, this function might sleep, and can fail with
- * -ENOMEM. Otherwise, it can only fail with -ESRCH.
+ * -ENOMEM. Must be called with cgroup_mutex and threadgroup locked.
  */
 static int cgroup_task_migrate(struct cgroup *cgrp, struct cgroup *oldcgrp,
 			       struct task_struct *tsk, bool guarantee)
@@ -1800,13 +1800,9 @@ static int cgroup_task_migrate(struct cgroup *cgrp, struct cgroup *oldcgrp,
 	}
 	put_css_set(oldcg);
 
-	/* if PF_EXITING is set, the tsk->cgroups pointer is no longer safe. */
+	/* @tsk can't exit as its threadgroup is locked */
 	task_lock(tsk);
-	if (tsk->flags & PF_EXITING) {
-		task_unlock(tsk);
-		put_css_set(newcg);
-		return -ESRCH;
-	}
+	WARN_ON_ONCE(tsk->flags & PF_EXITING);
 	rcu_assign_pointer(tsk->cgroups, newcg);
 	task_unlock(tsk);
 
@@ -1832,8 +1828,8 @@ static int cgroup_task_migrate(struct cgroup *cgrp, struct cgroup *oldcgrp,
  * @cgrp: the cgroup the task is attaching to
  * @tsk: the task to be attached
  *
- * Call holding cgroup_mutex. May take task_lock of
- * the task 'tsk' during call.
+ * Call with cgroup_mutex and threadgroup locked. May take task_lock of
+ * @tsk during call.
  */
 int cgroup_attach_task(struct cgroup *cgrp, struct task_struct *tsk)
 {
@@ -1842,6 +1838,10 @@ int cgroup_attach_task(struct cgroup *cgrp, struct task_struct *tsk)
 	struct cgroup *oldcgrp;
 	struct cgroupfs_root *root = cgrp->root;
 
+	/* @tsk either already exited or can't exit until the end */
+	if (tsk->flags & PF_EXITING)
+		return -ESRCH;
+
 	/* Nothing to do if the task is already in that cgroup */
 	oldcgrp = task_cgroup_from_root(tsk, root);
 	if (cgrp == oldcgrp)
@@ -2062,6 +2062,10 @@ int cgroup_attach_proc(struct cgroup *cgrp, struct task_struct *leader)
 	tsk = leader;
 	i = 0;
 	do {
+		/* @tsk either already exited or can't exit until the end */
+		if (tsk->flags & PF_EXITING)
+			continue;
+
 		/* as per above, nr_threads may decrease, but not increase. */
 		BUG_ON(i >= group_size);
 		get_task_struct(tsk);
@@ -2116,11 +2120,6 @@ int cgroup_attach_proc(struct cgroup *cgrp, struct task_struct *leader)
 			continue;
 		/* get old css_set pointer */
 		task_lock(tsk);
-		if (tsk->flags & PF_EXITING) {
-			/* ignore this task if it's going away */
-			task_unlock(tsk);
-			continue;
-		}
 		oldcg = tsk->cgroups;
 		get_css_set(oldcg);
 		task_unlock(tsk);
@@ -2153,16 +2152,12 @@ int cgroup_attach_proc(struct cgroup *cgrp, struct task_struct *leader)
 		oldcgrp = task_cgroup_from_root(tsk, root);
 		if (cgrp == oldcgrp)
 			continue;
-		/* if the thread is PF_EXITING, it can just get skipped. */
 		retval = cgroup_task_migrate(cgrp, oldcgrp, tsk, true);
-		if (retval == 0) {
-			/* attach each task to each subsystem */
-			for_each_subsys(root, ss) {
-				if (ss->attach_task)
-					ss->attach_task(cgrp, tsk);
-			}
-		} else {
-			BUG_ON(retval != -ESRCH);
+		BUG_ON(retval);
+		/* attach each task to each subsystem */
+		for_each_subsys(root, ss) {
+			if (ss->attach_task)
+				ss->attach_task(cgrp, tsk);
 		}
 	}
 	/* nothing is sensitive to fork() after this point. */
@@ -2215,8 +2210,8 @@ out_free_group_list:
 
 /*
  * Find the task_struct of the task to attach by vpid and pass it along to the
- * function to attach either it or all tasks in its threadgroup. Will take
- * cgroup_mutex; may take task_lock of task.
+ * function to attach either it or all tasks in its threadgroup. Will lock
+ * cgroup_mutex and threadgroup; may take task_lock of task.
  */
 static int attach_task_by_pid(struct cgroup *cgrp, u64 pid, bool threadgroup)
 {
@@ -2243,11 +2238,6 @@ static int attach_task_by_pid(struct cgroup *cgrp, u64 pid, bool threadgroup)
 			 * detect it later.
 			 */
 			tsk = tsk->group_leader;
-		} else if (tsk->flags & PF_EXITING) {
-			/* optimization for the single-task-only case */
-			rcu_read_unlock();
-			cgroup_unlock();
-			return -ESRCH;
 		}
 		/*
 		 * even if we're attaching all tasks in the thread group, we
@@ -2271,13 +2261,15 @@ static int attach_task_by_pid(struct cgroup *cgrp, u64 pid, bool threadgroup)
 		get_task_struct(tsk);
 	}
 
-	if (threadgroup) {
-		threadgroup_lock(tsk);
+	threadgroup_lock(tsk);
+
+	if (threadgroup)
 		ret = cgroup_attach_proc(cgrp, tsk);
-		threadgroup_unlock(tsk);
-	} else {
+	else
 		ret = cgroup_attach_task(cgrp, tsk);
-	}
+
+	threadgroup_unlock(tsk);
+
 	put_task_struct(tsk);
 	cgroup_unlock();
 	return ret;
-- 
cgit v1.2.3-71-gd317


From 134d33737f9015761c3832f6b268fae6274aac7f Mon Sep 17 00:00:00 2001
From: Tejun Heo <tj@kernel.org>
Date: Mon, 12 Dec 2011 18:12:21 -0800
Subject: cgroup: improve old cgroup handling in cgroup_attach_proc()

cgroup_attach_proc() behaves differently from cgroup_attach_task() in
the following aspects.

* All hooks are invoked even if no task is actually being moved.

* ->can_attach_task() is called for all tasks in the group whether the
  new cgrp is different from the current cgrp or not; however,
  ->attach_task() is skipped if new equals new.  This makes the calls
  asymmetric.

This patch improves old cgroup handling in cgroup_attach_proc() by
looking up the current cgroup at the head, recording it in the flex
array along with the task itself, and using it to remove the above two
differences.  This will also ease further changes.

-v2: nr_todo renamed to nr_migrating_tasks as per Paul Menage's
     suggestion.

Signed-off-by: Tejun Heo <tj@kernel.org>
Reviewed-by: KAMEZAWA Hiroyuki <kamezawa.hiroyu@jp.fujitsu.com>
Reviewed-by: Frederic Weisbecker <fweisbec@gmail.com>
Acked-by: Paul Menage <paul@paulmenage.org>
Acked-by: Li Zefan <lizf@cn.fujitsu.com>
---
 kernel/cgroup.c | 66 ++++++++++++++++++++++++++++++++++++---------------------
 1 file changed, 42 insertions(+), 24 deletions(-)

(limited to 'kernel')

diff --git a/kernel/cgroup.c b/kernel/cgroup.c
index d71e012e81be..0f2d00519d37 100644
--- a/kernel/cgroup.c
+++ b/kernel/cgroup.c
@@ -1757,6 +1757,11 @@ int cgroup_path(const struct cgroup *cgrp, char *buf, int buflen)
 }
 EXPORT_SYMBOL_GPL(cgroup_path);
 
+struct task_and_cgroup {
+	struct task_struct	*task;
+	struct cgroup		*cgrp;
+};
+
 /*
  * cgroup_task_migrate - move a task from one cgroup to another.
  *
@@ -2008,15 +2013,15 @@ static int css_set_prefetch(struct cgroup *cgrp, struct css_set *cg,
  */
 int cgroup_attach_proc(struct cgroup *cgrp, struct task_struct *leader)
 {
-	int retval, i, group_size;
+	int retval, i, group_size, nr_migrating_tasks;
 	struct cgroup_subsys *ss, *failed_ss = NULL;
 	bool cancel_failed_ss = false;
 	/* guaranteed to be initialized later, but the compiler needs this */
-	struct cgroup *oldcgrp = NULL;
 	struct css_set *oldcg;
 	struct cgroupfs_root *root = cgrp->root;
 	/* threadgroup list cursor and array */
 	struct task_struct *tsk;
+	struct task_and_cgroup *tc;
 	struct flex_array *group;
 	/*
 	 * we need to make sure we have css_sets for all the tasks we're
@@ -2035,8 +2040,7 @@ int cgroup_attach_proc(struct cgroup *cgrp, struct task_struct *leader)
 	 */
 	group_size = get_nr_threads(leader);
 	/* flex_array supports very large thread-groups better than kmalloc. */
-	group = flex_array_alloc(sizeof(struct task_struct *), group_size,
-				 GFP_KERNEL);
+	group = flex_array_alloc(sizeof(*tc), group_size, GFP_KERNEL);
 	if (!group)
 		return -ENOMEM;
 	/* pre-allocate to guarantee space while iterating in rcu read-side. */
@@ -2060,8 +2064,10 @@ int cgroup_attach_proc(struct cgroup *cgrp, struct task_struct *leader)
 	}
 	/* take a reference on each task in the group to go in the array. */
 	tsk = leader;
-	i = 0;
+	i = nr_migrating_tasks = 0;
 	do {
+		struct task_and_cgroup ent;
+
 		/* @tsk either already exited or can't exit until the end */
 		if (tsk->flags & PF_EXITING)
 			continue;
@@ -2073,14 +2079,23 @@ int cgroup_attach_proc(struct cgroup *cgrp, struct task_struct *leader)
 		 * saying GFP_ATOMIC has no effect here because we did prealloc
 		 * earlier, but it's good form to communicate our expectations.
 		 */
-		retval = flex_array_put_ptr(group, i, tsk, GFP_ATOMIC);
+		ent.task = tsk;
+		ent.cgrp = task_cgroup_from_root(tsk, root);
+		retval = flex_array_put(group, i, &ent, GFP_ATOMIC);
 		BUG_ON(retval != 0);
 		i++;
+		if (ent.cgrp != cgrp)
+			nr_migrating_tasks++;
 	} while_each_thread(leader, tsk);
 	/* remember the number of threads in the array for later. */
 	group_size = i;
 	read_unlock(&tasklist_lock);
 
+	/* methods shouldn't be called if no task is actually migrating */
+	retval = 0;
+	if (!nr_migrating_tasks)
+		goto out_put_tasks;
+
 	/*
 	 * step 1: check that we can legitimately attach to the cgroup.
 	 */
@@ -2096,8 +2111,10 @@ int cgroup_attach_proc(struct cgroup *cgrp, struct task_struct *leader)
 		if (ss->can_attach_task) {
 			/* run on each task in the threadgroup. */
 			for (i = 0; i < group_size; i++) {
-				tsk = flex_array_get_ptr(group, i);
-				retval = ss->can_attach_task(cgrp, tsk);
+				tc = flex_array_get(group, i);
+				if (tc->cgrp == cgrp)
+					continue;
+				retval = ss->can_attach_task(cgrp, tc->task);
 				if (retval) {
 					failed_ss = ss;
 					cancel_failed_ss = true;
@@ -2113,18 +2130,17 @@ int cgroup_attach_proc(struct cgroup *cgrp, struct task_struct *leader)
 	 */
 	INIT_LIST_HEAD(&newcg_list);
 	for (i = 0; i < group_size; i++) {
-		tsk = flex_array_get_ptr(group, i);
+		tc = flex_array_get(group, i);
 		/* nothing to do if this task is already in the cgroup */
-		oldcgrp = task_cgroup_from_root(tsk, root);
-		if (cgrp == oldcgrp)
+		if (tc->cgrp == cgrp)
 			continue;
 		/* get old css_set pointer */
-		task_lock(tsk);
-		oldcg = tsk->cgroups;
+		task_lock(tc->task);
+		oldcg = tc->task->cgroups;
 		get_css_set(oldcg);
-		task_unlock(tsk);
+		task_unlock(tc->task);
 		/* see if the new one for us is already in the list? */
-		if (css_set_check_fetched(cgrp, tsk, oldcg, &newcg_list)) {
+		if (css_set_check_fetched(cgrp, tc->task, oldcg, &newcg_list)) {
 			/* was already there, nothing to do. */
 			put_css_set(oldcg);
 		} else {
@@ -2147,17 +2163,16 @@ int cgroup_attach_proc(struct cgroup *cgrp, struct task_struct *leader)
 			ss->pre_attach(cgrp);
 	}
 	for (i = 0; i < group_size; i++) {
-		tsk = flex_array_get_ptr(group, i);
+		tc = flex_array_get(group, i);
 		/* leave current thread as it is if it's already there */
-		oldcgrp = task_cgroup_from_root(tsk, root);
-		if (cgrp == oldcgrp)
+		if (tc->cgrp == cgrp)
 			continue;
-		retval = cgroup_task_migrate(cgrp, oldcgrp, tsk, true);
+		retval = cgroup_task_migrate(cgrp, tc->cgrp, tc->task, true);
 		BUG_ON(retval);
 		/* attach each task to each subsystem */
 		for_each_subsys(root, ss) {
 			if (ss->attach_task)
-				ss->attach_task(cgrp, tsk);
+				ss->attach_task(cgrp, tc->task);
 		}
 	}
 	/* nothing is sensitive to fork() after this point. */
@@ -2168,8 +2183,10 @@ int cgroup_attach_proc(struct cgroup *cgrp, struct task_struct *leader)
 	 * being moved, this call will need to be reworked to communicate that.
 	 */
 	for_each_subsys(root, ss) {
-		if (ss->attach)
-			ss->attach(ss, cgrp, oldcgrp, leader);
+		if (ss->attach) {
+			tc = flex_array_get(group, 0);
+			ss->attach(ss, cgrp, tc->cgrp, tc->task);
+		}
 	}
 
 	/*
@@ -2198,10 +2215,11 @@ out_cancel_attach:
 				ss->cancel_attach(ss, cgrp, leader);
 		}
 	}
+out_put_tasks:
 	/* clean up the array of referenced threads in the group. */
 	for (i = 0; i < group_size; i++) {
-		tsk = flex_array_get_ptr(group, i);
-		put_task_struct(tsk);
+		tc = flex_array_get(group, i);
+		put_task_struct(tc->task);
 	}
 out_free_group_list:
 	flex_array_free(group);
-- 
cgit v1.2.3-71-gd317


From 2f7ee5691eecb67c8108b92001a85563ea336ac5 Mon Sep 17 00:00:00 2001
From: Tejun Heo <tj@kernel.org>
Date: Mon, 12 Dec 2011 18:12:21 -0800
Subject: cgroup: introduce cgroup_taskset and use it in subsys->can_attach(),
 cancel_attach() and attach()

Currently, there's no way to pass multiple tasks to cgroup_subsys
methods necessitating the need for separate per-process and per-task
methods.  This patch introduces cgroup_taskset which can be used to
pass multiple tasks and their associated cgroups to cgroup_subsys
methods.

Three methods - can_attach(), cancel_attach() and attach() - are
converted to use cgroup_taskset.  This unifies passed parameters so
that all methods have access to all information.  Conversions in this
patchset are identical and don't introduce any behavior change.

-v2: documentation updated as per Paul Menage's suggestion.

Signed-off-by: Tejun Heo <tj@kernel.org>
Reviewed-by: KAMEZAWA Hiroyuki <kamezawa.hiroyu@jp.fujitsu.com>
Reviewed-by: Frederic Weisbecker <fweisbec@gmail.com>
Acked-by: Paul Menage <paul@paulmenage.org>
Acked-by: Li Zefan <lizf@cn.fujitsu.com>
Cc: Balbir Singh <bsingharora@gmail.com>
Cc: Daisuke Nishimura <nishimura@mxp.nes.nec.co.jp>
Cc: KAMEZAWA Hiroyuki <kamezawa.hiroyu@jp.fujitsu.com>
Cc: James Morris <jmorris@namei.org>
---
 Documentation/cgroups/cgroups.txt | 31 ++++++++----
 include/linux/cgroup.h            | 28 +++++++++--
 kernel/cgroup.c                   | 99 +++++++++++++++++++++++++++++++++++----
 kernel/cgroup_freezer.c           |  2 +-
 kernel/cpuset.c                   | 18 +++----
 mm/memcontrol.c                   | 16 +++----
 security/device_cgroup.c          |  7 +--
 7 files changed, 158 insertions(+), 43 deletions(-)

(limited to 'kernel')

diff --git a/Documentation/cgroups/cgroups.txt b/Documentation/cgroups/cgroups.txt
index 9c452ef2328c..8a2f302327fa 100644
--- a/Documentation/cgroups/cgroups.txt
+++ b/Documentation/cgroups/cgroups.txt
@@ -594,15 +594,25 @@ rmdir() will fail with it. From this behavior, pre_destroy() can be
 called multiple times against a cgroup.
 
 int can_attach(struct cgroup_subsys *ss, struct cgroup *cgrp,
-	       struct task_struct *task)
+	       struct cgroup_taskset *tset)
 (cgroup_mutex held by caller)
 
-Called prior to moving a task into a cgroup; if the subsystem
-returns an error, this will abort the attach operation.  If a NULL
-task is passed, then a successful result indicates that *any*
-unspecified task can be moved into the cgroup. Note that this isn't
-called on a fork. If this method returns 0 (success) then this should
-remain valid while the caller holds cgroup_mutex and it is ensured that either
+Called prior to moving one or more tasks into a cgroup; if the
+subsystem returns an error, this will abort the attach operation.
+@tset contains the tasks to be attached and is guaranteed to have at
+least one task in it.
+
+If there are multiple tasks in the taskset, then:
+  - it's guaranteed that all are from the same thread group
+  - @tset contains all tasks from the thread group whether or not
+    they're switching cgroups
+  - the first task is the leader
+
+Each @tset entry also contains the task's old cgroup and tasks which
+aren't switching cgroup can be skipped easily using the
+cgroup_taskset_for_each() iterator. Note that this isn't called on a
+fork. If this method returns 0 (success) then this should remain valid
+while the caller holds cgroup_mutex and it is ensured that either
 attach() or cancel_attach() will be called in future.
 
 int can_attach_task(struct cgroup *cgrp, struct task_struct *tsk);
@@ -613,14 +623,14 @@ attached (possibly many when using cgroup_attach_proc). Called after
 can_attach.
 
 void cancel_attach(struct cgroup_subsys *ss, struct cgroup *cgrp,
-	       struct task_struct *task, bool threadgroup)
+		   struct cgroup_taskset *tset)
 (cgroup_mutex held by caller)
 
 Called when a task attach operation has failed after can_attach() has succeeded.
 A subsystem whose can_attach() has some side-effects should provide this
 function, so that the subsystem can implement a rollback. If not, not necessary.
 This will be called only about subsystems whose can_attach() operation have
-succeeded.
+succeeded. The parameters are identical to can_attach().
 
 void pre_attach(struct cgroup *cgrp);
 (cgroup_mutex held by caller)
@@ -629,11 +639,12 @@ For any non-per-thread attachment work that needs to happen before
 attach_task. Needed by cpuset.
 
 void attach(struct cgroup_subsys *ss, struct cgroup *cgrp,
-	    struct cgroup *old_cgrp, struct task_struct *task)
+	    struct cgroup_taskset *tset)
 (cgroup_mutex held by caller)
 
 Called after the task has been attached to the cgroup, to allow any
 post-attachment activity that requires memory allocations or blocking.
+The parameters are identical to can_attach().
 
 void attach_task(struct cgroup *cgrp, struct task_struct *tsk);
 (cgroup_mutex held by caller)
diff --git a/include/linux/cgroup.h b/include/linux/cgroup.h
index 1b7f9d525013..34256ad9e553 100644
--- a/include/linux/cgroup.h
+++ b/include/linux/cgroup.h
@@ -456,6 +456,28 @@ int cgroup_is_descendant(const struct cgroup *cgrp, struct task_struct *task);
 void cgroup_exclude_rmdir(struct cgroup_subsys_state *css);
 void cgroup_release_and_wakeup_rmdir(struct cgroup_subsys_state *css);
 
+/*
+ * Control Group taskset, used to pass around set of tasks to cgroup_subsys
+ * methods.
+ */
+struct cgroup_taskset;
+struct task_struct *cgroup_taskset_first(struct cgroup_taskset *tset);
+struct task_struct *cgroup_taskset_next(struct cgroup_taskset *tset);
+struct cgroup *cgroup_taskset_cur_cgroup(struct cgroup_taskset *tset);
+int cgroup_taskset_size(struct cgroup_taskset *tset);
+
+/**
+ * cgroup_taskset_for_each - iterate cgroup_taskset
+ * @task: the loop cursor
+ * @skip_cgrp: skip if task's cgroup matches this, %NULL to iterate through all
+ * @tset: taskset to iterate
+ */
+#define cgroup_taskset_for_each(task, skip_cgrp, tset)			\
+	for ((task) = cgroup_taskset_first((tset)); (task);		\
+	     (task) = cgroup_taskset_next((tset)))			\
+		if (!(skip_cgrp) ||					\
+		    cgroup_taskset_cur_cgroup((tset)) != (skip_cgrp))
+
 /*
  * Control Group subsystem type.
  * See Documentation/cgroups/cgroups.txt for details
@@ -467,14 +489,14 @@ struct cgroup_subsys {
 	int (*pre_destroy)(struct cgroup_subsys *ss, struct cgroup *cgrp);
 	void (*destroy)(struct cgroup_subsys *ss, struct cgroup *cgrp);
 	int (*can_attach)(struct cgroup_subsys *ss, struct cgroup *cgrp,
-			  struct task_struct *tsk);
+			  struct cgroup_taskset *tset);
 	int (*can_attach_task)(struct cgroup *cgrp, struct task_struct *tsk);
 	void (*cancel_attach)(struct cgroup_subsys *ss, struct cgroup *cgrp,
-			      struct task_struct *tsk);
+			      struct cgroup_taskset *tset);
 	void (*pre_attach)(struct cgroup *cgrp);
 	void (*attach_task)(struct cgroup *cgrp, struct task_struct *tsk);
 	void (*attach)(struct cgroup_subsys *ss, struct cgroup *cgrp,
-		       struct cgroup *old_cgrp, struct task_struct *tsk);
+		       struct cgroup_taskset *tset);
 	void (*fork)(struct cgroup_subsys *ss, struct task_struct *task);
 	void (*exit)(struct cgroup_subsys *ss, struct cgroup *cgrp,
 			struct cgroup *old_cgrp, struct task_struct *task);
diff --git a/kernel/cgroup.c b/kernel/cgroup.c
index 0f2d00519d37..41ee01e392e6 100644
--- a/kernel/cgroup.c
+++ b/kernel/cgroup.c
@@ -1757,11 +1757,85 @@ int cgroup_path(const struct cgroup *cgrp, char *buf, int buflen)
 }
 EXPORT_SYMBOL_GPL(cgroup_path);
 
+/*
+ * Control Group taskset
+ */
 struct task_and_cgroup {
 	struct task_struct	*task;
 	struct cgroup		*cgrp;
 };
 
+struct cgroup_taskset {
+	struct task_and_cgroup	single;
+	struct flex_array	*tc_array;
+	int			tc_array_len;
+	int			idx;
+	struct cgroup		*cur_cgrp;
+};
+
+/**
+ * cgroup_taskset_first - reset taskset and return the first task
+ * @tset: taskset of interest
+ *
+ * @tset iteration is initialized and the first task is returned.
+ */
+struct task_struct *cgroup_taskset_first(struct cgroup_taskset *tset)
+{
+	if (tset->tc_array) {
+		tset->idx = 0;
+		return cgroup_taskset_next(tset);
+	} else {
+		tset->cur_cgrp = tset->single.cgrp;
+		return tset->single.task;
+	}
+}
+EXPORT_SYMBOL_GPL(cgroup_taskset_first);
+
+/**
+ * cgroup_taskset_next - iterate to the next task in taskset
+ * @tset: taskset of interest
+ *
+ * Return the next task in @tset.  Iteration must have been initialized
+ * with cgroup_taskset_first().
+ */
+struct task_struct *cgroup_taskset_next(struct cgroup_taskset *tset)
+{
+	struct task_and_cgroup *tc;
+
+	if (!tset->tc_array || tset->idx >= tset->tc_array_len)
+		return NULL;
+
+	tc = flex_array_get(tset->tc_array, tset->idx++);
+	tset->cur_cgrp = tc->cgrp;
+	return tc->task;
+}
+EXPORT_SYMBOL_GPL(cgroup_taskset_next);
+
+/**
+ * cgroup_taskset_cur_cgroup - return the matching cgroup for the current task
+ * @tset: taskset of interest
+ *
+ * Return the cgroup for the current (last returned) task of @tset.  This
+ * function must be preceded by either cgroup_taskset_first() or
+ * cgroup_taskset_next().
+ */
+struct cgroup *cgroup_taskset_cur_cgroup(struct cgroup_taskset *tset)
+{
+	return tset->cur_cgrp;
+}
+EXPORT_SYMBOL_GPL(cgroup_taskset_cur_cgroup);
+
+/**
+ * cgroup_taskset_size - return the number of tasks in taskset
+ * @tset: taskset of interest
+ */
+int cgroup_taskset_size(struct cgroup_taskset *tset)
+{
+	return tset->tc_array ? tset->tc_array_len : 1;
+}
+EXPORT_SYMBOL_GPL(cgroup_taskset_size);
+
+
 /*
  * cgroup_task_migrate - move a task from one cgroup to another.
  *
@@ -1842,6 +1916,7 @@ int cgroup_attach_task(struct cgroup *cgrp, struct task_struct *tsk)
 	struct cgroup_subsys *ss, *failed_ss = NULL;
 	struct cgroup *oldcgrp;
 	struct cgroupfs_root *root = cgrp->root;
+	struct cgroup_taskset tset = { };
 
 	/* @tsk either already exited or can't exit until the end */
 	if (tsk->flags & PF_EXITING)
@@ -1852,9 +1927,12 @@ int cgroup_attach_task(struct cgroup *cgrp, struct task_struct *tsk)
 	if (cgrp == oldcgrp)
 		return 0;
 
+	tset.single.task = tsk;
+	tset.single.cgrp = oldcgrp;
+
 	for_each_subsys(root, ss) {
 		if (ss->can_attach) {
-			retval = ss->can_attach(ss, cgrp, tsk);
+			retval = ss->can_attach(ss, cgrp, &tset);
 			if (retval) {
 				/*
 				 * Remember on which subsystem the can_attach()
@@ -1885,7 +1963,7 @@ int cgroup_attach_task(struct cgroup *cgrp, struct task_struct *tsk)
 		if (ss->attach_task)
 			ss->attach_task(cgrp, tsk);
 		if (ss->attach)
-			ss->attach(ss, cgrp, oldcgrp, tsk);
+			ss->attach(ss, cgrp, &tset);
 	}
 
 	synchronize_rcu();
@@ -1907,7 +1985,7 @@ out:
 				 */
 				break;
 			if (ss->cancel_attach)
-				ss->cancel_attach(ss, cgrp, tsk);
+				ss->cancel_attach(ss, cgrp, &tset);
 		}
 	}
 	return retval;
@@ -2023,6 +2101,7 @@ int cgroup_attach_proc(struct cgroup *cgrp, struct task_struct *leader)
 	struct task_struct *tsk;
 	struct task_and_cgroup *tc;
 	struct flex_array *group;
+	struct cgroup_taskset tset = { };
 	/*
 	 * we need to make sure we have css_sets for all the tasks we're
 	 * going to move -before- we actually start moving them, so that in
@@ -2089,6 +2168,8 @@ int cgroup_attach_proc(struct cgroup *cgrp, struct task_struct *leader)
 	} while_each_thread(leader, tsk);
 	/* remember the number of threads in the array for later. */
 	group_size = i;
+	tset.tc_array = group;
+	tset.tc_array_len = group_size;
 	read_unlock(&tasklist_lock);
 
 	/* methods shouldn't be called if no task is actually migrating */
@@ -2101,7 +2182,7 @@ int cgroup_attach_proc(struct cgroup *cgrp, struct task_struct *leader)
 	 */
 	for_each_subsys(root, ss) {
 		if (ss->can_attach) {
-			retval = ss->can_attach(ss, cgrp, leader);
+			retval = ss->can_attach(ss, cgrp, &tset);
 			if (retval) {
 				failed_ss = ss;
 				goto out_cancel_attach;
@@ -2183,10 +2264,8 @@ int cgroup_attach_proc(struct cgroup *cgrp, struct task_struct *leader)
 	 * being moved, this call will need to be reworked to communicate that.
 	 */
 	for_each_subsys(root, ss) {
-		if (ss->attach) {
-			tc = flex_array_get(group, 0);
-			ss->attach(ss, cgrp, tc->cgrp, tc->task);
-		}
+		if (ss->attach)
+			ss->attach(ss, cgrp, &tset);
 	}
 
 	/*
@@ -2208,11 +2287,11 @@ out_cancel_attach:
 		for_each_subsys(root, ss) {
 			if (ss == failed_ss) {
 				if (cancel_failed_ss && ss->cancel_attach)
-					ss->cancel_attach(ss, cgrp, leader);
+					ss->cancel_attach(ss, cgrp, &tset);
 				break;
 			}
 			if (ss->cancel_attach)
-				ss->cancel_attach(ss, cgrp, leader);
+				ss->cancel_attach(ss, cgrp, &tset);
 		}
 	}
 out_put_tasks:
diff --git a/kernel/cgroup_freezer.c b/kernel/cgroup_freezer.c
index e411a60cc2c8..e95c6fb65cc0 100644
--- a/kernel/cgroup_freezer.c
+++ b/kernel/cgroup_freezer.c
@@ -159,7 +159,7 @@ static void freezer_destroy(struct cgroup_subsys *ss,
  */
 static int freezer_can_attach(struct cgroup_subsys *ss,
 			      struct cgroup *new_cgroup,
-			      struct task_struct *task)
+			      struct cgroup_taskset *tset)
 {
 	struct freezer *freezer;
 
diff --git a/kernel/cpuset.c b/kernel/cpuset.c
index 9fe58c46a426..512bd59e8627 100644
--- a/kernel/cpuset.c
+++ b/kernel/cpuset.c
@@ -1371,10 +1371,10 @@ static int fmeter_getrate(struct fmeter *fmp)
 }
 
 /* Called by cgroups to determine if a cpuset is usable; cgroup_mutex held */
-static int cpuset_can_attach(struct cgroup_subsys *ss, struct cgroup *cont,
-			     struct task_struct *tsk)
+static int cpuset_can_attach(struct cgroup_subsys *ss, struct cgroup *cgrp,
+			     struct cgroup_taskset *tset)
 {
-	struct cpuset *cs = cgroup_cs(cont);
+	struct cpuset *cs = cgroup_cs(cgrp);
 
 	if (cpumask_empty(cs->cpus_allowed) || nodes_empty(cs->mems_allowed))
 		return -ENOSPC;
@@ -1387,7 +1387,7 @@ static int cpuset_can_attach(struct cgroup_subsys *ss, struct cgroup *cont,
 	 * set_cpus_allowed_ptr() on all attached tasks before cpus_allowed may
 	 * be changed.
 	 */
-	if (tsk->flags & PF_THREAD_BOUND)
+	if (cgroup_taskset_first(tset)->flags & PF_THREAD_BOUND)
 		return -EINVAL;
 
 	return 0;
@@ -1437,12 +1437,14 @@ static void cpuset_attach_task(struct cgroup *cont, struct task_struct *tsk)
 	cpuset_update_task_spread_flag(cs, tsk);
 }
 
-static void cpuset_attach(struct cgroup_subsys *ss, struct cgroup *cont,
-			  struct cgroup *oldcont, struct task_struct *tsk)
+static void cpuset_attach(struct cgroup_subsys *ss, struct cgroup *cgrp,
+			  struct cgroup_taskset *tset)
 {
 	struct mm_struct *mm;
-	struct cpuset *cs = cgroup_cs(cont);
-	struct cpuset *oldcs = cgroup_cs(oldcont);
+	struct task_struct *tsk = cgroup_taskset_first(tset);
+	struct cgroup *oldcgrp = cgroup_taskset_cur_cgroup(tset);
+	struct cpuset *cs = cgroup_cs(cgrp);
+	struct cpuset *oldcs = cgroup_cs(oldcgrp);
 
 	/*
 	 * Change mm, possibly for multiple threads in a threadgroup. This is
diff --git a/mm/memcontrol.c b/mm/memcontrol.c
index 6aff93c98aca..81640e74a709 100644
--- a/mm/memcontrol.c
+++ b/mm/memcontrol.c
@@ -5298,8 +5298,9 @@ static void mem_cgroup_clear_mc(void)
 
 static int mem_cgroup_can_attach(struct cgroup_subsys *ss,
 				struct cgroup *cgroup,
-				struct task_struct *p)
+				struct cgroup_taskset *tset)
 {
+	struct task_struct *p = cgroup_taskset_first(tset);
 	int ret = 0;
 	struct mem_cgroup *memcg = mem_cgroup_from_cont(cgroup);
 
@@ -5337,7 +5338,7 @@ static int mem_cgroup_can_attach(struct cgroup_subsys *ss,
 
 static void mem_cgroup_cancel_attach(struct cgroup_subsys *ss,
 				struct cgroup *cgroup,
-				struct task_struct *p)
+				struct cgroup_taskset *tset)
 {
 	mem_cgroup_clear_mc();
 }
@@ -5454,9 +5455,9 @@ retry:
 
 static void mem_cgroup_move_task(struct cgroup_subsys *ss,
 				struct cgroup *cont,
-				struct cgroup *old_cont,
-				struct task_struct *p)
+				struct cgroup_taskset *tset)
 {
+	struct task_struct *p = cgroup_taskset_first(tset);
 	struct mm_struct *mm = get_task_mm(p);
 
 	if (mm) {
@@ -5471,19 +5472,18 @@ static void mem_cgroup_move_task(struct cgroup_subsys *ss,
 #else	/* !CONFIG_MMU */
 static int mem_cgroup_can_attach(struct cgroup_subsys *ss,
 				struct cgroup *cgroup,
-				struct task_struct *p)
+				struct cgroup_taskset *tset)
 {
 	return 0;
 }
 static void mem_cgroup_cancel_attach(struct cgroup_subsys *ss,
 				struct cgroup *cgroup,
-				struct task_struct *p)
+				struct cgroup_taskset *tset)
 {
 }
 static void mem_cgroup_move_task(struct cgroup_subsys *ss,
 				struct cgroup *cont,
-				struct cgroup *old_cont,
-				struct task_struct *p)
+				struct cgroup_taskset *tset)
 {
 }
 #endif
diff --git a/security/device_cgroup.c b/security/device_cgroup.c
index 4450fbeec411..8b5b5d8612c6 100644
--- a/security/device_cgroup.c
+++ b/security/device_cgroup.c
@@ -62,11 +62,12 @@ static inline struct dev_cgroup *task_devcgroup(struct task_struct *task)
 struct cgroup_subsys devices_subsys;
 
 static int devcgroup_can_attach(struct cgroup_subsys *ss,
-		struct cgroup *new_cgroup, struct task_struct *task)
+			struct cgroup *new_cgrp, struct cgroup_taskset *set)
 {
-	if (current != task && !capable(CAP_SYS_ADMIN))
-			return -EPERM;
+	struct task_struct *task = cgroup_taskset_first(set);
 
+	if (current != task && !capable(CAP_SYS_ADMIN))
+		return -EPERM;
 	return 0;
 }
 
-- 
cgit v1.2.3-71-gd317


From bb9d97b6dffa10cec5e1ce9adbce60f3c2b5eabc Mon Sep 17 00:00:00 2001
From: Tejun Heo <tj@kernel.org>
Date: Mon, 12 Dec 2011 18:12:21 -0800
Subject: cgroup: don't use subsys->can_attach_task() or ->attach_task()

Now that subsys->can_attach() and attach() take @tset instead of
@task, they can handle per-task operations.  Convert
->can_attach_task() and ->attach_task() users to use ->can_attach()
and attach() instead.  Most converions are straight-forward.
Noteworthy changes are,

* In cgroup_freezer, remove unnecessary NULL assignments to unused
  methods.  It's useless and very prone to get out of sync, which
  already happened.

* In cpuset, PF_THREAD_BOUND test is checked for each task.  This
  doesn't make any practical difference but is conceptually cleaner.

Signed-off-by: Tejun Heo <tj@kernel.org>
Reviewed-by: KAMEZAWA Hiroyuki <kamezawa.hiroyu@jp.fujitsu.com>
Reviewed-by: Frederic Weisbecker <fweisbec@gmail.com>
Acked-by: Li Zefan <lizf@cn.fujitsu.com>
Cc: Paul Menage <paul@paulmenage.org>
Cc: Balbir Singh <bsingharora@gmail.com>
Cc: Daisuke Nishimura <nishimura@mxp.nes.nec.co.jp>
Cc: James Morris <jmorris@namei.org>
Cc: Ingo Molnar <mingo@elte.hu>
Cc: Peter Zijlstra <peterz@infradead.org>
---
 block/blk-cgroup.c      | 45 +++++++++++++++++++------------
 kernel/cgroup_freezer.c | 14 +++-------
 kernel/cpuset.c         | 70 ++++++++++++++++++++++---------------------------
 kernel/events/core.c    | 13 +++++----
 kernel/sched.c          | 31 +++++++++++++---------
 5 files changed, 91 insertions(+), 82 deletions(-)

(limited to 'kernel')

diff --git a/block/blk-cgroup.c b/block/blk-cgroup.c
index 8f630cec906e..b8c143d68ee0 100644
--- a/block/blk-cgroup.c
+++ b/block/blk-cgroup.c
@@ -30,8 +30,10 @@ EXPORT_SYMBOL_GPL(blkio_root_cgroup);
 
 static struct cgroup_subsys_state *blkiocg_create(struct cgroup_subsys *,
 						  struct cgroup *);
-static int blkiocg_can_attach_task(struct cgroup *, struct task_struct *);
-static void blkiocg_attach_task(struct cgroup *, struct task_struct *);
+static int blkiocg_can_attach(struct cgroup_subsys *, struct cgroup *,
+			      struct cgroup_taskset *);
+static void blkiocg_attach(struct cgroup_subsys *, struct cgroup *,
+			   struct cgroup_taskset *);
 static void blkiocg_destroy(struct cgroup_subsys *, struct cgroup *);
 static int blkiocg_populate(struct cgroup_subsys *, struct cgroup *);
 
@@ -44,8 +46,8 @@ static int blkiocg_populate(struct cgroup_subsys *, struct cgroup *);
 struct cgroup_subsys blkio_subsys = {
 	.name = "blkio",
 	.create = blkiocg_create,
-	.can_attach_task = blkiocg_can_attach_task,
-	.attach_task = blkiocg_attach_task,
+	.can_attach = blkiocg_can_attach,
+	.attach = blkiocg_attach,
 	.destroy = blkiocg_destroy,
 	.populate = blkiocg_populate,
 #ifdef CONFIG_BLK_CGROUP
@@ -1626,30 +1628,39 @@ done:
  * of the main cic data structures.  For now we allow a task to change
  * its cgroup only if it's the only owner of its ioc.
  */
-static int blkiocg_can_attach_task(struct cgroup *cgrp, struct task_struct *tsk)
+static int blkiocg_can_attach(struct cgroup_subsys *ss, struct cgroup *cgrp,
+			      struct cgroup_taskset *tset)
 {
+	struct task_struct *task;
 	struct io_context *ioc;
 	int ret = 0;
 
 	/* task_lock() is needed to avoid races with exit_io_context() */
-	task_lock(tsk);
-	ioc = tsk->io_context;
-	if (ioc && atomic_read(&ioc->nr_tasks) > 1)
-		ret = -EINVAL;
-	task_unlock(tsk);
-
+	cgroup_taskset_for_each(task, cgrp, tset) {
+		task_lock(task);
+		ioc = task->io_context;
+		if (ioc && atomic_read(&ioc->nr_tasks) > 1)
+			ret = -EINVAL;
+		task_unlock(task);
+		if (ret)
+			break;
+	}
 	return ret;
 }
 
-static void blkiocg_attach_task(struct cgroup *cgrp, struct task_struct *tsk)
+static void blkiocg_attach(struct cgroup_subsys *ss, struct cgroup *cgrp,
+			   struct cgroup_taskset *tset)
 {
+	struct task_struct *task;
 	struct io_context *ioc;
 
-	task_lock(tsk);
-	ioc = tsk->io_context;
-	if (ioc)
-		ioc->cgroup_changed = 1;
-	task_unlock(tsk);
+	cgroup_taskset_for_each(task, cgrp, tset) {
+		task_lock(task);
+		ioc = task->io_context;
+		if (ioc)
+			ioc->cgroup_changed = 1;
+		task_unlock(task);
+	}
 }
 
 void blkio_policy_register(struct blkio_policy_type *blkiop)
diff --git a/kernel/cgroup_freezer.c b/kernel/cgroup_freezer.c
index e95c6fb65cc0..0e748059ba87 100644
--- a/kernel/cgroup_freezer.c
+++ b/kernel/cgroup_freezer.c
@@ -162,10 +162,14 @@ static int freezer_can_attach(struct cgroup_subsys *ss,
 			      struct cgroup_taskset *tset)
 {
 	struct freezer *freezer;
+	struct task_struct *task;
 
 	/*
 	 * Anything frozen can't move or be moved to/from.
 	 */
+	cgroup_taskset_for_each(task, new_cgroup, tset)
+		if (cgroup_freezing(task))
+			return -EBUSY;
 
 	freezer = cgroup_freezer(new_cgroup);
 	if (freezer->state != CGROUP_THAWED)
@@ -174,11 +178,6 @@ static int freezer_can_attach(struct cgroup_subsys *ss,
 	return 0;
 }
 
-static int freezer_can_attach_task(struct cgroup *cgrp, struct task_struct *tsk)
-{
-	return cgroup_freezing(tsk) ? -EBUSY : 0;
-}
-
 static void freezer_fork(struct cgroup_subsys *ss, struct task_struct *task)
 {
 	struct freezer *freezer;
@@ -374,10 +373,5 @@ struct cgroup_subsys freezer_subsys = {
 	.populate	= freezer_populate,
 	.subsys_id	= freezer_subsys_id,
 	.can_attach	= freezer_can_attach,
-	.can_attach_task = freezer_can_attach_task,
-	.pre_attach	= NULL,
-	.attach_task	= NULL,
-	.attach		= NULL,
 	.fork		= freezer_fork,
-	.exit		= NULL,
 };
diff --git a/kernel/cpuset.c b/kernel/cpuset.c
index 512bd59e8627..9a8a61301524 100644
--- a/kernel/cpuset.c
+++ b/kernel/cpuset.c
@@ -1375,33 +1375,34 @@ static int cpuset_can_attach(struct cgroup_subsys *ss, struct cgroup *cgrp,
 			     struct cgroup_taskset *tset)
 {
 	struct cpuset *cs = cgroup_cs(cgrp);
+	struct task_struct *task;
+	int ret;
 
 	if (cpumask_empty(cs->cpus_allowed) || nodes_empty(cs->mems_allowed))
 		return -ENOSPC;
 
-	/*
-	 * Kthreads bound to specific cpus cannot be moved to a new cpuset; we
-	 * cannot change their cpu affinity and isolating such threads by their
-	 * set of allowed nodes is unnecessary.  Thus, cpusets are not
-	 * applicable for such threads.  This prevents checking for success of
-	 * set_cpus_allowed_ptr() on all attached tasks before cpus_allowed may
-	 * be changed.
-	 */
-	if (cgroup_taskset_first(tset)->flags & PF_THREAD_BOUND)
-		return -EINVAL;
-
+	cgroup_taskset_for_each(task, cgrp, tset) {
+		/*
+		 * Kthreads bound to specific cpus cannot be moved to a new
+		 * cpuset; we cannot change their cpu affinity and
+		 * isolating such threads by their set of allowed nodes is
+		 * unnecessary.  Thus, cpusets are not applicable for such
+		 * threads.  This prevents checking for success of
+		 * set_cpus_allowed_ptr() on all attached tasks before
+		 * cpus_allowed may be changed.
+		 */
+		if (task->flags & PF_THREAD_BOUND)
+			return -EINVAL;
+		if ((ret = security_task_setscheduler(task)))
+			return ret;
+	}
 	return 0;
 }
 
-static int cpuset_can_attach_task(struct cgroup *cgrp, struct task_struct *task)
-{
-	return security_task_setscheduler(task);
-}
-
 /*
  * Protected by cgroup_lock. The nodemasks must be stored globally because
  * dynamically allocating them is not allowed in pre_attach, and they must
- * persist among pre_attach, attach_task, and attach.
+ * persist among pre_attach, and attach.
  */
 static cpumask_var_t cpus_attach;
 static nodemask_t cpuset_attach_nodemask_from;
@@ -1420,39 +1421,34 @@ static void cpuset_pre_attach(struct cgroup *cont)
 	guarantee_online_mems(cs, &cpuset_attach_nodemask_to);
 }
 
-/* Per-thread attachment work. */
-static void cpuset_attach_task(struct cgroup *cont, struct task_struct *tsk)
-{
-	int err;
-	struct cpuset *cs = cgroup_cs(cont);
-
-	/*
-	 * can_attach beforehand should guarantee that this doesn't fail.
-	 * TODO: have a better way to handle failure here
-	 */
-	err = set_cpus_allowed_ptr(tsk, cpus_attach);
-	WARN_ON_ONCE(err);
-
-	cpuset_change_task_nodemask(tsk, &cpuset_attach_nodemask_to);
-	cpuset_update_task_spread_flag(cs, tsk);
-}
-
 static void cpuset_attach(struct cgroup_subsys *ss, struct cgroup *cgrp,
 			  struct cgroup_taskset *tset)
 {
 	struct mm_struct *mm;
-	struct task_struct *tsk = cgroup_taskset_first(tset);
+	struct task_struct *task;
+	struct task_struct *leader = cgroup_taskset_first(tset);
 	struct cgroup *oldcgrp = cgroup_taskset_cur_cgroup(tset);
 	struct cpuset *cs = cgroup_cs(cgrp);
 	struct cpuset *oldcs = cgroup_cs(oldcgrp);
 
+	cgroup_taskset_for_each(task, cgrp, tset) {
+		/*
+		 * can_attach beforehand should guarantee that this doesn't
+		 * fail.  TODO: have a better way to handle failure here
+		 */
+		WARN_ON_ONCE(set_cpus_allowed_ptr(task, cpus_attach));
+
+		cpuset_change_task_nodemask(task, &cpuset_attach_nodemask_to);
+		cpuset_update_task_spread_flag(cs, task);
+	}
+
 	/*
 	 * Change mm, possibly for multiple threads in a threadgroup. This is
 	 * expensive and may sleep.
 	 */
 	cpuset_attach_nodemask_from = oldcs->mems_allowed;
 	cpuset_attach_nodemask_to = cs->mems_allowed;
-	mm = get_task_mm(tsk);
+	mm = get_task_mm(leader);
 	if (mm) {
 		mpol_rebind_mm(mm, &cpuset_attach_nodemask_to);
 		if (is_memory_migrate(cs))
@@ -1908,9 +1904,7 @@ struct cgroup_subsys cpuset_subsys = {
 	.create = cpuset_create,
 	.destroy = cpuset_destroy,
 	.can_attach = cpuset_can_attach,
-	.can_attach_task = cpuset_can_attach_task,
 	.pre_attach = cpuset_pre_attach,
-	.attach_task = cpuset_attach_task,
 	.attach = cpuset_attach,
 	.populate = cpuset_populate,
 	.post_clone = cpuset_post_clone,
diff --git a/kernel/events/core.c b/kernel/events/core.c
index 0e8457da6f95..3b8e0edbe693 100644
--- a/kernel/events/core.c
+++ b/kernel/events/core.c
@@ -7044,10 +7044,13 @@ static int __perf_cgroup_move(void *info)
 	return 0;
 }
 
-static void
-perf_cgroup_attach_task(struct cgroup *cgrp, struct task_struct *task)
+static void perf_cgroup_attach(struct cgroup_subsys *ss, struct cgroup *cgrp,
+			       struct cgroup_taskset *tset)
 {
-	task_function_call(task, __perf_cgroup_move, task);
+	struct task_struct *task;
+
+	cgroup_taskset_for_each(task, cgrp, tset)
+		task_function_call(task, __perf_cgroup_move, task);
 }
 
 static void perf_cgroup_exit(struct cgroup_subsys *ss, struct cgroup *cgrp,
@@ -7061,7 +7064,7 @@ static void perf_cgroup_exit(struct cgroup_subsys *ss, struct cgroup *cgrp,
 	if (!(task->flags & PF_EXITING))
 		return;
 
-	perf_cgroup_attach_task(cgrp, task);
+	task_function_call(task, __perf_cgroup_move, task);
 }
 
 struct cgroup_subsys perf_subsys = {
@@ -7070,6 +7073,6 @@ struct cgroup_subsys perf_subsys = {
 	.create		= perf_cgroup_create,
 	.destroy	= perf_cgroup_destroy,
 	.exit		= perf_cgroup_exit,
-	.attach_task	= perf_cgroup_attach_task,
+	.attach		= perf_cgroup_attach,
 };
 #endif /* CONFIG_CGROUP_PERF */
diff --git a/kernel/sched.c b/kernel/sched.c
index 0e9344a71be3..161184da7b81 100644
--- a/kernel/sched.c
+++ b/kernel/sched.c
@@ -9127,24 +9127,31 @@ cpu_cgroup_destroy(struct cgroup_subsys *ss, struct cgroup *cgrp)
 	sched_destroy_group(tg);
 }
 
-static int
-cpu_cgroup_can_attach_task(struct cgroup *cgrp, struct task_struct *tsk)
+static int cpu_cgroup_can_attach(struct cgroup_subsys *ss, struct cgroup *cgrp,
+				 struct cgroup_taskset *tset)
 {
+	struct task_struct *task;
+
+	cgroup_taskset_for_each(task, cgrp, tset) {
 #ifdef CONFIG_RT_GROUP_SCHED
-	if (!sched_rt_can_attach(cgroup_tg(cgrp), tsk))
-		return -EINVAL;
+		if (!sched_rt_can_attach(cgroup_tg(cgrp), task))
+			return -EINVAL;
 #else
-	/* We don't support RT-tasks being in separate groups */
-	if (tsk->sched_class != &fair_sched_class)
-		return -EINVAL;
+		/* We don't support RT-tasks being in separate groups */
+		if (task->sched_class != &fair_sched_class)
+			return -EINVAL;
 #endif
+	}
 	return 0;
 }
 
-static void
-cpu_cgroup_attach_task(struct cgroup *cgrp, struct task_struct *tsk)
+static void cpu_cgroup_attach(struct cgroup_subsys *ss, struct cgroup *cgrp,
+			      struct cgroup_taskset *tset)
 {
-	sched_move_task(tsk);
+	struct task_struct *task;
+
+	cgroup_taskset_for_each(task, cgrp, tset)
+		sched_move_task(task);
 }
 
 static void
@@ -9480,8 +9487,8 @@ struct cgroup_subsys cpu_cgroup_subsys = {
 	.name		= "cpu",
 	.create		= cpu_cgroup_create,
 	.destroy	= cpu_cgroup_destroy,
-	.can_attach_task = cpu_cgroup_can_attach_task,
-	.attach_task	= cpu_cgroup_attach_task,
+	.can_attach	= cpu_cgroup_can_attach,
+	.attach		= cpu_cgroup_attach,
 	.exit		= cpu_cgroup_exit,
 	.populate	= cpu_cgroup_populate,
 	.subsys_id	= cpu_cgroup_subsys_id,
-- 
cgit v1.2.3-71-gd317


From 94196f51c1ee5bbad674de28c682b17d78adb8e6 Mon Sep 17 00:00:00 2001
From: Tejun Heo <tj@kernel.org>
Date: Mon, 12 Dec 2011 18:12:22 -0800
Subject: cgroup, cpuset: don't use ss->pre_attach()

->pre_attach() is supposed to be called before migration, which is
observed during process migration but task migration does it the other
way around.  The only ->pre_attach() user is cpuset which can do the
same operaitons in ->can_attach().  Collapse cpuset_pre_attach() into
cpuset_can_attach().

-v2: Patch contamination from later patch removed.  Spotted by Paul
     Menage.

Signed-off-by: Tejun Heo <tj@kernel.org>
Reviewed-by: Frederic Weisbecker <fweisbec@gmail.com>
Acked-by: Paul Menage <paul@paulmenage.org>
Cc: Li Zefan <lizf@cn.fujitsu.com>
---
 kernel/cpuset.c | 29 ++++++++++++-----------------
 1 file changed, 12 insertions(+), 17 deletions(-)

(limited to 'kernel')

diff --git a/kernel/cpuset.c b/kernel/cpuset.c
index 9a8a61301524..42e568306382 100644
--- a/kernel/cpuset.c
+++ b/kernel/cpuset.c
@@ -1370,6 +1370,15 @@ static int fmeter_getrate(struct fmeter *fmp)
 	return val;
 }
 
+/*
+ * Protected by cgroup_lock. The nodemasks must be stored globally because
+ * dynamically allocating them is not allowed in can_attach, and they must
+ * persist until attach.
+ */
+static cpumask_var_t cpus_attach;
+static nodemask_t cpuset_attach_nodemask_from;
+static nodemask_t cpuset_attach_nodemask_to;
+
 /* Called by cgroups to determine if a cpuset is usable; cgroup_mutex held */
 static int cpuset_can_attach(struct cgroup_subsys *ss, struct cgroup *cgrp,
 			     struct cgroup_taskset *tset)
@@ -1396,29 +1405,16 @@ static int cpuset_can_attach(struct cgroup_subsys *ss, struct cgroup *cgrp,
 		if ((ret = security_task_setscheduler(task)))
 			return ret;
 	}
-	return 0;
-}
-
-/*
- * Protected by cgroup_lock. The nodemasks must be stored globally because
- * dynamically allocating them is not allowed in pre_attach, and they must
- * persist among pre_attach, and attach.
- */
-static cpumask_var_t cpus_attach;
-static nodemask_t cpuset_attach_nodemask_from;
-static nodemask_t cpuset_attach_nodemask_to;
-
-/* Set-up work for before attaching each task. */
-static void cpuset_pre_attach(struct cgroup *cont)
-{
-	struct cpuset *cs = cgroup_cs(cont);
 
+	/* prepare for attach */
 	if (cs == &top_cpuset)
 		cpumask_copy(cpus_attach, cpu_possible_mask);
 	else
 		guarantee_online_cpus(cs, cpus_attach);
 
 	guarantee_online_mems(cs, &cpuset_attach_nodemask_to);
+
+	return 0;
 }
 
 static void cpuset_attach(struct cgroup_subsys *ss, struct cgroup *cgrp,
@@ -1904,7 +1900,6 @@ struct cgroup_subsys cpuset_subsys = {
 	.create = cpuset_create,
 	.destroy = cpuset_destroy,
 	.can_attach = cpuset_can_attach,
-	.pre_attach = cpuset_pre_attach,
 	.attach = cpuset_attach,
 	.populate = cpuset_populate,
 	.post_clone = cpuset_post_clone,
-- 
cgit v1.2.3-71-gd317


From 494c167cf76d02000adf740c215adc69a824ecc9 Mon Sep 17 00:00:00 2001
From: Tejun Heo <tj@kernel.org>
Date: Mon, 12 Dec 2011 18:12:22 -0800
Subject: cgroup: kill subsys->can_attach_task(), pre_attach() and
 attach_task()

These three methods are no longer used.  Kill them.

Signed-off-by: Tejun Heo <tj@kernel.org>
Reviewed-by: KAMEZAWA Hiroyuki <kamezawa.hiroyu@jp.fujitsu.com>
Reviewed-by: Frederic Weisbecker <fweisbec@gmail.com>
Acked-by: Paul Menage <paul@paulmenage.org>
Cc: Li Zefan <lizf@cn.fujitsu.com>
---
 Documentation/cgroups/cgroups.txt | 20 ---------------
 include/linux/cgroup.h            |  3 ---
 kernel/cgroup.c                   | 52 ++++-----------------------------------
 3 files changed, 5 insertions(+), 70 deletions(-)

(limited to 'kernel')

diff --git a/Documentation/cgroups/cgroups.txt b/Documentation/cgroups/cgroups.txt
index 8a2f302327fa..a7c96ae5557c 100644
--- a/Documentation/cgroups/cgroups.txt
+++ b/Documentation/cgroups/cgroups.txt
@@ -615,13 +615,6 @@ fork. If this method returns 0 (success) then this should remain valid
 while the caller holds cgroup_mutex and it is ensured that either
 attach() or cancel_attach() will be called in future.
 
-int can_attach_task(struct cgroup *cgrp, struct task_struct *tsk);
-(cgroup_mutex held by caller)
-
-As can_attach, but for operations that must be run once per task to be
-attached (possibly many when using cgroup_attach_proc). Called after
-can_attach.
-
 void cancel_attach(struct cgroup_subsys *ss, struct cgroup *cgrp,
 		   struct cgroup_taskset *tset)
 (cgroup_mutex held by caller)
@@ -632,12 +625,6 @@ function, so that the subsystem can implement a rollback. If not, not necessary.
 This will be called only about subsystems whose can_attach() operation have
 succeeded. The parameters are identical to can_attach().
 
-void pre_attach(struct cgroup *cgrp);
-(cgroup_mutex held by caller)
-
-For any non-per-thread attachment work that needs to happen before
-attach_task. Needed by cpuset.
-
 void attach(struct cgroup_subsys *ss, struct cgroup *cgrp,
 	    struct cgroup_taskset *tset)
 (cgroup_mutex held by caller)
@@ -646,13 +633,6 @@ Called after the task has been attached to the cgroup, to allow any
 post-attachment activity that requires memory allocations or blocking.
 The parameters are identical to can_attach().
 
-void attach_task(struct cgroup *cgrp, struct task_struct *tsk);
-(cgroup_mutex held by caller)
-
-As attach, but for operations that must be run once per task to be attached,
-like can_attach_task. Called before attach. Currently does not support any
-subsystem that might need the old_cgrp for every thread in the group.
-
 void fork(struct cgroup_subsy *ss, struct task_struct *task)
 
 Called when a task is forked into a cgroup.
diff --git a/include/linux/cgroup.h b/include/linux/cgroup.h
index 34256ad9e553..7ad5e406c421 100644
--- a/include/linux/cgroup.h
+++ b/include/linux/cgroup.h
@@ -490,11 +490,8 @@ struct cgroup_subsys {
 	void (*destroy)(struct cgroup_subsys *ss, struct cgroup *cgrp);
 	int (*can_attach)(struct cgroup_subsys *ss, struct cgroup *cgrp,
 			  struct cgroup_taskset *tset);
-	int (*can_attach_task)(struct cgroup *cgrp, struct task_struct *tsk);
 	void (*cancel_attach)(struct cgroup_subsys *ss, struct cgroup *cgrp,
 			      struct cgroup_taskset *tset);
-	void (*pre_attach)(struct cgroup *cgrp);
-	void (*attach_task)(struct cgroup *cgrp, struct task_struct *tsk);
 	void (*attach)(struct cgroup_subsys *ss, struct cgroup *cgrp,
 		       struct cgroup_taskset *tset);
 	void (*fork)(struct cgroup_subsys *ss, struct task_struct *task);
diff --git a/kernel/cgroup.c b/kernel/cgroup.c
index 41ee01e392e6..1b3b84174ead 100644
--- a/kernel/cgroup.c
+++ b/kernel/cgroup.c
@@ -1944,13 +1944,6 @@ int cgroup_attach_task(struct cgroup *cgrp, struct task_struct *tsk)
 				goto out;
 			}
 		}
-		if (ss->can_attach_task) {
-			retval = ss->can_attach_task(cgrp, tsk);
-			if (retval) {
-				failed_ss = ss;
-				goto out;
-			}
-		}
 	}
 
 	retval = cgroup_task_migrate(cgrp, oldcgrp, tsk, false);
@@ -1958,10 +1951,6 @@ int cgroup_attach_task(struct cgroup *cgrp, struct task_struct *tsk)
 		goto out;
 
 	for_each_subsys(root, ss) {
-		if (ss->pre_attach)
-			ss->pre_attach(cgrp);
-		if (ss->attach_task)
-			ss->attach_task(cgrp, tsk);
 		if (ss->attach)
 			ss->attach(ss, cgrp, &tset);
 	}
@@ -2093,7 +2082,6 @@ int cgroup_attach_proc(struct cgroup *cgrp, struct task_struct *leader)
 {
 	int retval, i, group_size, nr_migrating_tasks;
 	struct cgroup_subsys *ss, *failed_ss = NULL;
-	bool cancel_failed_ss = false;
 	/* guaranteed to be initialized later, but the compiler needs this */
 	struct css_set *oldcg;
 	struct cgroupfs_root *root = cgrp->root;
@@ -2188,21 +2176,6 @@ int cgroup_attach_proc(struct cgroup *cgrp, struct task_struct *leader)
 				goto out_cancel_attach;
 			}
 		}
-		/* a callback to be run on every thread in the threadgroup. */
-		if (ss->can_attach_task) {
-			/* run on each task in the threadgroup. */
-			for (i = 0; i < group_size; i++) {
-				tc = flex_array_get(group, i);
-				if (tc->cgrp == cgrp)
-					continue;
-				retval = ss->can_attach_task(cgrp, tc->task);
-				if (retval) {
-					failed_ss = ss;
-					cancel_failed_ss = true;
-					goto out_cancel_attach;
-				}
-			}
-		}
 	}
 
 	/*
@@ -2234,15 +2207,10 @@ int cgroup_attach_proc(struct cgroup *cgrp, struct task_struct *leader)
 	}
 
 	/*
-	 * step 3: now that we're guaranteed success wrt the css_sets, proceed
-	 * to move all tasks to the new cgroup, calling ss->attach_task for each
-	 * one along the way. there are no failure cases after here, so this is
-	 * the commit point.
+	 * step 3: now that we're guaranteed success wrt the css_sets,
+	 * proceed to move all tasks to the new cgroup.  There are no
+	 * failure cases after here, so this is the commit point.
 	 */
-	for_each_subsys(root, ss) {
-		if (ss->pre_attach)
-			ss->pre_attach(cgrp);
-	}
 	for (i = 0; i < group_size; i++) {
 		tc = flex_array_get(group, i);
 		/* leave current thread as it is if it's already there */
@@ -2250,18 +2218,11 @@ int cgroup_attach_proc(struct cgroup *cgrp, struct task_struct *leader)
 			continue;
 		retval = cgroup_task_migrate(cgrp, tc->cgrp, tc->task, true);
 		BUG_ON(retval);
-		/* attach each task to each subsystem */
-		for_each_subsys(root, ss) {
-			if (ss->attach_task)
-				ss->attach_task(cgrp, tc->task);
-		}
 	}
 	/* nothing is sensitive to fork() after this point. */
 
 	/*
-	 * step 4: do expensive, non-thread-specific subsystem callbacks.
-	 * TODO: if ever a subsystem needs to know the oldcgrp for each task
-	 * being moved, this call will need to be reworked to communicate that.
+	 * step 4: do subsystem attach callbacks.
 	 */
 	for_each_subsys(root, ss) {
 		if (ss->attach)
@@ -2285,11 +2246,8 @@ out_cancel_attach:
 	/* same deal as in cgroup_attach_task */
 	if (retval) {
 		for_each_subsys(root, ss) {
-			if (ss == failed_ss) {
-				if (cancel_failed_ss && ss->cancel_attach)
-					ss->cancel_attach(ss, cgrp, &tset);
+			if (ss == failed_ss)
 				break;
-			}
 			if (ss->cancel_attach)
 				ss->cancel_attach(ss, cgrp, &tset);
 		}
-- 
cgit v1.2.3-71-gd317


From 52dcf8a1f8ac09b6ea21266ebdc4db6d52eea1fc Mon Sep 17 00:00:00 2001
From: Davidlohr Bueso <dave@gnu.org>
Date: Mon, 5 Dec 2011 22:13:41 +0100
Subject: resource cgroups: remove bogus cast

The memparse() function already accepts const char * as the parsing string.

Signed-off-by: Davidlohr Bueso <dave@gnu.org>
Acked-by: Pavel Emelyanov <xemul@parallels.com>
Signed-off-by: Tejun Heo <tj@kernel.org>
---
 kernel/res_counter.c | 3 +--
 1 file changed, 1 insertion(+), 2 deletions(-)

(limited to 'kernel')

diff --git a/kernel/res_counter.c b/kernel/res_counter.c
index 34683efa2cce..6d269cce7aa1 100644
--- a/kernel/res_counter.c
+++ b/kernel/res_counter.c
@@ -159,8 +159,7 @@ int res_counter_memparse_write_strategy(const char *buf,
 		return 0;
 	}
 
-	/* FIXME - make memparse() take const char* args */
-	*res = memparse((char *)buf, &end);
+	*res = memparse(buf, &end);
 	if (*end != '\0')
 		return -EINVAL;
 
-- 
cgit v1.2.3-71-gd317


From 6e736be7f282fff705db7c34a15313281b372a76 Mon Sep 17 00:00:00 2001
From: Tejun Heo <tj@kernel.org>
Date: Wed, 14 Dec 2011 00:33:38 +0100
Subject: block: make ioc get/put interface more conventional and fix race on
 alloction

Ignoring copy_io() during fork, io_context can be allocated from two
places - current_io_context() and set_task_ioprio().  The former is
always called from local task while the latter can be called from
different task.  The synchornization between them are peculiar and
dubious.

* current_io_context() doesn't grab task_lock() and assumes that if it
  saw %NULL ->io_context, it would stay that way until allocation and
  assignment is complete.  It has smp_wmb() between alloc/init and
  assignment.

* set_task_ioprio() grabs task_lock() for assignment and does
  smp_read_barrier_depends() between "ioc = task->io_context" and "if
  (ioc)".  Unfortunately, this doesn't achieve anything - the latter
  is not a dependent load of the former.  ie, if ioc itself were being
  dereferenced "ioc->xxx", it would mean something (not sure what tho)
  but as the code currently stands, the dependent read barrier is
  noop.

As only one of the the two test-assignment sequences is task_lock()
protected, the task_lock() can't do much about race between the two.
Nothing prevents current_io_context() and set_task_ioprio() allocating
its own ioc for the same task and overwriting the other's.

Also, set_task_ioprio() can race with exiting task and create a new
ioc after exit_io_context() is finished.

ioc get/put doesn't have any reason to be complex.  The only hot path
is accessing the existing ioc of %current, which is simple to achieve
given that ->io_context is never destroyed as long as the task is
alive.  All other paths can happily go through task_lock() like all
other task sub structures without impacting anything.

This patch updates ioc get/put so that it becomes more conventional.

* alloc_io_context() is replaced with get_task_io_context().  This is
  the only interface which can acquire access to ioc of another task.
  On return, the caller has an explicit reference to the object which
  should be put using put_io_context() afterwards.

* The functionality of current_io_context() remains the same but when
  creating a new ioc, it shares the code path with
  get_task_io_context() and always goes through task_lock().

* get_io_context() now means incrementing ref on an ioc which the
  caller already has access to (be that an explicit refcnt or implicit
  %current one).

* PF_EXITING inhibits creation of new io_context and once
  exit_io_context() is finished, it's guaranteed that both ioc
  acquisition functions return %NULL.

* All users are updated.  Most are trivial but
  smp_read_barrier_depends() removal from cfq_get_io_context() needs a
  bit of explanation.  I suppose the original intention was to ensure
  ioc->ioprio is visible when set_task_ioprio() allocates new
  io_context and installs it; however, this wouldn't have worked
  because set_task_ioprio() doesn't have wmb between init and install.
  There are other problems with this which will be fixed in another
  patch.

* While at it, use NUMA_NO_NODE instead of -1 for wildcard node
  specification.

-v2: Vivek spotted contamination from debug patch.  Removed.

Signed-off-by: Tejun Heo <tj@kernel.org>
Cc: Vivek Goyal <vgoyal@redhat.com>
Signed-off-by: Jens Axboe <axboe@kernel.dk>
---
 block/blk-cgroup.c        |  9 +++--
 block/blk-ioc.c           | 99 +++++++++++++++++++++++++++++++----------------
 block/blk.h               |  1 +
 block/cfq-iosched.c       | 18 ++++-----
 fs/ioprio.c               | 21 ++--------
 include/linux/iocontext.h |  4 +-
 kernel/fork.c             |  8 ++--
 7 files changed, 91 insertions(+), 69 deletions(-)

(limited to 'kernel')

diff --git a/block/blk-cgroup.c b/block/blk-cgroup.c
index 8f630cec906e..4b001dcd85b0 100644
--- a/block/blk-cgroup.c
+++ b/block/blk-cgroup.c
@@ -1645,11 +1645,12 @@ static void blkiocg_attach_task(struct cgroup *cgrp, struct task_struct *tsk)
 {
 	struct io_context *ioc;
 
-	task_lock(tsk);
-	ioc = tsk->io_context;
-	if (ioc)
+	/* we don't lose anything even if ioc allocation fails */
+	ioc = get_task_io_context(tsk, GFP_ATOMIC, NUMA_NO_NODE);
+	if (ioc) {
 		ioc->cgroup_changed = 1;
-	task_unlock(tsk);
+		put_io_context(ioc);
+	}
 }
 
 void blkio_policy_register(struct blkio_policy_type *blkiop)
diff --git a/block/blk-ioc.c b/block/blk-ioc.c
index 8bebf06bac76..b13ed96776c2 100644
--- a/block/blk-ioc.c
+++ b/block/blk-ioc.c
@@ -16,6 +16,19 @@
  */
 static struct kmem_cache *iocontext_cachep;
 
+/**
+ * get_io_context - increment reference count to io_context
+ * @ioc: io_context to get
+ *
+ * Increment reference count to @ioc.
+ */
+void get_io_context(struct io_context *ioc)
+{
+	BUG_ON(atomic_long_read(&ioc->refcount) <= 0);
+	atomic_long_inc(&ioc->refcount);
+}
+EXPORT_SYMBOL(get_io_context);
+
 static void cfq_dtor(struct io_context *ioc)
 {
 	if (!hlist_empty(&ioc->cic_list)) {
@@ -71,6 +84,9 @@ void exit_io_context(struct task_struct *task)
 {
 	struct io_context *ioc;
 
+	/* PF_EXITING prevents new io_context from being attached to @task */
+	WARN_ON_ONCE(!(current->flags & PF_EXITING));
+
 	task_lock(task);
 	ioc = task->io_context;
 	task->io_context = NULL;
@@ -82,7 +98,9 @@ void exit_io_context(struct task_struct *task)
 	put_io_context(ioc);
 }
 
-struct io_context *alloc_io_context(gfp_t gfp_flags, int node)
+static struct io_context *create_task_io_context(struct task_struct *task,
+						 gfp_t gfp_flags, int node,
+						 bool take_ref)
 {
 	struct io_context *ioc;
 
@@ -98,6 +116,20 @@ struct io_context *alloc_io_context(gfp_t gfp_flags, int node)
 	INIT_RADIX_TREE(&ioc->radix_root, GFP_ATOMIC | __GFP_HIGH);
 	INIT_HLIST_HEAD(&ioc->cic_list);
 
+	/* try to install, somebody might already have beaten us to it */
+	task_lock(task);
+
+	if (!task->io_context && !(task->flags & PF_EXITING)) {
+		task->io_context = ioc;
+	} else {
+		kmem_cache_free(iocontext_cachep, ioc);
+		ioc = task->io_context;
+	}
+
+	if (ioc && take_ref)
+		get_io_context(ioc);
+
+	task_unlock(task);
 	return ioc;
 }
 
@@ -114,46 +146,47 @@ struct io_context *alloc_io_context(gfp_t gfp_flags, int node)
  */
 struct io_context *current_io_context(gfp_t gfp_flags, int node)
 {
-	struct task_struct *tsk = current;
-	struct io_context *ret;
-
-	ret = tsk->io_context;
-	if (likely(ret))
-		return ret;
-
-	ret = alloc_io_context(gfp_flags, node);
-	if (ret) {
-		/* make sure set_task_ioprio() sees the settings above */
-		smp_wmb();
-		tsk->io_context = ret;
-	}
+	might_sleep_if(gfp_flags & __GFP_WAIT);
 
-	return ret;
+	if (current->io_context)
+		return current->io_context;
+
+	return create_task_io_context(current, gfp_flags, node, false);
 }
+EXPORT_SYMBOL(current_io_context);
 
-/*
- * If the current task has no IO context then create one and initialise it.
- * If it does have a context, take a ref on it.
+/**
+ * get_task_io_context - get io_context of a task
+ * @task: task of interest
+ * @gfp_flags: allocation flags, used if allocation is necessary
+ * @node: allocation node, used if allocation is necessary
+ *
+ * Return io_context of @task.  If it doesn't exist, it is created with
+ * @gfp_flags and @node.  The returned io_context has its reference count
+ * incremented.
  *
- * This is always called in the context of the task which submitted the I/O.
+ * This function always goes through task_lock() and it's better to use
+ * current_io_context() + get_io_context() for %current.
  */
-struct io_context *get_io_context(gfp_t gfp_flags, int node)
+struct io_context *get_task_io_context(struct task_struct *task,
+				       gfp_t gfp_flags, int node)
 {
-	struct io_context *ioc = NULL;
-
-	/*
-	 * Check for unlikely race with exiting task. ioc ref count is
-	 * zero when ioc is being detached.
-	 */
-	do {
-		ioc = current_io_context(gfp_flags, node);
-		if (unlikely(!ioc))
-			break;
-	} while (!atomic_long_inc_not_zero(&ioc->refcount));
+	struct io_context *ioc;
 
-	return ioc;
+	might_sleep_if(gfp_flags & __GFP_WAIT);
+
+	task_lock(task);
+	ioc = task->io_context;
+	if (likely(ioc)) {
+		get_io_context(ioc);
+		task_unlock(task);
+		return ioc;
+	}
+	task_unlock(task);
+
+	return create_task_io_context(task, gfp_flags, node, true);
 }
-EXPORT_SYMBOL(get_io_context);
+EXPORT_SYMBOL(get_task_io_context);
 
 static int __init blk_ioc_init(void)
 {
diff --git a/block/blk.h b/block/blk.h
index aae4d88fc523..fc3c41b2fd24 100644
--- a/block/blk.h
+++ b/block/blk.h
@@ -122,6 +122,7 @@ static inline int blk_should_fake_timeout(struct request_queue *q)
 }
 #endif
 
+void get_io_context(struct io_context *ioc);
 struct io_context *current_io_context(gfp_t gfp_flags, int node);
 
 int ll_back_merge_fn(struct request_queue *q, struct request *req,
diff --git a/block/cfq-iosched.c b/block/cfq-iosched.c
index ec3f5e8ba564..d42d89ccce1b 100644
--- a/block/cfq-iosched.c
+++ b/block/cfq-iosched.c
@@ -14,6 +14,7 @@
 #include <linux/rbtree.h>
 #include <linux/ioprio.h>
 #include <linux/blktrace_api.h>
+#include "blk.h"
 #include "cfq.h"
 
 /*
@@ -3194,13 +3195,13 @@ static struct cfq_io_context *
 cfq_get_io_context(struct cfq_data *cfqd, gfp_t gfp_mask)
 {
 	struct io_context *ioc = NULL;
-	struct cfq_io_context *cic;
+	struct cfq_io_context *cic = NULL;
 
 	might_sleep_if(gfp_mask & __GFP_WAIT);
 
-	ioc = get_io_context(gfp_mask, cfqd->queue->node);
+	ioc = current_io_context(gfp_mask, cfqd->queue->node);
 	if (!ioc)
-		return NULL;
+		goto err;
 
 	cic = cfq_cic_lookup(cfqd, ioc);
 	if (cic)
@@ -3211,10 +3212,10 @@ cfq_get_io_context(struct cfq_data *cfqd, gfp_t gfp_mask)
 		goto err;
 
 	if (cfq_cic_link(cfqd, ioc, cic, gfp_mask))
-		goto err_free;
-
+		goto err;
 out:
-	smp_read_barrier_depends();
+	get_io_context(ioc);
+
 	if (unlikely(ioc->ioprio_changed))
 		cfq_ioc_set_ioprio(ioc);
 
@@ -3223,10 +3224,9 @@ out:
 		cfq_ioc_set_cgroup(ioc);
 #endif
 	return cic;
-err_free:
-	cfq_cic_free(cic);
 err:
-	put_io_context(ioc);
+	if (cic)
+		cfq_cic_free(cic);
 	return NULL;
 }
 
diff --git a/fs/ioprio.c b/fs/ioprio.c
index f79dab83e17b..998ec239d1ea 100644
--- a/fs/ioprio.c
+++ b/fs/ioprio.c
@@ -48,28 +48,13 @@ int set_task_ioprio(struct task_struct *task, int ioprio)
 	if (err)
 		return err;
 
-	task_lock(task);
-	do {
-		ioc = task->io_context;
-		/* see wmb() in current_io_context() */
-		smp_read_barrier_depends();
-		if (ioc)
-			break;
-
-		ioc = alloc_io_context(GFP_ATOMIC, -1);
-		if (!ioc) {
-			err = -ENOMEM;
-			break;
-		}
-		task->io_context = ioc;
-	} while (1);
-
-	if (!err) {
+	ioc = get_task_io_context(task, GFP_ATOMIC, NUMA_NO_NODE);
+	if (ioc) {
 		ioc->ioprio = ioprio;
 		ioc->ioprio_changed = 1;
+		put_io_context(ioc);
 	}
 
-	task_unlock(task);
 	return err;
 }
 EXPORT_SYMBOL_GPL(set_task_ioprio);
diff --git a/include/linux/iocontext.h b/include/linux/iocontext.h
index 8a6ecb66346f..28bb621ef5a2 100644
--- a/include/linux/iocontext.h
+++ b/include/linux/iocontext.h
@@ -78,8 +78,8 @@ struct task_struct;
 #ifdef CONFIG_BLOCK
 void put_io_context(struct io_context *ioc);
 void exit_io_context(struct task_struct *task);
-struct io_context *get_io_context(gfp_t gfp_flags, int node);
-struct io_context *alloc_io_context(gfp_t gfp_flags, int node);
+struct io_context *get_task_io_context(struct task_struct *task,
+				       gfp_t gfp_flags, int node);
 #else
 struct io_context;
 static inline void put_io_context(struct io_context *ioc) { }
diff --git a/kernel/fork.c b/kernel/fork.c
index da4a6a10d088..5bcfc739bb7c 100644
--- a/kernel/fork.c
+++ b/kernel/fork.c
@@ -870,6 +870,7 @@ static int copy_io(unsigned long clone_flags, struct task_struct *tsk)
 {
 #ifdef CONFIG_BLOCK
 	struct io_context *ioc = current->io_context;
+	struct io_context *new_ioc;
 
 	if (!ioc)
 		return 0;
@@ -881,11 +882,12 @@ static int copy_io(unsigned long clone_flags, struct task_struct *tsk)
 		if (unlikely(!tsk->io_context))
 			return -ENOMEM;
 	} else if (ioprio_valid(ioc->ioprio)) {
-		tsk->io_context = alloc_io_context(GFP_KERNEL, -1);
-		if (unlikely(!tsk->io_context))
+		new_ioc = get_task_io_context(tsk, GFP_KERNEL, NUMA_NO_NODE);
+		if (unlikely(!new_ioc))
 			return -ENOMEM;
 
-		tsk->io_context->ioprio = ioc->ioprio;
+		new_ioc->ioprio = ioc->ioprio;
+		put_io_context(new_ioc);
 	}
 #endif
 	return 0;
-- 
cgit v1.2.3-71-gd317


From b2efa05265d62bc29f3a64400fad4b44340eedb8 Mon Sep 17 00:00:00 2001
From: Tejun Heo <tj@kernel.org>
Date: Wed, 14 Dec 2011 00:33:39 +0100
Subject: block, cfq: unlink cfq_io_context's immediately

cic is association between io_context and request_queue.  A cic is
linked from both ioc and q and should be destroyed when either one
goes away.  As ioc and q both have their own locks, locking becomes a
bit complex - both orders work for removal from one but not from the
other.

Currently, cfq tries to circumvent this locking order issue with RCU.
ioc->lock nests inside queue_lock but the radix tree and cic's are
also protected by RCU allowing either side to walk their lists without
grabbing lock.

This rather unconventional use of RCU quickly devolves into extremely
fragile convolution.  e.g. The following is from cfqd going away too
soon after ioc and q exits raced.

 general protection fault: 0000 [#1] PREEMPT SMP
 CPU 2
 Modules linked in:
 [   88.503444]
 Pid: 599, comm: hexdump Not tainted 3.1.0-rc10-work+ #158 Bochs Bochs
 RIP: 0010:[<ffffffff81397628>]  [<ffffffff81397628>] cfq_exit_single_io_context+0x58/0xf0
 ...
 Call Trace:
  [<ffffffff81395a4a>] call_for_each_cic+0x5a/0x90
  [<ffffffff81395ab5>] cfq_exit_io_context+0x15/0x20
  [<ffffffff81389130>] exit_io_context+0x100/0x140
  [<ffffffff81098a29>] do_exit+0x579/0x850
  [<ffffffff81098d5b>] do_group_exit+0x5b/0xd0
  [<ffffffff81098de7>] sys_exit_group+0x17/0x20
  [<ffffffff81b02f2b>] system_call_fastpath+0x16/0x1b

The only real hot path here is cic lookup during request
initialization and avoiding extra locking requires very confined use
of RCU.  This patch makes cic removal from both ioc and request_queue
perform double-locking and unlink immediately.

* From q side, the change is almost trivial as ioc->lock nests inside
  queue_lock.  It just needs to grab each ioc->lock as it walks
  cic_list and unlink it.

* From ioc side, it's a bit more difficult because of inversed lock
  order.  ioc needs its lock to walk its cic_list but can't grab the
  matching queue_lock and needs to perform unlock-relock dancing.

  Unlinking is now wholly done from put_io_context() and fast path is
  optimized by using the queue_lock the caller already holds, which is
  by far the most common case.  If the ioc accessed multiple devices,
  it tries with trylock.  In unlikely cases of fast path failure, it
  falls back to full double-locking dance from workqueue.

Double-locking isn't the prettiest thing in the world but it's *far*
simpler and more understandable than RCU trick without adding any
meaningful overhead.

This still leaves a lot of now unnecessary RCU logics.  Future patches
will trim them.

-v2: Vivek pointed out that cic->q was being dereferenced after
     cic->release() was called.  Updated to use local variable @this_q
     instead.

Signed-off-by: Tejun Heo <tj@kernel.org>
Cc: Vivek Goyal <vgoyal@redhat.com>
Signed-off-by: Jens Axboe <axboe@kernel.dk>
---
 block/blk-cgroup.c        |   2 +-
 block/blk-ioc.c           | 166 ++++++++++++++++++++++++++++++++++++++--------
 block/cfq-iosched.c       |  44 +++---------
 fs/ioprio.c               |   2 +-
 include/linux/blkdev.h    |   3 +
 include/linux/iocontext.h |  12 ++--
 kernel/fork.c             |   2 +-
 7 files changed, 159 insertions(+), 72 deletions(-)

(limited to 'kernel')

diff --git a/block/blk-cgroup.c b/block/blk-cgroup.c
index dc00835aab6a..278869358049 100644
--- a/block/blk-cgroup.c
+++ b/block/blk-cgroup.c
@@ -1649,7 +1649,7 @@ static void blkiocg_attach_task(struct cgroup *cgrp, struct task_struct *tsk)
 	ioc = get_task_io_context(tsk, GFP_ATOMIC, NUMA_NO_NODE);
 	if (ioc) {
 		ioc_cgroup_changed(ioc);
-		put_io_context(ioc);
+		put_io_context(ioc, NULL);
 	}
 }
 
diff --git a/block/blk-ioc.c b/block/blk-ioc.c
index 6f59fbad93d9..fb23965595da 100644
--- a/block/blk-ioc.c
+++ b/block/blk-ioc.c
@@ -29,55 +29,164 @@ void get_io_context(struct io_context *ioc)
 }
 EXPORT_SYMBOL(get_io_context);
 
-static void cfq_dtor(struct io_context *ioc)
+/*
+ * Releasing ioc may nest into another put_io_context() leading to nested
+ * fast path release.  As the ioc's can't be the same, this is okay but
+ * makes lockdep whine.  Keep track of nesting and use it as subclass.
+ */
+#ifdef CONFIG_LOCKDEP
+#define ioc_release_depth(q)		((q) ? (q)->ioc_release_depth : 0)
+#define ioc_release_depth_inc(q)	(q)->ioc_release_depth++
+#define ioc_release_depth_dec(q)	(q)->ioc_release_depth--
+#else
+#define ioc_release_depth(q)		0
+#define ioc_release_depth_inc(q)	do { } while (0)
+#define ioc_release_depth_dec(q)	do { } while (0)
+#endif
+
+/*
+ * Slow path for ioc release in put_io_context().  Performs double-lock
+ * dancing to unlink all cic's and then frees ioc.
+ */
+static void ioc_release_fn(struct work_struct *work)
 {
-	if (!hlist_empty(&ioc->cic_list)) {
-		struct cfq_io_context *cic;
+	struct io_context *ioc = container_of(work, struct io_context,
+					      release_work);
+	struct request_queue *last_q = NULL;
+
+	spin_lock_irq(&ioc->lock);
+
+	while (!hlist_empty(&ioc->cic_list)) {
+		struct cfq_io_context *cic = hlist_entry(ioc->cic_list.first,
+							 struct cfq_io_context,
+							 cic_list);
+		struct request_queue *this_q = cic->q;
+
+		if (this_q != last_q) {
+			/*
+			 * Need to switch to @this_q.  Once we release
+			 * @ioc->lock, it can go away along with @cic.
+			 * Hold on to it.
+			 */
+			__blk_get_queue(this_q);
+
+			/*
+			 * blk_put_queue() might sleep thanks to kobject
+			 * idiocy.  Always release both locks, put and
+			 * restart.
+			 */
+			if (last_q) {
+				spin_unlock(last_q->queue_lock);
+				spin_unlock_irq(&ioc->lock);
+				blk_put_queue(last_q);
+			} else {
+				spin_unlock_irq(&ioc->lock);
+			}
+
+			last_q = this_q;
+			spin_lock_irq(this_q->queue_lock);
+			spin_lock(&ioc->lock);
+			continue;
+		}
+		ioc_release_depth_inc(this_q);
+		cic->exit(cic);
+		cic->release(cic);
+		ioc_release_depth_dec(this_q);
+	}
 
-		cic = hlist_entry(ioc->cic_list.first, struct cfq_io_context,
-								cic_list);
-		cic->dtor(ioc);
+	if (last_q) {
+		spin_unlock(last_q->queue_lock);
+		spin_unlock_irq(&ioc->lock);
+		blk_put_queue(last_q);
+	} else {
+		spin_unlock_irq(&ioc->lock);
 	}
+
+	kmem_cache_free(iocontext_cachep, ioc);
 }
 
 /**
  * put_io_context - put a reference of io_context
  * @ioc: io_context to put
+ * @locked_q: request_queue the caller is holding queue_lock of (hint)
  *
  * Decrement reference count of @ioc and release it if the count reaches
- * zero.
+ * zero.  If the caller is holding queue_lock of a queue, it can indicate
+ * that with @locked_q.  This is an optimization hint and the caller is
+ * allowed to pass in %NULL even when it's holding a queue_lock.
  */
-void put_io_context(struct io_context *ioc)
+void put_io_context(struct io_context *ioc, struct request_queue *locked_q)
 {
+	struct request_queue *last_q = locked_q;
+	unsigned long flags;
+
 	if (ioc == NULL)
 		return;
 
 	BUG_ON(atomic_long_read(&ioc->refcount) <= 0);
+	if (locked_q)
+		lockdep_assert_held(locked_q->queue_lock);
 
 	if (!atomic_long_dec_and_test(&ioc->refcount))
 		return;
 
-	rcu_read_lock();
-	cfq_dtor(ioc);
-	rcu_read_unlock();
-
-	kmem_cache_free(iocontext_cachep, ioc);
-}
-EXPORT_SYMBOL(put_io_context);
+	/*
+	 * Destroy @ioc.  This is a bit messy because cic's are chained
+	 * from both ioc and queue, and ioc->lock nests inside queue_lock.
+	 * The inner ioc->lock should be held to walk our cic_list and then
+	 * for each cic the outer matching queue_lock should be grabbed.
+	 * ie. We need to do reverse-order double lock dancing.
+	 *
+	 * Another twist is that we are often called with one of the
+	 * matching queue_locks held as indicated by @locked_q, which
+	 * prevents performing double-lock dance for other queues.
+	 *
+	 * So, we do it in two stages.  The fast path uses the queue_lock
+	 * the caller is holding and, if other queues need to be accessed,
+	 * uses trylock to avoid introducing locking dependency.  This can
+	 * handle most cases, especially if @ioc was performing IO on only
+	 * single device.
+	 *
+	 * If trylock doesn't cut it, we defer to @ioc->release_work which
+	 * can do all the double-locking dancing.
+	 */
+	spin_lock_irqsave_nested(&ioc->lock, flags,
+				 ioc_release_depth(locked_q));
+
+	while (!hlist_empty(&ioc->cic_list)) {
+		struct cfq_io_context *cic = hlist_entry(ioc->cic_list.first,
+							 struct cfq_io_context,
+							 cic_list);
+		struct request_queue *this_q = cic->q;
+
+		if (this_q != last_q) {
+			if (last_q && last_q != locked_q)
+				spin_unlock(last_q->queue_lock);
+			last_q = NULL;
+
+			if (!spin_trylock(this_q->queue_lock))
+				break;
+			last_q = this_q;
+			continue;
+		}
+		ioc_release_depth_inc(this_q);
+		cic->exit(cic);
+		cic->release(cic);
+		ioc_release_depth_dec(this_q);
+	}
 
-static void cfq_exit(struct io_context *ioc)
-{
-	rcu_read_lock();
+	if (last_q && last_q != locked_q)
+		spin_unlock(last_q->queue_lock);
 
-	if (!hlist_empty(&ioc->cic_list)) {
-		struct cfq_io_context *cic;
+	spin_unlock_irqrestore(&ioc->lock, flags);
 
-		cic = hlist_entry(ioc->cic_list.first, struct cfq_io_context,
-								cic_list);
-		cic->exit(ioc);
-	}
-	rcu_read_unlock();
+	/* if no cic's left, we're done; otherwise, kick release_work */
+	if (hlist_empty(&ioc->cic_list))
+		kmem_cache_free(iocontext_cachep, ioc);
+	else
+		schedule_work(&ioc->release_work);
 }
+EXPORT_SYMBOL(put_io_context);
 
 /* Called by the exiting task */
 void exit_io_context(struct task_struct *task)
@@ -92,10 +201,8 @@ void exit_io_context(struct task_struct *task)
 	task->io_context = NULL;
 	task_unlock(task);
 
-	if (atomic_dec_and_test(&ioc->nr_tasks))
-		cfq_exit(ioc);
-
-	put_io_context(ioc);
+	atomic_dec(&ioc->nr_tasks);
+	put_io_context(ioc, NULL);
 }
 
 static struct io_context *create_task_io_context(struct task_struct *task,
@@ -115,6 +222,7 @@ static struct io_context *create_task_io_context(struct task_struct *task,
 	spin_lock_init(&ioc->lock);
 	INIT_RADIX_TREE(&ioc->radix_root, GFP_ATOMIC | __GFP_HIGH);
 	INIT_HLIST_HEAD(&ioc->cic_list);
+	INIT_WORK(&ioc->release_work, ioc_release_fn);
 
 	/* try to install, somebody might already have beaten us to it */
 	task_lock(task);
diff --git a/block/cfq-iosched.c b/block/cfq-iosched.c
index e617b088c59b..6cc606560402 100644
--- a/block/cfq-iosched.c
+++ b/block/cfq-iosched.c
@@ -1778,7 +1778,7 @@ __cfq_slice_expired(struct cfq_data *cfqd, struct cfq_queue *cfqq,
 		cfqd->active_queue = NULL;
 
 	if (cfqd->active_cic) {
-		put_io_context(cfqd->active_cic->ioc);
+		put_io_context(cfqd->active_cic->ioc, cfqd->queue);
 		cfqd->active_cic = NULL;
 	}
 }
@@ -2812,38 +2812,6 @@ static void cfq_exit_cic(struct cfq_io_context *cic)
 	}
 }
 
-static void cfq_exit_single_io_context(struct io_context *ioc,
-				       struct cfq_io_context *cic)
-{
-	struct cfq_data *cfqd = cic_to_cfqd(cic);
-
-	if (cfqd) {
-		struct request_queue *q = cfqd->queue;
-		unsigned long flags;
-
-		spin_lock_irqsave(q->queue_lock, flags);
-
-		/*
-		 * Ensure we get a fresh copy of the ->key to prevent
-		 * race between exiting task and queue
-		 */
-		smp_read_barrier_depends();
-		if (cic->key == cfqd)
-			cfq_exit_cic(cic);
-
-		spin_unlock_irqrestore(q->queue_lock, flags);
-	}
-}
-
-/*
- * The process that ioc belongs to has exited, we need to clean up
- * and put the internal structures we have that belongs to that process.
- */
-static void cfq_exit_io_context(struct io_context *ioc)
-{
-	call_for_each_cic(ioc, cfq_exit_single_io_context);
-}
-
 static struct cfq_io_context *
 cfq_alloc_io_context(struct cfq_data *cfqd, gfp_t gfp_mask)
 {
@@ -2855,8 +2823,8 @@ cfq_alloc_io_context(struct cfq_data *cfqd, gfp_t gfp_mask)
 		cic->ttime.last_end_request = jiffies;
 		INIT_LIST_HEAD(&cic->queue_list);
 		INIT_HLIST_NODE(&cic->cic_list);
-		cic->dtor = cfq_free_io_context;
-		cic->exit = cfq_exit_io_context;
+		cic->exit = cfq_exit_cic;
+		cic->release = cfq_release_cic;
 		elv_ioc_count_inc(cfq_ioc_count);
 	}
 
@@ -3726,7 +3694,7 @@ static void cfq_put_request(struct request *rq)
 		BUG_ON(!cfqq->allocated[rw]);
 		cfqq->allocated[rw]--;
 
-		put_io_context(RQ_CIC(rq)->ioc);
+		put_io_context(RQ_CIC(rq)->ioc, cfqq->cfqd->queue);
 
 		rq->elevator_private[0] = NULL;
 		rq->elevator_private[1] = NULL;
@@ -3937,8 +3905,12 @@ static void cfq_exit_queue(struct elevator_queue *e)
 		struct cfq_io_context *cic = list_entry(cfqd->cic_list.next,
 							struct cfq_io_context,
 							queue_list);
+		struct io_context *ioc = cic->ioc;
 
+		spin_lock(&ioc->lock);
 		cfq_exit_cic(cic);
+		cfq_release_cic(cic);
+		spin_unlock(&ioc->lock);
 	}
 
 	cfq_put_async_queues(cfqd);
diff --git a/fs/ioprio.c b/fs/ioprio.c
index 0f1b9515213b..f84b380d65e5 100644
--- a/fs/ioprio.c
+++ b/fs/ioprio.c
@@ -51,7 +51,7 @@ int set_task_ioprio(struct task_struct *task, int ioprio)
 	ioc = get_task_io_context(task, GFP_ATOMIC, NUMA_NO_NODE);
 	if (ioc) {
 		ioc_ioprio_changed(ioc, ioprio);
-		put_io_context(ioc);
+		put_io_context(ioc, NULL);
 	}
 
 	return err;
diff --git a/include/linux/blkdev.h b/include/linux/blkdev.h
index d1b6f4ed1f96..65c2f8c70089 100644
--- a/include/linux/blkdev.h
+++ b/include/linux/blkdev.h
@@ -393,6 +393,9 @@ struct request_queue {
 	/* Throttle data */
 	struct throtl_data *td;
 #endif
+#ifdef CONFIG_LOCKDEP
+	int			ioc_release_depth;
+#endif
 };
 
 #define QUEUE_FLAG_QUEUED	1	/* uses generic tag queueing */
diff --git a/include/linux/iocontext.h b/include/linux/iocontext.h
index 2c2b6da96b3c..01e863128780 100644
--- a/include/linux/iocontext.h
+++ b/include/linux/iocontext.h
@@ -3,6 +3,7 @@
 
 #include <linux/radix-tree.h>
 #include <linux/rcupdate.h>
+#include <linux/workqueue.h>
 
 struct cfq_queue;
 struct cfq_ttime {
@@ -33,8 +34,8 @@ struct cfq_io_context {
 
 	unsigned long changed;
 
-	void (*dtor)(struct io_context *); /* destructor */
-	void (*exit)(struct io_context *); /* called on task exit */
+	void (*exit)(struct cfq_io_context *);
+	void (*release)(struct cfq_io_context *);
 
 	struct rcu_head rcu_head;
 };
@@ -61,6 +62,8 @@ struct io_context {
 	struct radix_tree_root radix_root;
 	struct hlist_head cic_list;
 	void __rcu *ioc_data;
+
+	struct work_struct release_work;
 };
 
 static inline struct io_context *ioc_task_link(struct io_context *ioc)
@@ -79,7 +82,7 @@ static inline struct io_context *ioc_task_link(struct io_context *ioc)
 
 struct task_struct;
 #ifdef CONFIG_BLOCK
-void put_io_context(struct io_context *ioc);
+void put_io_context(struct io_context *ioc, struct request_queue *locked_q);
 void exit_io_context(struct task_struct *task);
 struct io_context *get_task_io_context(struct task_struct *task,
 				       gfp_t gfp_flags, int node);
@@ -87,7 +90,8 @@ void ioc_ioprio_changed(struct io_context *ioc, int ioprio);
 void ioc_cgroup_changed(struct io_context *ioc);
 #else
 struct io_context;
-static inline void put_io_context(struct io_context *ioc) { }
+static inline void put_io_context(struct io_context *ioc,
+				  struct request_queue *locked_q) { }
 static inline void exit_io_context(struct task_struct *task) { }
 #endif
 
diff --git a/kernel/fork.c b/kernel/fork.c
index 5bcfc739bb7c..2753449f2038 100644
--- a/kernel/fork.c
+++ b/kernel/fork.c
@@ -887,7 +887,7 @@ static int copy_io(unsigned long clone_flags, struct task_struct *tsk)
 			return -ENOMEM;
 
 		new_ioc->ioprio = ioc->ioprio;
-		put_io_context(new_ioc);
+		put_io_context(new_ioc, NULL);
 	}
 #endif
 	return 0;
-- 
cgit v1.2.3-71-gd317


From 997d3eaf02cad6fdb54bd6085c9a7c48ddd68a2d Mon Sep 17 00:00:00 2001
From: Kay Sievers <kay.sievers@vrfy.org>
Date: Wed, 14 Dec 2011 14:54:22 -0800
Subject: rtmutex-tester: convert sysdev_class to a regular subsystem

After all sysdev classes are ported to regular driver core entities, the
sysdev implementation will be entirely removed from the kernel.

Cc: Thomas Gleixner <tglx@linutronix.de>
Cc: Paul Gortmaker <paul.gortmaker@windriver.com>
Cc: Arnd Bergmann <arnd@arndb.de>
Signed-off-by: Kay Sievers <kay.sievers@vrfy.org>
Signed-off-by: Greg Kroah-Hartman <gregkh@suse.de>
---
 kernel/rtmutex-tester.c | 37 +++++++++++++++++++------------------
 1 file changed, 19 insertions(+), 18 deletions(-)

(limited to 'kernel')

diff --git a/kernel/rtmutex-tester.c b/kernel/rtmutex-tester.c
index 3d9f31cd79e7..98ec49475460 100644
--- a/kernel/rtmutex-tester.c
+++ b/kernel/rtmutex-tester.c
@@ -6,11 +6,11 @@
  *  Copyright (C) 2006, Timesys Corp., Thomas Gleixner <tglx@timesys.com>
  *
  */
+#include <linux/device.h>
 #include <linux/kthread.h>
 #include <linux/export.h>
 #include <linux/sched.h>
 #include <linux/spinlock.h>
-#include <linux/sysdev.h>
 #include <linux/timer.h>
 #include <linux/freezer.h>
 
@@ -27,7 +27,7 @@ struct test_thread_data {
 	int			opdata;
 	int			mutexes[MAX_RT_TEST_MUTEXES];
 	int			event;
-	struct sys_device	sysdev;
+	struct device		dev;
 };
 
 static struct test_thread_data thread_data[MAX_RT_TEST_THREADS];
@@ -271,7 +271,7 @@ static int test_func(void *data)
  *
  * opcode:data
  */
-static ssize_t sysfs_test_command(struct sys_device *dev, struct sysdev_attribute *attr,
+static ssize_t sysfs_test_command(struct device *dev, struct device_attribute *attr,
 				  const char *buf, size_t count)
 {
 	struct sched_param schedpar;
@@ -279,8 +279,8 @@ static ssize_t sysfs_test_command(struct sys_device *dev, struct sysdev_attribut
 	char cmdbuf[32];
 	int op, dat, tid, ret;
 
-	td = container_of(dev, struct test_thread_data, sysdev);
-	tid = td->sysdev.id;
+	td = container_of(dev, struct test_thread_data, dev);
+	tid = td->dev.id;
 
 	/* strings from sysfs write are not 0 terminated! */
 	if (count >= sizeof(cmdbuf))
@@ -334,7 +334,7 @@ static ssize_t sysfs_test_command(struct sys_device *dev, struct sysdev_attribut
  * @dev:	thread to query
  * @buf:	char buffer to be filled with thread status info
  */
-static ssize_t sysfs_test_status(struct sys_device *dev, struct sysdev_attribute *attr,
+static ssize_t sysfs_test_status(struct device *dev, struct device_attribute *attr,
 				 char *buf)
 {
 	struct test_thread_data *td;
@@ -342,8 +342,8 @@ static ssize_t sysfs_test_status(struct sys_device *dev, struct sysdev_attribute
 	char *curr = buf;
 	int i;
 
-	td = container_of(dev, struct test_thread_data, sysdev);
-	tsk = threads[td->sysdev.id];
+	td = container_of(dev, struct test_thread_data, dev);
+	tsk = threads[td->dev.id];
 
 	spin_lock(&rttest_lock);
 
@@ -360,28 +360,29 @@ static ssize_t sysfs_test_status(struct sys_device *dev, struct sysdev_attribute
 	spin_unlock(&rttest_lock);
 
 	curr += sprintf(curr, ", T: %p, R: %p\n", tsk,
-			mutexes[td->sysdev.id].owner);
+			mutexes[td->dev.id].owner);
 
 	return curr - buf;
 }
 
-static SYSDEV_ATTR(status, 0600, sysfs_test_status, NULL);
-static SYSDEV_ATTR(command, 0600, NULL, sysfs_test_command);
+static DEVICE_ATTR(status, 0600, sysfs_test_status, NULL);
+static DEVICE_ATTR(command, 0600, NULL, sysfs_test_command);
 
-static struct sysdev_class rttest_sysclass = {
+static struct bus_type rttest_subsys = {
 	.name = "rttest",
+	.dev_name = "rttest",
 };
 
 static int init_test_thread(int id)
 {
-	thread_data[id].sysdev.cls = &rttest_sysclass;
-	thread_data[id].sysdev.id = id;
+	thread_data[id].dev.bus = &rttest_subsys;
+	thread_data[id].dev.id = id;
 
 	threads[id] = kthread_run(test_func, &thread_data[id], "rt-test-%d", id);
 	if (IS_ERR(threads[id]))
 		return PTR_ERR(threads[id]);
 
-	return sysdev_register(&thread_data[id].sysdev);
+	return device_register(&thread_data[id].dev);
 }
 
 static int init_rttest(void)
@@ -393,7 +394,7 @@ static int init_rttest(void)
 	for (i = 0; i < MAX_RT_TEST_MUTEXES; i++)
 		rt_mutex_init(&mutexes[i]);
 
-	ret = sysdev_class_register(&rttest_sysclass);
+	ret = subsys_system_register(&rttest_subsys, NULL);
 	if (ret)
 		return ret;
 
@@ -401,10 +402,10 @@ static int init_rttest(void)
 		ret = init_test_thread(i);
 		if (ret)
 			break;
-		ret = sysdev_create_file(&thread_data[i].sysdev, &attr_status);
+		ret = device_create_file(&thread_data[i].dev, &dev_attr_status);
 		if (ret)
 			break;
-		ret = sysdev_create_file(&thread_data[i].sysdev, &attr_command);
+		ret = device_create_file(&thread_data[i].dev, &dev_attr_command);
 		if (ret)
 			break;
 	}
-- 
cgit v1.2.3-71-gd317


From d369a5d8fc70710236ae2d06a0e42dce483712df Mon Sep 17 00:00:00 2001
From: Kay Sievers <kay.sievers@vrfy.org>
Date: Wed, 14 Dec 2011 15:28:51 -0800
Subject: clocksource: convert sysdev_class to a regular subsystem

After all sysdev classes are ported to regular driver core entities, the
sysdev implementation will be entirely removed from the kernel.

Cc: John Stultz <johnstul@us.ibm.com>
Cc: Thomas Gleixner <tglx@linutronix.de>
Signed-off-by: Kay Sievers <kay.sievers@vrfy.org>
Signed-off-by: Greg Kroah-Hartman <gregkh@suse.de>
---
 kernel/time/clocksource.c | 37 +++++++++++++++++++------------------
 1 file changed, 19 insertions(+), 18 deletions(-)

(limited to 'kernel')

diff --git a/kernel/time/clocksource.c b/kernel/time/clocksource.c
index cf52fda2e096..3f5c8512c033 100644
--- a/kernel/time/clocksource.c
+++ b/kernel/time/clocksource.c
@@ -23,8 +23,8 @@
  *   o Allow clocksource drivers to be unregistered
  */
 
+#include <linux/device.h>
 #include <linux/clocksource.h>
-#include <linux/sysdev.h>
 #include <linux/init.h>
 #include <linux/module.h>
 #include <linux/sched.h> /* for spin_unlock_irq() using preempt_count() m68k */
@@ -754,8 +754,8 @@ EXPORT_SYMBOL(clocksource_unregister);
  * Provides sysfs interface for listing current clocksource.
  */
 static ssize_t
-sysfs_show_current_clocksources(struct sys_device *dev,
-				struct sysdev_attribute *attr, char *buf)
+sysfs_show_current_clocksources(struct device *dev,
+				struct device_attribute *attr, char *buf)
 {
 	ssize_t count = 0;
 
@@ -775,8 +775,8 @@ sysfs_show_current_clocksources(struct sys_device *dev,
  * Takes input from sysfs interface for manually overriding the default
  * clocksource selection.
  */
-static ssize_t sysfs_override_clocksource(struct sys_device *dev,
-					  struct sysdev_attribute *attr,
+static ssize_t sysfs_override_clocksource(struct device *dev,
+					  struct device_attribute *attr,
 					  const char *buf, size_t count)
 {
 	size_t ret = count;
@@ -809,8 +809,8 @@ static ssize_t sysfs_override_clocksource(struct sys_device *dev,
  * Provides sysfs interface for listing registered clocksources
  */
 static ssize_t
-sysfs_show_available_clocksources(struct sys_device *dev,
-				  struct sysdev_attribute *attr,
+sysfs_show_available_clocksources(struct device *dev,
+				  struct device_attribute *attr,
 				  char *buf)
 {
 	struct clocksource *src;
@@ -839,35 +839,36 @@ sysfs_show_available_clocksources(struct sys_device *dev,
 /*
  * Sysfs setup bits:
  */
-static SYSDEV_ATTR(current_clocksource, 0644, sysfs_show_current_clocksources,
+static DEVICE_ATTR(current_clocksource, 0644, sysfs_show_current_clocksources,
 		   sysfs_override_clocksource);
 
-static SYSDEV_ATTR(available_clocksource, 0444,
+static DEVICE_ATTR(available_clocksource, 0444,
 		   sysfs_show_available_clocksources, NULL);
 
-static struct sysdev_class clocksource_sysclass = {
+static struct bus_type clocksource_subsys = {
 	.name = "clocksource",
+	.dev_name = "clocksource",
 };
 
-static struct sys_device device_clocksource = {
+static struct device device_clocksource = {
 	.id	= 0,
-	.cls	= &clocksource_sysclass,
+	.bus	= &clocksource_subsys,
 };
 
 static int __init init_clocksource_sysfs(void)
 {
-	int error = sysdev_class_register(&clocksource_sysclass);
+	int error = subsys_system_register(&clocksource_subsys, NULL);
 
 	if (!error)
-		error = sysdev_register(&device_clocksource);
+		error = device_register(&device_clocksource);
 	if (!error)
-		error = sysdev_create_file(
+		error = device_create_file(
 				&device_clocksource,
-				&attr_current_clocksource);
+				&dev_attr_current_clocksource);
 	if (!error)
-		error = sysdev_create_file(
+		error = device_create_file(
 				&device_clocksource,
-				&attr_available_clocksource);
+				&dev_attr_available_clocksource);
 	return error;
 }
 
-- 
cgit v1.2.3-71-gd317


From 54848d73f9f254631303d6eab9b976855988b266 Mon Sep 17 00:00:00 2001
From: Wu Fengguang <fengguang.wu@intel.com>
Date: Tue, 5 Apr 2011 13:21:19 -0600
Subject: writeback: charge leaked page dirties to active tasks

It's a years long problem that a large number of short-lived dirtiers
(eg. gcc instances in a fast kernel build) may starve long-run dirtiers
(eg. dd) as well as pushing the dirty pages to the global hard limit.

The solution is to charge the pages dirtied by the exited gcc to the
other random dirtying tasks. It sounds not perfect, however should
behave good enough in practice, seeing as that throttled tasks aren't
actually running so those that are running are more likely to pick it up
and get throttled, therefore promoting an equal spread.

Randy: fix compile error: 'dirty_throttle_leaks' undeclared in exit.c

Acked-by: Jan Kara <jack@suse.cz>
Acked-by: Peter Zijlstra <a.p.zijlstra@chello.nl>
Signed-off-by: Randy Dunlap <rdunlap@xenotime.net>
Signed-off-by: Wu Fengguang <fengguang.wu@intel.com>
---
 include/linux/writeback.h |  2 ++
 kernel/exit.c             |  3 +++
 mm/page-writeback.c       | 27 +++++++++++++++++++++++++++
 3 files changed, 32 insertions(+)

(limited to 'kernel')

diff --git a/include/linux/writeback.h b/include/linux/writeback.h
index a378c295851f..05eaf5e3aad7 100644
--- a/include/linux/writeback.h
+++ b/include/linux/writeback.h
@@ -7,6 +7,8 @@
 #include <linux/sched.h>
 #include <linux/fs.h>
 
+DECLARE_PER_CPU(int, dirty_throttle_leaks);
+
 /*
  * The 1/4 region under the global dirty thresh is for smooth dirty throttling:
  *
diff --git a/kernel/exit.c b/kernel/exit.c
index d0b7d988f873..d4aac24cc469 100644
--- a/kernel/exit.c
+++ b/kernel/exit.c
@@ -51,6 +51,7 @@
 #include <trace/events/sched.h>
 #include <linux/hw_breakpoint.h>
 #include <linux/oom.h>
+#include <linux/writeback.h>
 
 #include <asm/uaccess.h>
 #include <asm/unistd.h>
@@ -1037,6 +1038,8 @@ NORET_TYPE void do_exit(long code)
 	validate_creds_for_do_exit(tsk);
 
 	preempt_disable();
+	if (tsk->nr_dirtied)
+		__this_cpu_add(dirty_throttle_leaks, tsk->nr_dirtied);
 	exit_rcu();
 	/* causes final put_task_struct in finish_task_switch(). */
 	tsk->state = TASK_DEAD;
diff --git a/mm/page-writeback.c b/mm/page-writeback.c
index 50f08241f981..619c445fc03c 100644
--- a/mm/page-writeback.c
+++ b/mm/page-writeback.c
@@ -1214,6 +1214,22 @@ void set_page_dirty_balance(struct page *page, int page_mkwrite)
 
 static DEFINE_PER_CPU(int, bdp_ratelimits);
 
+/*
+ * Normal tasks are throttled by
+ *	loop {
+ *		dirty tsk->nr_dirtied_pause pages;
+ *		take a snap in balance_dirty_pages();
+ *	}
+ * However there is a worst case. If every task exit immediately when dirtied
+ * (tsk->nr_dirtied_pause - 1) pages, balance_dirty_pages() will never be
+ * called to throttle the page dirties. The solution is to save the not yet
+ * throttled page dirties in dirty_throttle_leaks on task exit and charge them
+ * randomly into the running tasks. This works well for the above worst case,
+ * as the new task will pick up and accumulate the old task's leaked dirty
+ * count and eventually get throttled.
+ */
+DEFINE_PER_CPU(int, dirty_throttle_leaks) = 0;
+
 /**
  * balance_dirty_pages_ratelimited_nr - balance dirty memory state
  * @mapping: address_space which was dirtied
@@ -1261,6 +1277,17 @@ void balance_dirty_pages_ratelimited_nr(struct address_space *mapping,
 			ratelimit = 0;
 		}
 	}
+	/*
+	 * Pick up the dirtied pages by the exited tasks. This avoids lots of
+	 * short-lived tasks (eg. gcc invocations in a kernel build) escaping
+	 * the dirty throttling and livelock other long-run dirtiers.
+	 */
+	p = &__get_cpu_var(dirty_throttle_leaks);
+	if (*p > 0 && current->nr_dirtied < ratelimit) {
+		nr_pages_dirtied = min(*p, ratelimit - current->nr_dirtied);
+		*p -= nr_pages_dirtied;
+		current->nr_dirtied += nr_pages_dirtied;
+	}
 	preempt_enable();
 
 	if (unlikely(current->nr_dirtied >= ratelimit))
-- 
cgit v1.2.3-71-gd317


From 83712358ba0a1497ce59a4f84ce4dd0f803fe6fc Mon Sep 17 00:00:00 2001
From: Wu Fengguang <fengguang.wu@intel.com>
Date: Sat, 11 Jun 2011 19:25:42 -0600
Subject: writeback: dirty ratelimit - think time compensation

Compensate the task's think time when computing the final pause time,
so that ->dirty_ratelimit can be executed accurately.

        think time := time spend outside of balance_dirty_pages()

In the rare case that the task slept longer than the 200ms period time
(result in negative pause time), the sleep time will be compensated in
the following periods, too, if it's less than 1 second.

Accumulated errors are carefully avoided as long as the max pause area
is not hitted.

Pseudo code:

        period = pages_dirtied / task_ratelimit;
        think = jiffies - dirty_paused_when;
        pause = period - think;

1) normal case: period > think

        pause = period - think
        dirty_paused_when = jiffies + pause
        nr_dirtied = 0

                             period time
              |===============================>|
                  think time      pause time
              |===============>|==============>|
        ------|----------------|---------------|------------------------
        dirty_paused_when   jiffies

2) no pause case: period <= think

        don't pause; reduce future pause time by:
        dirty_paused_when += period
        nr_dirtied = 0

                           period time
              |===============================>|
                                  think time
              |===================================================>|
        ------|--------------------------------+-------------------|----
        dirty_paused_when                                       jiffies

Acked-by: Jan Kara <jack@suse.cz>
Acked-by: Peter Zijlstra <a.p.zijlstra@chello.nl>
Signed-off-by: Wu Fengguang <fengguang.wu@intel.com>
---
 include/linux/sched.h            |  1 +
 include/trace/events/writeback.h | 14 +++++++++++---
 kernel/fork.c                    |  1 +
 mm/page-writeback.c              | 36 ++++++++++++++++++++++++++++++++----
 4 files changed, 45 insertions(+), 7 deletions(-)

(limited to 'kernel')

diff --git a/include/linux/sched.h b/include/linux/sched.h
index 1c4f3e9b9bc5..984c3b295978 100644
--- a/include/linux/sched.h
+++ b/include/linux/sched.h
@@ -1527,6 +1527,7 @@ struct task_struct {
 	 */
 	int nr_dirtied;
 	int nr_dirtied_pause;
+	unsigned long dirty_paused_when; /* start of a write-and-pause period */
 
 #ifdef CONFIG_LATENCYTOP
 	int latency_record_count;
diff --git a/include/trace/events/writeback.h b/include/trace/events/writeback.h
index 99d1d0decf88..8588a8918023 100644
--- a/include/trace/events/writeback.h
+++ b/include/trace/events/writeback.h
@@ -300,12 +300,13 @@ TRACE_EVENT(balance_dirty_pages,
 		 unsigned long dirty_ratelimit,
 		 unsigned long task_ratelimit,
 		 unsigned long dirtied,
+		 unsigned long period,
 		 long pause,
 		 unsigned long start_time),
 
 	TP_ARGS(bdi, thresh, bg_thresh, dirty, bdi_thresh, bdi_dirty,
 		dirty_ratelimit, task_ratelimit,
-		dirtied, pause, start_time),
+		dirtied, period, pause, start_time),
 
 	TP_STRUCT__entry(
 		__array(	 char,	bdi, 32)
@@ -320,6 +321,8 @@ TRACE_EVENT(balance_dirty_pages,
 		__field(unsigned int,	dirtied_pause)
 		__field(unsigned long,	paused)
 		__field(	 long,	pause)
+		__field(unsigned long,	period)
+		__field(	 long,	think)
 	),
 
 	TP_fast_assign(
@@ -336,6 +339,9 @@ TRACE_EVENT(balance_dirty_pages,
 		__entry->task_ratelimit	= KBps(task_ratelimit);
 		__entry->dirtied	= dirtied;
 		__entry->dirtied_pause	= current->nr_dirtied_pause;
+		__entry->think		= current->dirty_paused_when == 0 ? 0 :
+			 (long)(jiffies - current->dirty_paused_when) * 1000/HZ;
+		__entry->period		= period * 1000 / HZ;
 		__entry->pause		= pause * 1000 / HZ;
 		__entry->paused		= (jiffies - start_time) * 1000 / HZ;
 	),
@@ -346,7 +352,7 @@ TRACE_EVENT(balance_dirty_pages,
 		  "bdi_setpoint=%lu bdi_dirty=%lu "
 		  "dirty_ratelimit=%lu task_ratelimit=%lu "
 		  "dirtied=%u dirtied_pause=%u "
-		  "paused=%lu pause=%ld",
+		  "paused=%lu pause=%ld period=%lu think=%ld",
 		  __entry->bdi,
 		  __entry->limit,
 		  __entry->setpoint,
@@ -358,7 +364,9 @@ TRACE_EVENT(balance_dirty_pages,
 		  __entry->dirtied,
 		  __entry->dirtied_pause,
 		  __entry->paused,	/* ms */
-		  __entry->pause	/* ms */
+		  __entry->pause,	/* ms */
+		  __entry->period,	/* ms */
+		  __entry->think	/* ms */
 	  )
 );
 
diff --git a/kernel/fork.c b/kernel/fork.c
index da4a6a10d088..f8668cf6a32d 100644
--- a/kernel/fork.c
+++ b/kernel/fork.c
@@ -1296,6 +1296,7 @@ static struct task_struct *copy_process(unsigned long clone_flags,
 
 	p->nr_dirtied = 0;
 	p->nr_dirtied_pause = 128 >> (PAGE_SHIFT - 10);
+	p->dirty_paused_when = 0;
 
 	/*
 	 * Ok, make it visible to the rest of the system.
diff --git a/mm/page-writeback.c b/mm/page-writeback.c
index 96b3e7aa705c..491932155825 100644
--- a/mm/page-writeback.c
+++ b/mm/page-writeback.c
@@ -1016,6 +1016,7 @@ static void balance_dirty_pages(struct address_space *mapping,
 	unsigned long background_thresh;
 	unsigned long dirty_thresh;
 	unsigned long bdi_thresh;
+	long period;
 	long pause = 0;
 	long uninitialized_var(max_pause);
 	bool dirty_exceeded = false;
@@ -1026,6 +1027,8 @@ static void balance_dirty_pages(struct address_space *mapping,
 	unsigned long start_time = jiffies;
 
 	for (;;) {
+		unsigned long now = jiffies;
+
 		/*
 		 * Unstable writes are a feature of certain networked
 		 * filesystems (i.e. NFS) in which data may have been
@@ -1045,8 +1048,11 @@ static void balance_dirty_pages(struct address_space *mapping,
 		 */
 		freerun = dirty_freerun_ceiling(dirty_thresh,
 						background_thresh);
-		if (nr_dirty <= freerun)
+		if (nr_dirty <= freerun) {
+			current->dirty_paused_when = now;
+			current->nr_dirtied = 0;
 			break;
+		}
 
 		if (unlikely(!writeback_in_progress(bdi)))
 			bdi_start_background_writeback(bdi);
@@ -1104,10 +1110,21 @@ static void balance_dirty_pages(struct address_space *mapping,
 		task_ratelimit = ((u64)dirty_ratelimit * pos_ratio) >>
 							RATELIMIT_CALC_SHIFT;
 		if (unlikely(task_ratelimit == 0)) {
+			period = max_pause;
 			pause = max_pause;
 			goto pause;
 		}
-		pause = HZ * pages_dirtied / task_ratelimit;
+		period = HZ * pages_dirtied / task_ratelimit;
+		pause = period;
+		if (current->dirty_paused_when)
+			pause -= now - current->dirty_paused_when;
+		/*
+		 * For less than 1s think time (ext3/4 may block the dirtier
+		 * for up to 800ms from time to time on 1-HDD; so does xfs,
+		 * however at much less frequency), try to compensate it in
+		 * future periods by updating the virtual time; otherwise just
+		 * do a reset, as it may be a light dirtier.
+		 */
 		if (unlikely(pause <= 0)) {
 			trace_balance_dirty_pages(bdi,
 						  dirty_thresh,
@@ -1118,8 +1135,16 @@ static void balance_dirty_pages(struct address_space *mapping,
 						  dirty_ratelimit,
 						  task_ratelimit,
 						  pages_dirtied,
+						  period,
 						  pause,
 						  start_time);
+			if (pause < -HZ) {
+				current->dirty_paused_when = now;
+				current->nr_dirtied = 0;
+			} else if (period) {
+				current->dirty_paused_when += period;
+				current->nr_dirtied = 0;
+			}
 			pause = 1; /* avoid resetting nr_dirtied_pause below */
 			break;
 		}
@@ -1135,11 +1160,15 @@ pause:
 					  dirty_ratelimit,
 					  task_ratelimit,
 					  pages_dirtied,
+					  period,
 					  pause,
 					  start_time);
 		__set_current_state(TASK_KILLABLE);
 		io_schedule_timeout(pause);
 
+		current->dirty_paused_when = now + pause;
+		current->nr_dirtied = 0;
+
 		/*
 		 * This is typically equal to (nr_dirty < dirty_thresh) and can
 		 * also keep "1000+ dd on a slow USB stick" under control.
@@ -1167,11 +1196,10 @@ pause:
 	if (!dirty_exceeded && bdi->dirty_exceeded)
 		bdi->dirty_exceeded = 0;
 
-	current->nr_dirtied = 0;
 	if (pause == 0) { /* in freerun area */
 		current->nr_dirtied_pause =
 				dirty_poll_interval(nr_dirty, dirty_thresh);
-	} else if (pause <= max_pause / 4 &&
+	} else if (period <= max_pause / 4 &&
 		   pages_dirtied >= current->nr_dirtied_pause) {
 		current->nr_dirtied_pause = clamp_val(
 					dirty_ratelimit * (max_pause / 2) / HZ,
-- 
cgit v1.2.3-71-gd317


From 29e21368b9baf9c4b25060d65062da2dda926c70 Mon Sep 17 00:00:00 2001
From: Mandeep Singh Baines <msb@chromium.org>
Date: Thu, 15 Dec 2011 14:21:26 -0800
Subject: cgroups: remove redundant get/put of css_set from
 css_set_check_fetched()

We already have a reference to all elements in newcg_list.

Signed-off-by: Mandeep Singh Baines <msb@chromium.org>
Reviewed-by: Li Zefan <lizf@cn.fujitsu.com>
Signed-off-by: Tejun Heo <tj@kernel.org>
Cc: containers@lists.linux-foundation.org
Cc: cgroups@vger.kernel.org
Cc: Paul Menage <paul@paulmenage.org>
---
 kernel/cgroup.c | 10 ++--------
 1 file changed, 2 insertions(+), 8 deletions(-)

(limited to 'kernel')

diff --git a/kernel/cgroup.c b/kernel/cgroup.c
index 1b3b84174ead..bc3caff138d8 100644
--- a/kernel/cgroup.c
+++ b/kernel/cgroup.c
@@ -2025,23 +2025,17 @@ static bool css_set_check_fetched(struct cgroup *cgrp,
 
 	read_lock(&css_set_lock);
 	newcg = find_existing_css_set(cg, cgrp, template);
-	if (newcg)
-		get_css_set(newcg);
 	read_unlock(&css_set_lock);
 
 	/* doesn't exist at all? */
 	if (!newcg)
 		return false;
 	/* see if it's already in the list */
-	list_for_each_entry(cg_entry, newcg_list, links) {
-		if (cg_entry->cg == newcg) {
-			put_css_set(newcg);
+	list_for_each_entry(cg_entry, newcg_list, links)
+		if (cg_entry->cg == newcg)
 			return true;
-		}
-	}
 
 	/* not found */
-	put_css_set(newcg);
 	return false;
 }
 
-- 
cgit v1.2.3-71-gd317


From 30fb6aa74011dcf595f306ca2727254d708b786e Mon Sep 17 00:00:00 2001
From: Jiri Olsa <jolsa@redhat.com>
Date: Mon, 5 Dec 2011 18:22:48 +0100
Subject: ftrace: Fix unregister ftrace_ops accounting

Multiple users of the function tracer can register their functions
with the ftrace_ops structure. The accounting within ftrace will
update the counter on each function record that is being traced.
When the ftrace_ops filtering adds or removes functions, the
function records will be updated accordingly if the ftrace_ops is
still registered.

When a ftrace_ops is removed, the counter of the function records,
that the ftrace_ops traces, are decremented. When they reach zero
the functions that they represent are modified to stop calling the
mcount code.

When changes are made, the code is updated via stop_machine() with
a command passed to the function to tell it what to do. There is an
ENABLE and DISABLE command that tells the called function to enable
or disable the functions. But the ENABLE is really a misnomer as it
should just update the records, as records that have been enabled
and now have a count of zero should be disabled.

The DISABLE command is used to disable all functions regardless of
their counter values. This is the big off switch and is not the
complement of the ENABLE command.

To make matters worse, when a ftrace_ops is unregistered and there
is another ftrace_ops registered, neither the DISABLE nor the
ENABLE command are set when calling into the stop_machine() function
and the records will not be updated to match their counter. A command
is passed to that function that will update the mcount code to call
the registered callback directly if it is the only one left. This
means that the ftrace_ops that is still registered will have its callback
called by all functions that have been set for it as well as the ftrace_ops
that was just unregistered.

Here's a way to trigger this bug. Compile the kernel with
CONFIG_FUNCTION_PROFILER set and with CONFIG_FUNCTION_GRAPH not set:

 CONFIG_FUNCTION_PROFILER=y
 # CONFIG_FUNCTION_GRAPH is not set

This will force the function profiler to use the function tracer instead
of the function graph tracer.

  # cd /sys/kernel/debug/tracing
  # echo schedule > set_ftrace_filter
  # echo function > current_tracer
  # cat set_ftrace_filter
 schedule
  # cat trace
 # tracer: nop
 #
 # entries-in-buffer/entries-written: 692/68108025   #P:4
 #
 #                              _-----=> irqs-off
 #                             / _----=> need-resched
 #                            | / _---=> hardirq/softirq
 #                            || / _--=> preempt-depth
 #                            ||| /     delay
 #           TASK-PID   CPU#  ||||    TIMESTAMP  FUNCTION
 #              | |       |   ||||       |         |
      kworker/0:2-909   [000] ....   531.235574: schedule <-worker_thread
           <idle>-0     [001] .N..   531.235575: schedule <-cpu_idle
      kworker/0:2-909   [000] ....   531.235597: schedule <-worker_thread
             sshd-2563  [001] ....   531.235647: schedule <-schedule_hrtimeout_range_clock

  # echo 1 > function_profile_enabled
  # echo 0 > function_porfile_enabled
  # cat set_ftrace_filter
 schedule
  # cat trace
 # tracer: function
 #
 # entries-in-buffer/entries-written: 159701/118821262   #P:4
 #
 #                              _-----=> irqs-off
 #                             / _----=> need-resched
 #                            | / _---=> hardirq/softirq
 #                            || / _--=> preempt-depth
 #                            ||| /     delay
 #           TASK-PID   CPU#  ||||    TIMESTAMP  FUNCTION
 #              | |       |   ||||       |         |
           <idle>-0     [002] ...1   604.870655: local_touch_nmi <-cpu_idle
           <idle>-0     [002] d..1   604.870655: enter_idle <-cpu_idle
           <idle>-0     [002] d..1   604.870656: atomic_notifier_call_chain <-enter_idle
           <idle>-0     [002] d..1   604.870656: __atomic_notifier_call_chain <-atomic_notifier_call_chain

The same problem could have happened with the trace_probe_ops,
but they are modified with the set_frace_filter file which does the
update at closure of the file.

The simple solution is to change ENABLE to UPDATE and call it every
time an ftrace_ops is unregistered.

Link: http://lkml.kernel.org/r/1323105776-26961-3-git-send-email-jolsa@redhat.com

Cc: stable@vger.kernel.org # 3.0+
Signed-off-by: Jiri Olsa <jolsa@redhat.com>
Signed-off-by: Steven Rostedt <rostedt@goodmis.org>
---
 kernel/trace/ftrace.c | 27 +++++++++++++--------------
 1 file changed, 13 insertions(+), 14 deletions(-)

(limited to 'kernel')

diff --git a/kernel/trace/ftrace.c b/kernel/trace/ftrace.c
index b1e8943fed1d..25b4f4da0fe8 100644
--- a/kernel/trace/ftrace.c
+++ b/kernel/trace/ftrace.c
@@ -948,7 +948,7 @@ struct ftrace_func_probe {
 };
 
 enum {
-	FTRACE_ENABLE_CALLS		= (1 << 0),
+	FTRACE_UPDATE_CALLS		= (1 << 0),
 	FTRACE_DISABLE_CALLS		= (1 << 1),
 	FTRACE_UPDATE_TRACE_FUNC	= (1 << 2),
 	FTRACE_START_FUNC_RET		= (1 << 3),
@@ -1519,7 +1519,7 @@ int ftrace_text_reserved(void *start, void *end)
 
 
 static int
-__ftrace_replace_code(struct dyn_ftrace *rec, int enable)
+__ftrace_replace_code(struct dyn_ftrace *rec, int update)
 {
 	unsigned long ftrace_addr;
 	unsigned long flag = 0UL;
@@ -1527,17 +1527,17 @@ __ftrace_replace_code(struct dyn_ftrace *rec, int enable)
 	ftrace_addr = (unsigned long)FTRACE_ADDR;
 
 	/*
-	 * If we are enabling tracing:
+	 * If we are updating calls:
 	 *
 	 *   If the record has a ref count, then we need to enable it
 	 *   because someone is using it.
 	 *
 	 *   Otherwise we make sure its disabled.
 	 *
-	 * If we are disabling tracing, then disable all records that
+	 * If we are disabling calls, then disable all records that
 	 * are enabled.
 	 */
-	if (enable && (rec->flags & ~FTRACE_FL_MASK))
+	if (update && (rec->flags & ~FTRACE_FL_MASK))
 		flag = FTRACE_FL_ENABLED;
 
 	/* If the state of this record hasn't changed, then do nothing */
@@ -1553,7 +1553,7 @@ __ftrace_replace_code(struct dyn_ftrace *rec, int enable)
 	return ftrace_make_nop(NULL, rec, ftrace_addr);
 }
 
-static void ftrace_replace_code(int enable)
+static void ftrace_replace_code(int update)
 {
 	struct dyn_ftrace *rec;
 	struct ftrace_page *pg;
@@ -1567,7 +1567,7 @@ static void ftrace_replace_code(int enable)
 		if (rec->flags & FTRACE_FL_FREE)
 			continue;
 
-		failed = __ftrace_replace_code(rec, enable);
+		failed = __ftrace_replace_code(rec, update);
 		if (failed) {
 			ftrace_bug(failed, rec->ip);
 			/* Stop processing */
@@ -1623,7 +1623,7 @@ static int __ftrace_modify_code(void *data)
 	 */
 	function_trace_stop++;
 
-	if (*command & FTRACE_ENABLE_CALLS)
+	if (*command & FTRACE_UPDATE_CALLS)
 		ftrace_replace_code(1);
 	else if (*command & FTRACE_DISABLE_CALLS)
 		ftrace_replace_code(0);
@@ -1691,7 +1691,7 @@ static int ftrace_startup(struct ftrace_ops *ops, int command)
 		return -ENODEV;
 
 	ftrace_start_up++;
-	command |= FTRACE_ENABLE_CALLS;
+	command |= FTRACE_UPDATE_CALLS;
 
 	/* ops marked global share the filter hashes */
 	if (ops->flags & FTRACE_OPS_FL_GLOBAL) {
@@ -1743,8 +1743,7 @@ static void ftrace_shutdown(struct ftrace_ops *ops, int command)
 	if (ops != &global_ops || !global_start_up)
 		ops->flags &= ~FTRACE_OPS_FL_ENABLED;
 
-	if (!ftrace_start_up)
-		command |= FTRACE_DISABLE_CALLS;
+	command |= FTRACE_UPDATE_CALLS;
 
 	if (saved_ftrace_func != ftrace_trace_function) {
 		saved_ftrace_func = ftrace_trace_function;
@@ -1766,7 +1765,7 @@ static void ftrace_startup_sysctl(void)
 	saved_ftrace_func = NULL;
 	/* ftrace_start_up is true if we want ftrace running */
 	if (ftrace_start_up)
-		ftrace_run_update_code(FTRACE_ENABLE_CALLS);
+		ftrace_run_update_code(FTRACE_UPDATE_CALLS);
 }
 
 static void ftrace_shutdown_sysctl(void)
@@ -2919,7 +2918,7 @@ ftrace_set_regex(struct ftrace_ops *ops, unsigned char *buf, int len,
 	ret = ftrace_hash_move(ops, enable, orig_hash, hash);
 	if (!ret && ops->flags & FTRACE_OPS_FL_ENABLED
 	    && ftrace_enabled)
-		ftrace_run_update_code(FTRACE_ENABLE_CALLS);
+		ftrace_run_update_code(FTRACE_UPDATE_CALLS);
 
 	mutex_unlock(&ftrace_lock);
 
@@ -3107,7 +3106,7 @@ ftrace_regex_release(struct inode *inode, struct file *file)
 				       orig_hash, iter->hash);
 		if (!ret && (iter->ops->flags & FTRACE_OPS_FL_ENABLED)
 		    && ftrace_enabled)
-			ftrace_run_update_code(FTRACE_ENABLE_CALLS);
+			ftrace_run_update_code(FTRACE_UPDATE_CALLS);
 
 		mutex_unlock(&ftrace_lock);
 	}
-- 
cgit v1.2.3-71-gd317


From c88fd8634ea68e74c7d19fd2621b4078fd22864c Mon Sep 17 00:00:00 2001
From: Steven Rostedt <srostedt@redhat.com>
Date: Tue, 16 Aug 2011 09:53:39 -0400
Subject: ftrace: Allow archs to modify code without stop machine

The stop machine method to modify all functions in the kernel
(some 20,000 of them) is the safest way to do so across all archs.
But some archs may not need this big hammer approach to modify code
on SMP machines, and can simply just update the code it needs.

Adding a weak function arch_ftrace_update_code() that now does the
stop machine, will also let any arch override this method.

If the arch needs to check the system and then decide if it can
avoid stop machine, it can still call ftrace_run_stop_machine() to
use the old method.

Signed-off-by: Steven Rostedt <rostedt@goodmis.org>
---
 include/linux/ftrace.h |  31 ++++++
 kernel/trace/ftrace.c  | 253 +++++++++++++++++++++++++++++++++++++++++--------
 2 files changed, 246 insertions(+), 38 deletions(-)

(limited to 'kernel')

diff --git a/include/linux/ftrace.h b/include/linux/ftrace.h
index 26eafcef75be..4f0b6fec379d 100644
--- a/include/linux/ftrace.h
+++ b/include/linux/ftrace.h
@@ -133,6 +133,8 @@ struct ftrace_func_command {
 int ftrace_arch_code_modify_prepare(void);
 int ftrace_arch_code_modify_post_process(void);
 
+void ftrace_bug(int err, unsigned long ip);
+
 struct seq_file;
 
 struct ftrace_probe_ops {
@@ -190,6 +192,35 @@ void ftrace_set_global_notrace(unsigned char *buf, int len, int reset);
 int register_ftrace_command(struct ftrace_func_command *cmd);
 int unregister_ftrace_command(struct ftrace_func_command *cmd);
 
+enum {
+	FTRACE_UPDATE_CALLS		= (1 << 0),
+	FTRACE_DISABLE_CALLS		= (1 << 1),
+	FTRACE_UPDATE_TRACE_FUNC	= (1 << 2),
+	FTRACE_START_FUNC_RET		= (1 << 3),
+	FTRACE_STOP_FUNC_RET		= (1 << 4),
+};
+
+enum {
+	FTRACE_UPDATE_IGNORE,
+	FTRACE_UPDATE_MAKE_CALL,
+	FTRACE_UPDATE_MAKE_NOP,
+};
+
+void arch_ftrace_update_code(int command);
+
+struct ftrace_rec_iter;
+
+struct ftrace_rec_iter *ftrace_rec_iter_start(void);
+struct ftrace_rec_iter *ftrace_rec_iter_next(struct ftrace_rec_iter *iter);
+struct dyn_ftrace *ftrace_rec_iter_record(struct ftrace_rec_iter *iter);
+
+int ftrace_update_record(struct dyn_ftrace *rec, int enable);
+int ftrace_test_record(struct dyn_ftrace *rec, int enable);
+void ftrace_run_stop_machine(int command);
+int ftrace_location(unsigned long ip);
+
+extern ftrace_func_t ftrace_trace_function;
+
 /* defined in arch */
 extern int ftrace_ip_converted(unsigned long ip);
 extern int ftrace_dyn_arch_init(void *data);
diff --git a/kernel/trace/ftrace.c b/kernel/trace/ftrace.c
index 25b4f4da0fe8..655b432fb890 100644
--- a/kernel/trace/ftrace.c
+++ b/kernel/trace/ftrace.c
@@ -947,13 +947,6 @@ struct ftrace_func_probe {
 	struct rcu_head		rcu;
 };
 
-enum {
-	FTRACE_UPDATE_CALLS		= (1 << 0),
-	FTRACE_DISABLE_CALLS		= (1 << 1),
-	FTRACE_UPDATE_TRACE_FUNC	= (1 << 2),
-	FTRACE_START_FUNC_RET		= (1 << 3),
-	FTRACE_STOP_FUNC_RET		= (1 << 4),
-};
 struct ftrace_func_entry {
 	struct hlist_node hlist;
 	unsigned long ip;
@@ -1307,6 +1300,28 @@ ftrace_ops_test(struct ftrace_ops *ops, unsigned long ip)
 		}				\
 	}
 
+/**
+ * ftrace_location - return true if the ip giving is a traced location
+ * @ip: the instruction pointer to check
+ *
+ * Returns 1 if @ip given is a pointer to a ftrace location.
+ * That is, the instruction that is either a NOP or call to
+ * the function tracer. It checks the ftrace internal tables to
+ * determine if the address belongs or not.
+ */
+int ftrace_location(unsigned long ip)
+{
+	struct ftrace_page *pg;
+	struct dyn_ftrace *rec;
+
+	do_for_each_ftrace_rec(pg, rec) {
+		if (rec->ip == ip)
+			return 1;
+	} while_for_each_ftrace_rec();
+
+	return 0;
+}
+
 static void __ftrace_hash_rec_update(struct ftrace_ops *ops,
 				     int filter_hash,
 				     bool inc)
@@ -1475,7 +1490,19 @@ static void print_ip_ins(const char *fmt, unsigned char *p)
 		printk(KERN_CONT "%s%02x", i ? ":" : "", p[i]);
 }
 
-static void ftrace_bug(int failed, unsigned long ip)
+/**
+ * ftrace_bug - report and shutdown function tracer
+ * @failed: The failed type (EFAULT, EINVAL, EPERM)
+ * @ip: The address that failed
+ *
+ * The arch code that enables or disables the function tracing
+ * can call ftrace_bug() when it has detected a problem in
+ * modifying the code. @failed should be one of either:
+ * EFAULT - if the problem happens on reading the @ip address
+ * EINVAL - if what is read at @ip is not what was expected
+ * EPERM - if the problem happens on writting to the @ip address
+ */
+void ftrace_bug(int failed, unsigned long ip)
 {
 	switch (failed) {
 	case -EFAULT:
@@ -1517,15 +1544,10 @@ int ftrace_text_reserved(void *start, void *end)
 	return 0;
 }
 
-
-static int
-__ftrace_replace_code(struct dyn_ftrace *rec, int update)
+static int ftrace_check_record(struct dyn_ftrace *rec, int enable, int update)
 {
-	unsigned long ftrace_addr;
 	unsigned long flag = 0UL;
 
-	ftrace_addr = (unsigned long)FTRACE_ADDR;
-
 	/*
 	 * If we are updating calls:
 	 *
@@ -1537,20 +1559,74 @@ __ftrace_replace_code(struct dyn_ftrace *rec, int update)
 	 * If we are disabling calls, then disable all records that
 	 * are enabled.
 	 */
-	if (update && (rec->flags & ~FTRACE_FL_MASK))
+	if (enable && (rec->flags & ~FTRACE_FL_MASK))
 		flag = FTRACE_FL_ENABLED;
 
 	/* If the state of this record hasn't changed, then do nothing */
 	if ((rec->flags & FTRACE_FL_ENABLED) == flag)
-		return 0;
+		return FTRACE_UPDATE_IGNORE;
 
 	if (flag) {
-		rec->flags |= FTRACE_FL_ENABLED;
+		if (update)
+			rec->flags |= FTRACE_FL_ENABLED;
+		return FTRACE_UPDATE_MAKE_CALL;
+	}
+
+	if (update)
+		rec->flags &= ~FTRACE_FL_ENABLED;
+
+	return FTRACE_UPDATE_MAKE_NOP;
+}
+
+/**
+ * ftrace_update_record, set a record that now is tracing or not
+ * @rec: the record to update
+ * @enable: set to 1 if the record is tracing, zero to force disable
+ *
+ * The records that represent all functions that can be traced need
+ * to be updated when tracing has been enabled.
+ */
+int ftrace_update_record(struct dyn_ftrace *rec, int enable)
+{
+	return ftrace_check_record(rec, enable, 1);
+}
+
+/**
+ * ftrace_test_record, check if the record has been enabled or not
+ * @rec: the record to test
+ * @enable: set to 1 to check if enabled, 0 if it is disabled
+ *
+ * The arch code may need to test if a record is already set to
+ * tracing to determine how to modify the function code that it
+ * represents.
+ */
+int ftrace_test_record(struct dyn_ftrace *rec, int enable)
+{
+	return ftrace_check_record(rec, enable, 0);
+}
+
+static int
+__ftrace_replace_code(struct dyn_ftrace *rec, int enable)
+{
+	unsigned long ftrace_addr;
+	int ret;
+
+	ftrace_addr = (unsigned long)FTRACE_ADDR;
+
+	ret = ftrace_update_record(rec, enable);
+
+	switch (ret) {
+	case FTRACE_UPDATE_IGNORE:
+		return 0;
+
+	case FTRACE_UPDATE_MAKE_CALL:
 		return ftrace_make_call(rec, ftrace_addr);
+
+	case FTRACE_UPDATE_MAKE_NOP:
+		return ftrace_make_nop(NULL, rec, ftrace_addr);
 	}
 
-	rec->flags &= ~FTRACE_FL_ENABLED;
-	return ftrace_make_nop(NULL, rec, ftrace_addr);
+	return -1; /* unknow ftrace bug */
 }
 
 static void ftrace_replace_code(int update)
@@ -1576,6 +1652,78 @@ static void ftrace_replace_code(int update)
 	} while_for_each_ftrace_rec();
 }
 
+struct ftrace_rec_iter {
+	struct ftrace_page	*pg;
+	int			index;
+};
+
+/**
+ * ftrace_rec_iter_start, start up iterating over traced functions
+ *
+ * Returns an iterator handle that is used to iterate over all
+ * the records that represent address locations where functions
+ * are traced.
+ *
+ * May return NULL if no records are available.
+ */
+struct ftrace_rec_iter *ftrace_rec_iter_start(void)
+{
+	/*
+	 * We only use a single iterator.
+	 * Protected by the ftrace_lock mutex.
+	 */
+	static struct ftrace_rec_iter ftrace_rec_iter;
+	struct ftrace_rec_iter *iter = &ftrace_rec_iter;
+
+	iter->pg = ftrace_pages_start;
+	iter->index = 0;
+
+	/* Could have empty pages */
+	while (iter->pg && !iter->pg->index)
+		iter->pg = iter->pg->next;
+
+	if (!iter->pg)
+		return NULL;
+
+	return iter;
+}
+
+/**
+ * ftrace_rec_iter_next, get the next record to process.
+ * @iter: The handle to the iterator.
+ *
+ * Returns the next iterator after the given iterator @iter.
+ */
+struct ftrace_rec_iter *ftrace_rec_iter_next(struct ftrace_rec_iter *iter)
+{
+	iter->index++;
+
+	if (iter->index >= iter->pg->index) {
+		iter->pg = iter->pg->next;
+		iter->index = 0;
+
+		/* Could have empty pages */
+		while (iter->pg && !iter->pg->index)
+			iter->pg = iter->pg->next;
+	}
+
+	if (!iter->pg)
+		return NULL;
+
+	return iter;
+}
+
+/**
+ * ftrace_rec_iter_record, get the record at the iterator location
+ * @iter: The current iterator location
+ *
+ * Returns the record that the current @iter is at.
+ */
+struct dyn_ftrace *ftrace_rec_iter_record(struct ftrace_rec_iter *iter)
+{
+	return &iter->pg->records[iter->index];
+}
+
 static int
 ftrace_code_disable(struct module *mod, struct dyn_ftrace *rec)
 {
@@ -1617,12 +1765,6 @@ static int __ftrace_modify_code(void *data)
 {
 	int *command = data;
 
-	/*
-	 * Do not call function tracer while we update the code.
-	 * We are in stop machine, no worrying about races.
-	 */
-	function_trace_stop++;
-
 	if (*command & FTRACE_UPDATE_CALLS)
 		ftrace_replace_code(1);
 	else if (*command & FTRACE_DISABLE_CALLS)
@@ -1636,21 +1778,33 @@ static int __ftrace_modify_code(void *data)
 	else if (*command & FTRACE_STOP_FUNC_RET)
 		ftrace_disable_ftrace_graph_caller();
 
-#ifndef CONFIG_HAVE_FUNCTION_TRACE_MCOUNT_TEST
-	/*
-	 * For archs that call ftrace_test_stop_func(), we must
-	 * wait till after we update all the function callers
-	 * before we update the callback. This keeps different
-	 * ops that record different functions from corrupting
-	 * each other.
-	 */
-	__ftrace_trace_function = __ftrace_trace_function_delay;
-#endif
-	function_trace_stop--;
-
 	return 0;
 }
 
+/**
+ * ftrace_run_stop_machine, go back to the stop machine method
+ * @command: The command to tell ftrace what to do
+ *
+ * If an arch needs to fall back to the stop machine method, the
+ * it can call this function.
+ */
+void ftrace_run_stop_machine(int command)
+{
+	stop_machine(__ftrace_modify_code, &command, NULL);
+}
+
+/**
+ * arch_ftrace_update_code, modify the code to trace or not trace
+ * @command: The command that needs to be done
+ *
+ * Archs can override this function if it does not need to
+ * run stop_machine() to modify code.
+ */
+void __weak arch_ftrace_update_code(int command)
+{
+	ftrace_run_stop_machine(command);
+}
+
 static void ftrace_run_update_code(int command)
 {
 	int ret;
@@ -1659,8 +1813,31 @@ static void ftrace_run_update_code(int command)
 	FTRACE_WARN_ON(ret);
 	if (ret)
 		return;
+	/*
+	 * Do not call function tracer while we update the code.
+	 * We are in stop machine.
+	 */
+	function_trace_stop++;
 
-	stop_machine(__ftrace_modify_code, &command, NULL);
+	/*
+	 * By default we use stop_machine() to modify the code.
+	 * But archs can do what ever they want as long as it
+	 * is safe. The stop_machine() is the safest, but also
+	 * produces the most overhead.
+	 */
+	arch_ftrace_update_code(command);
+
+#ifndef CONFIG_HAVE_FUNCTION_TRACE_MCOUNT_TEST
+	/*
+	 * For archs that call ftrace_test_stop_func(), we must
+	 * wait till after we update all the function callers
+	 * before we update the callback. This keeps different
+	 * ops that record different functions from corrupting
+	 * each other.
+	 */
+	__ftrace_trace_function = __ftrace_trace_function_delay;
+#endif
+	function_trace_stop--;
 
 	ret = ftrace_arch_code_modify_post_process();
 	FTRACE_WARN_ON(ret);
-- 
cgit v1.2.3-71-gd317


From 3208230983a0ee3d95be22d463257e530c684956 Mon Sep 17 00:00:00 2001
From: Steven Rostedt <srostedt@redhat.com>
Date: Fri, 16 Dec 2011 14:42:37 -0500
Subject: ftrace: Remove usage of "freed" records

Records that are added to the function trace table are
permanently there, except for modules. By separating out the
modules to their own pages that can be freed in one shot
we can remove the "freed" flag and simplify some of the record
management.

Another benefit of doing this is that we can also move the
records around; sort them.

Signed-off-by: Steven Rostedt <rostedt@goodmis.org>
---
 include/linux/ftrace.h |   1 -
 kernel/trace/ftrace.c  | 100 ++++++++++++++++++++++++-------------------------
 2 files changed, 49 insertions(+), 52 deletions(-)

(limited to 'kernel')

diff --git a/include/linux/ftrace.h b/include/linux/ftrace.h
index 4f0b6fec379d..3f79bc458bff 100644
--- a/include/linux/ftrace.h
+++ b/include/linux/ftrace.h
@@ -163,7 +163,6 @@ extern int ftrace_text_reserved(void *start, void *end);
 
 enum {
 	FTRACE_FL_ENABLED	= (1 << 30),
-	FTRACE_FL_FREE		= (1 << 31),
 };
 
 #define FTRACE_FL_MASK		(0x3UL << 30)
diff --git a/kernel/trace/ftrace.c b/kernel/trace/ftrace.c
index 655b432fb890..be6888f40d2b 100644
--- a/kernel/trace/ftrace.c
+++ b/kernel/trace/ftrace.c
@@ -996,8 +996,6 @@ struct ftrace_page {
 static struct ftrace_page	*ftrace_pages_start;
 static struct ftrace_page	*ftrace_pages;
 
-static struct dyn_ftrace *ftrace_free_records;
-
 static struct ftrace_func_entry *
 ftrace_lookup_ip(struct ftrace_hash *hash, unsigned long ip)
 {
@@ -1421,32 +1419,8 @@ static void ftrace_hash_rec_enable(struct ftrace_ops *ops,
 	__ftrace_hash_rec_update(ops, filter_hash, 1);
 }
 
-static void ftrace_free_rec(struct dyn_ftrace *rec)
-{
-	rec->freelist = ftrace_free_records;
-	ftrace_free_records = rec;
-	rec->flags |= FTRACE_FL_FREE;
-}
-
 static struct dyn_ftrace *ftrace_alloc_dyn_node(unsigned long ip)
 {
-	struct dyn_ftrace *rec;
-
-	/* First check for freed records */
-	if (ftrace_free_records) {
-		rec = ftrace_free_records;
-
-		if (unlikely(!(rec->flags & FTRACE_FL_FREE))) {
-			FTRACE_WARN_ON_ONCE(1);
-			ftrace_free_records = NULL;
-			return NULL;
-		}
-
-		ftrace_free_records = rec->freelist;
-		memset(rec, 0, sizeof(*rec));
-		return rec;
-	}
-
 	if (ftrace_pages->index == ENTRIES_PER_PAGE) {
 		if (!ftrace_pages->next) {
 			/* allocate another page */
@@ -1639,10 +1613,6 @@ static void ftrace_replace_code(int update)
 		return;
 
 	do_for_each_ftrace_rec(pg, rec) {
-		/* Skip over free records */
-		if (rec->flags & FTRACE_FL_FREE)
-			continue;
-
 		failed = __ftrace_replace_code(rec, update);
 		if (failed) {
 			ftrace_bug(failed, rec->ip);
@@ -2007,11 +1977,8 @@ static int ftrace_update_code(struct module *mod)
 		 * Do the initial record conversion from mcount jump
 		 * to the NOP instructions.
 		 */
-		if (!ftrace_code_disable(mod, p)) {
-			ftrace_free_rec(p);
-			/* Game over */
+		if (!ftrace_code_disable(mod, p))
 			break;
-		}
 
 		ftrace_update_cnt++;
 
@@ -2026,10 +1993,8 @@ static int ftrace_update_code(struct module *mod)
 		 */
 		if (ftrace_start_up && ref) {
 			int failed = __ftrace_replace_code(p, 1);
-			if (failed) {
+			if (failed)
 				ftrace_bug(failed, p->ip);
-				ftrace_free_rec(p);
-			}
 		}
 	}
 
@@ -2223,9 +2188,7 @@ t_next(struct seq_file *m, void *v, loff_t *pos)
 		}
 	} else {
 		rec = &iter->pg->records[iter->idx++];
-		if ((rec->flags & FTRACE_FL_FREE) ||
-
-		    ((iter->flags & FTRACE_ITER_FILTER) &&
+		if (((iter->flags & FTRACE_ITER_FILTER) &&
 		     !(ftrace_lookup_ip(ops->filter_hash, rec->ip))) ||
 
 		    ((iter->flags & FTRACE_ITER_NOTRACE) &&
@@ -2602,7 +2565,6 @@ match_records(struct ftrace_hash *hash, char *buff,
 		goto out_unlock;
 
 	do_for_each_ftrace_rec(pg, rec) {
-
 		if (ftrace_match_record(rec, mod, search, search_len, type)) {
 			ret = enter_record(hash, rec, not);
 			if (ret < 0) {
@@ -3446,9 +3408,6 @@ ftrace_set_func(unsigned long *array, int *idx, char *buffer)
 
 	do_for_each_ftrace_rec(pg, rec) {
 
-		if (rec->flags & FTRACE_FL_FREE)
-			continue;
-
 		if (ftrace_match_record(rec, NULL, search, search_len, type)) {
 			/* if it is in the array */
 			exists = false;
@@ -3566,6 +3525,27 @@ static int ftrace_process_locs(struct module *mod,
 	unsigned long flags = 0; /* Shut up gcc */
 
 	mutex_lock(&ftrace_lock);
+	/*
+	 * Core and each module needs their own pages, as
+	 * modules will free them when they are removed.
+	 * Force a new page to be allocated for modules.
+	 */
+	if (mod) {
+		if (!ftrace_pages)
+			return -ENOMEM;
+
+		/*
+		 * If the last page was full, it will be
+		 * allocated anyway.
+		 */
+		if (ftrace_pages->index != ENTRIES_PER_PAGE) {
+			ftrace_pages->next = (void *)get_zeroed_page(GFP_KERNEL);
+			if (!ftrace_pages->next)
+				return -ENOMEM;
+			ftrace_pages = ftrace_pages->next;
+		}
+	}
+
 	p = start;
 	while (p < end) {
 		addr = ftrace_call_adjust(*p++);
@@ -3599,9 +3579,13 @@ static int ftrace_process_locs(struct module *mod,
 }
 
 #ifdef CONFIG_MODULES
+
+#define next_to_ftrace_page(p) container_of(p, struct ftrace_page, next)
+
 void ftrace_release_mod(struct module *mod)
 {
 	struct dyn_ftrace *rec;
+	struct ftrace_page **last_pg;
 	struct ftrace_page *pg;
 
 	mutex_lock(&ftrace_lock);
@@ -3609,16 +3593,30 @@ void ftrace_release_mod(struct module *mod)
 	if (ftrace_disabled)
 		goto out_unlock;
 
-	do_for_each_ftrace_rec(pg, rec) {
+	/*
+	 * Each module has its own ftrace_pages, remove
+	 * them from the list.
+	 */
+	last_pg = &ftrace_pages_start;
+	for (pg = ftrace_pages_start; pg; pg = *last_pg) {
+		rec = &pg->records[0];
 		if (within_module_core(rec->ip, mod)) {
 			/*
-			 * rec->ip is changed in ftrace_free_rec()
-			 * It should not between s and e if record was freed.
+			 * As core pages are first, the first
+			 * page should never be a module page.
 			 */
-			FTRACE_WARN_ON(rec->flags & FTRACE_FL_FREE);
-			ftrace_free_rec(rec);
-		}
-	} while_for_each_ftrace_rec();
+			if (WARN_ON(pg == ftrace_pages_start))
+				goto out_unlock;
+
+			/* Check if we are deleting the last page */
+			if (pg == ftrace_pages)
+				ftrace_pages = next_to_ftrace_page(last_pg);
+
+			*last_pg = pg->next;
+			free_page((unsigned long)pg);
+		} else
+			last_pg = &pg->next;
+	}
  out_unlock:
 	mutex_unlock(&ftrace_lock);
 }
-- 
cgit v1.2.3-71-gd317


From a79008755497daff157f5294c02e3b940641cc11 Mon Sep 17 00:00:00 2001
From: Steven Rostedt <srostedt@redhat.com>
Date: Fri, 16 Dec 2011 16:23:44 -0500
Subject: ftrace: Allocate the mcount record pages as groups

Allocate the mcount record pages as a group of pages as big
as can be allocated and waste no more than a single page.

Grouping the mcount pages as much as possible helps with cache
locality, as we do not need to redirect with descriptors as we
cross from page to page. It also allows us to do more with the
records later on (sort them with bigger benefits).

Signed-off-by: Steven Rostedt <rostedt@goodmis.org>
---
 kernel/trace/ftrace.c | 179 ++++++++++++++++++++++++++++++++++++--------------
 1 file changed, 128 insertions(+), 51 deletions(-)

(limited to 'kernel')

diff --git a/kernel/trace/ftrace.c b/kernel/trace/ftrace.c
index be6888f40d2b..2e7218869fe9 100644
--- a/kernel/trace/ftrace.c
+++ b/kernel/trace/ftrace.c
@@ -983,12 +983,13 @@ static DEFINE_MUTEX(ftrace_regex_lock);
 
 struct ftrace_page {
 	struct ftrace_page	*next;
+	struct dyn_ftrace	*records;
 	int			index;
-	struct dyn_ftrace	records[];
+	int			size;
 };
 
-#define ENTRIES_PER_PAGE \
-  ((PAGE_SIZE - sizeof(struct ftrace_page)) / sizeof(struct dyn_ftrace))
+#define ENTRY_SIZE sizeof(struct dyn_ftrace)
+#define ENTRIES_PER_PAGE (PAGE_SIZE / ENTRY_SIZE)
 
 /* estimate from running different kernels */
 #define NR_TO_INIT		10000
@@ -1421,14 +1422,10 @@ static void ftrace_hash_rec_enable(struct ftrace_ops *ops,
 
 static struct dyn_ftrace *ftrace_alloc_dyn_node(unsigned long ip)
 {
-	if (ftrace_pages->index == ENTRIES_PER_PAGE) {
-		if (!ftrace_pages->next) {
-			/* allocate another page */
-			ftrace_pages->next =
-				(void *)get_zeroed_page(GFP_KERNEL);
-			if (!ftrace_pages->next)
-				return NULL;
-		}
+	if (ftrace_pages->index == ftrace_pages->size) {
+		/* We should have allocated enough */
+		if (WARN_ON(!ftrace_pages->next))
+			return NULL;
 		ftrace_pages = ftrace_pages->next;
 	}
 
@@ -2005,47 +2002,106 @@ static int ftrace_update_code(struct module *mod)
 	return 0;
 }
 
-static int __init ftrace_dyn_table_alloc(unsigned long num_to_init)
+static int ftrace_allocate_records(struct ftrace_page *pg, int count)
 {
-	struct ftrace_page *pg;
+	int order;
 	int cnt;
-	int i;
 
-	/* allocate a few pages */
-	ftrace_pages_start = (void *)get_zeroed_page(GFP_KERNEL);
-	if (!ftrace_pages_start)
-		return -1;
+	if (WARN_ON(!count))
+		return -EINVAL;
+
+	order = get_count_order(DIV_ROUND_UP(count, ENTRIES_PER_PAGE));
 
 	/*
-	 * Allocate a few more pages.
-	 *
-	 * TODO: have some parser search vmlinux before
-	 *   final linking to find all calls to ftrace.
-	 *   Then we can:
-	 *    a) know how many pages to allocate.
-	 *     and/or
-	 *    b) set up the table then.
-	 *
-	 *  The dynamic code is still necessary for
-	 *  modules.
+	 * We want to fill as much as possible. No more than a page
+	 * may be empty.
 	 */
+	while ((PAGE_SIZE << order) / ENTRY_SIZE >= count + ENTRIES_PER_PAGE)
+		order--;
 
-	pg = ftrace_pages = ftrace_pages_start;
+ again:
+	pg->records = (void *)__get_free_pages(GFP_KERNEL | __GFP_ZERO, order);
 
-	cnt = num_to_init / ENTRIES_PER_PAGE;
-	pr_info("ftrace: allocating %ld entries in %d pages\n",
-		num_to_init, cnt + 1);
+	if (!pg->records) {
+		/* if we can't allocate this size, try something smaller */
+		if (!order)
+			return -ENOMEM;
+		order >>= 1;
+		goto again;
+	}
 
-	for (i = 0; i < cnt; i++) {
-		pg->next = (void *)get_zeroed_page(GFP_KERNEL);
+	cnt = (PAGE_SIZE << order) / ENTRY_SIZE;
+	pg->size = cnt;
 
-		/* If we fail, we'll try later anyway */
-		if (!pg->next)
+	if (cnt > count)
+		cnt = count;
+
+	return cnt;
+}
+
+static struct ftrace_page *
+ftrace_allocate_pages(unsigned long num_to_init)
+{
+	struct ftrace_page *start_pg;
+	struct ftrace_page *pg;
+	int order;
+	int cnt;
+
+	if (!num_to_init)
+		return 0;
+
+	start_pg = pg = kzalloc(sizeof(*pg), GFP_KERNEL);
+	if (!pg)
+		return NULL;
+
+	/*
+	 * Try to allocate as much as possible in one continues
+	 * location that fills in all of the space. We want to
+	 * waste as little space as possible.
+	 */
+	for (;;) {
+		cnt = ftrace_allocate_records(pg, num_to_init);
+		if (cnt < 0)
+			goto free_pages;
+
+		num_to_init -= cnt;
+		if (!num_to_init)
 			break;
 
+		pg->next = kzalloc(sizeof(*pg), GFP_KERNEL);
+		if (!pg->next)
+			goto free_pages;
+
 		pg = pg->next;
 	}
 
+	return start_pg;
+
+ free_pages:
+	while (start_pg) {
+		order = get_count_order(pg->size / ENTRIES_PER_PAGE);
+		free_pages((unsigned long)pg->records, order);
+		start_pg = pg->next;
+		kfree(pg);
+		pg = start_pg;
+	}
+	pr_info("ftrace: FAILED to allocate memory for functions\n");
+	return NULL;
+}
+
+static int __init ftrace_dyn_table_alloc(unsigned long num_to_init)
+{
+	int cnt;
+
+	if (!num_to_init) {
+		pr_info("ftrace: No functions to be traced?\n");
+		return -1;
+	}
+
+	cnt = num_to_init / ENTRIES_PER_PAGE;
+	pr_info("ftrace: allocating %ld entries in %d pages\n",
+		num_to_init, cnt + 1);
+
 	return 0;
 }
 
@@ -3520,30 +3576,45 @@ static int ftrace_process_locs(struct module *mod,
 			       unsigned long *start,
 			       unsigned long *end)
 {
+	struct ftrace_page *pg;
+	unsigned long count;
 	unsigned long *p;
 	unsigned long addr;
 	unsigned long flags = 0; /* Shut up gcc */
+	int ret = -ENOMEM;
+
+	count = end - start;
+
+	if (!count)
+		return 0;
+
+	pg = ftrace_allocate_pages(count);
+	if (!pg)
+		return -ENOMEM;
 
 	mutex_lock(&ftrace_lock);
+
 	/*
 	 * Core and each module needs their own pages, as
 	 * modules will free them when they are removed.
 	 * Force a new page to be allocated for modules.
 	 */
-	if (mod) {
+	if (!mod) {
+		WARN_ON(ftrace_pages || ftrace_pages_start);
+		/* First initialization */
+		ftrace_pages = ftrace_pages_start = pg;
+	} else {
 		if (!ftrace_pages)
-			return -ENOMEM;
+			goto out;
 
-		/*
-		 * If the last page was full, it will be
-		 * allocated anyway.
-		 */
-		if (ftrace_pages->index != ENTRIES_PER_PAGE) {
-			ftrace_pages->next = (void *)get_zeroed_page(GFP_KERNEL);
-			if (!ftrace_pages->next)
-				return -ENOMEM;
-			ftrace_pages = ftrace_pages->next;
+		if (WARN_ON(ftrace_pages->next)) {
+			/* Hmm, we have free pages? */
+			while (ftrace_pages->next)
+				ftrace_pages = ftrace_pages->next;
 		}
+
+		ftrace_pages->next = pg;
+		ftrace_pages = pg;
 	}
 
 	p = start;
@@ -3557,7 +3628,8 @@ static int ftrace_process_locs(struct module *mod,
 		 */
 		if (!addr)
 			continue;
-		ftrace_record_ip(addr);
+		if (!ftrace_record_ip(addr))
+			break;
 	}
 
 	/*
@@ -3573,9 +3645,11 @@ static int ftrace_process_locs(struct module *mod,
 	ftrace_update_code(mod);
 	if (!mod)
 		local_irq_restore(flags);
+	ret = 0;
+ out:
 	mutex_unlock(&ftrace_lock);
 
-	return 0;
+	return ret;
 }
 
 #ifdef CONFIG_MODULES
@@ -3587,6 +3661,7 @@ void ftrace_release_mod(struct module *mod)
 	struct dyn_ftrace *rec;
 	struct ftrace_page **last_pg;
 	struct ftrace_page *pg;
+	int order;
 
 	mutex_lock(&ftrace_lock);
 
@@ -3613,7 +3688,9 @@ void ftrace_release_mod(struct module *mod)
 				ftrace_pages = next_to_ftrace_page(last_pg);
 
 			*last_pg = pg->next;
-			free_page((unsigned long)pg);
+			order = get_count_order(pg->size / ENTRIES_PER_PAGE);
+			free_pages((unsigned long)pg->records, order);
+			kfree(pg);
 		} else
 			last_pg = &pg->next;
 	}
-- 
cgit v1.2.3-71-gd317


From 85ae32ae019bc1c2cc22e5f51fe0c9f2812ef68c Mon Sep 17 00:00:00 2001
From: Steven Rostedt <srostedt@redhat.com>
Date: Fri, 16 Dec 2011 16:30:31 -0500
Subject: ftrace: Replace record newlist with record page list

As new functions come in to be initalized from mcount to nop,
they are done by groups of pages. Whether it is the core kernel
or a module. There's no need to keep track of these on a per record
basis.

At startup, and as any module is loaded, the functions to be
traced are stored in a group of pages and added to the function
list at the end. We just need to keep a pointer to the first
page of the list that was added, and use that to know where to
start on the list for initializing functions.

Signed-off-by: Steven Rostedt <rostedt@goodmis.org>
---
 include/linux/ftrace.h |  5 +---
 kernel/trace/ftrace.c  | 68 +++++++++++++++++++++++++++-----------------------
 2 files changed, 38 insertions(+), 35 deletions(-)

(limited to 'kernel')

diff --git a/include/linux/ftrace.h b/include/linux/ftrace.h
index 3f79bc458bff..31b9fd7aedcd 100644
--- a/include/linux/ftrace.h
+++ b/include/linux/ftrace.h
@@ -173,10 +173,7 @@ struct dyn_ftrace {
 		unsigned long		ip; /* address of mcount call-site */
 		struct dyn_ftrace	*freelist;
 	};
-	union {
-		unsigned long		flags;
-		struct dyn_ftrace	*newlist;
-	};
+	unsigned long		flags;
 	struct dyn_arch_ftrace		arch;
 };
 
diff --git a/kernel/trace/ftrace.c b/kernel/trace/ftrace.c
index 2e7218869fe9..366d7881f188 100644
--- a/kernel/trace/ftrace.c
+++ b/kernel/trace/ftrace.c
@@ -977,8 +977,6 @@ static struct ftrace_ops global_ops = {
 	.filter_hash		= EMPTY_HASH,
 };
 
-static struct dyn_ftrace *ftrace_new_addrs;
-
 static DEFINE_MUTEX(ftrace_regex_lock);
 
 struct ftrace_page {
@@ -988,6 +986,8 @@ struct ftrace_page {
 	int			size;
 };
 
+static struct ftrace_page *ftrace_new_pgs;
+
 #define ENTRY_SIZE sizeof(struct dyn_ftrace)
 #define ENTRIES_PER_PAGE (PAGE_SIZE / ENTRY_SIZE)
 
@@ -1445,8 +1445,6 @@ ftrace_record_ip(unsigned long ip)
 		return NULL;
 
 	rec->ip = ip;
-	rec->newlist = ftrace_new_addrs;
-	ftrace_new_addrs = rec;
 
 	return rec;
 }
@@ -1936,9 +1934,11 @@ static int ops_traces_mod(struct ftrace_ops *ops)
 
 static int ftrace_update_code(struct module *mod)
 {
+	struct ftrace_page *pg;
 	struct dyn_ftrace *p;
 	cycle_t start, stop;
 	unsigned long ref = 0;
+	int i;
 
 	/*
 	 * When adding a module, we need to check if tracers are
@@ -1960,41 +1960,44 @@ static int ftrace_update_code(struct module *mod)
 	start = ftrace_now(raw_smp_processor_id());
 	ftrace_update_cnt = 0;
 
-	while (ftrace_new_addrs) {
+	for (pg = ftrace_new_pgs; pg; pg = pg->next) {
 
-		/* If something went wrong, bail without enabling anything */
-		if (unlikely(ftrace_disabled))
-			return -1;
+		for (i = 0; i < pg->index; i++) {
+			/* If something went wrong, bail without enabling anything */
+			if (unlikely(ftrace_disabled))
+				return -1;
 
-		p = ftrace_new_addrs;
-		ftrace_new_addrs = p->newlist;
-		p->flags = ref;
+			p = &pg->records[i];
+			p->flags = ref;
 
-		/*
-		 * Do the initial record conversion from mcount jump
-		 * to the NOP instructions.
-		 */
-		if (!ftrace_code_disable(mod, p))
-			break;
+			/*
+			 * Do the initial record conversion from mcount jump
+			 * to the NOP instructions.
+			 */
+			if (!ftrace_code_disable(mod, p))
+				break;
 
-		ftrace_update_cnt++;
+			ftrace_update_cnt++;
 
-		/*
-		 * If the tracing is enabled, go ahead and enable the record.
-		 *
-		 * The reason not to enable the record immediatelly is the
-		 * inherent check of ftrace_make_nop/ftrace_make_call for
-		 * correct previous instructions.  Making first the NOP
-		 * conversion puts the module to the correct state, thus
-		 * passing the ftrace_make_call check.
-		 */
-		if (ftrace_start_up && ref) {
-			int failed = __ftrace_replace_code(p, 1);
-			if (failed)
-				ftrace_bug(failed, p->ip);
+			/*
+			 * If the tracing is enabled, go ahead and enable the record.
+			 *
+			 * The reason not to enable the record immediatelly is the
+			 * inherent check of ftrace_make_nop/ftrace_make_call for
+			 * correct previous instructions.  Making first the NOP
+			 * conversion puts the module to the correct state, thus
+			 * passing the ftrace_make_call check.
+			 */
+			if (ftrace_start_up && ref) {
+				int failed = __ftrace_replace_code(p, 1);
+				if (failed)
+					ftrace_bug(failed, p->ip);
+			}
 		}
 	}
 
+	ftrace_new_pgs = NULL;
+
 	stop = ftrace_now(raw_smp_processor_id());
 	ftrace_update_time = stop - start;
 	ftrace_update_tot_cnt += ftrace_update_cnt;
@@ -3632,6 +3635,9 @@ static int ftrace_process_locs(struct module *mod,
 			break;
 	}
 
+	/* These new locations need to be initialized */
+	ftrace_new_pgs = pg;
+
 	/*
 	 * We only need to disable interrupts on start up
 	 * because we are modifying code that an interrupt
-- 
cgit v1.2.3-71-gd317


From 68950619f8c82e468d8976130462617abea605a8 Mon Sep 17 00:00:00 2001
From: Steven Rostedt <srostedt@redhat.com>
Date: Fri, 16 Dec 2011 17:06:45 -0500
Subject: ftrace: Sort the mcount records on each page

Sort records by ip locations of the ftrace mcount calls on each of the
set of pages in the function list. This helps in localizing cache
usuage when updating the function locations, as well as gives us
the ability to quickly find an ip location in the list.

Signed-off-by: Steven Rostedt <rostedt@goodmis.org>
---
 kernel/trace/ftrace.c | 29 +++++++++++++++++++++++++++++
 1 file changed, 29 insertions(+)

(limited to 'kernel')

diff --git a/kernel/trace/ftrace.c b/kernel/trace/ftrace.c
index 366d7881f188..2d6f8bcd1884 100644
--- a/kernel/trace/ftrace.c
+++ b/kernel/trace/ftrace.c
@@ -27,6 +27,7 @@
 #include <linux/sysctl.h>
 #include <linux/slab.h>
 #include <linux/ctype.h>
+#include <linux/sort.h>
 #include <linux/list.h>
 #include <linux/hash.h>
 #include <linux/rcupdate.h>
@@ -3575,6 +3576,29 @@ static __init int ftrace_init_dyn_debugfs(struct dentry *d_tracer)
 	return 0;
 }
 
+static void ftrace_swap_recs(void *a, void *b, int size)
+{
+	struct dyn_ftrace *reca = a;
+	struct dyn_ftrace *recb = b;
+	struct dyn_ftrace t;
+
+	t = *reca;
+	*reca = *recb;
+	*recb = t;
+}
+
+static int ftrace_cmp_recs(const void *a, const void *b)
+{
+	const struct dyn_ftrace *reca = a;
+	const struct dyn_ftrace *recb = b;
+
+	if (reca->ip > recb->ip)
+		return 1;
+	if (reca->ip < recb->ip)
+		return -1;
+	return 0;
+}
+
 static int ftrace_process_locs(struct module *mod,
 			       unsigned long *start,
 			       unsigned long *end)
@@ -3638,6 +3662,11 @@ static int ftrace_process_locs(struct module *mod,
 	/* These new locations need to be initialized */
 	ftrace_new_pgs = pg;
 
+	/* Make each individual set of pages sorted by ips */
+	for (; pg; pg = pg->next)
+		sort(pg->records, pg->index, sizeof(struct dyn_ftrace),
+		     ftrace_cmp_recs, ftrace_swap_recs);
+
 	/*
 	 * We only need to disable interrupts on start up
 	 * because we are modifying code that an interrupt
-- 
cgit v1.2.3-71-gd317


From 5855fead9cc358adebd6bdeec202d040c623ae38 Mon Sep 17 00:00:00 2001
From: Steven Rostedt <srostedt@redhat.com>
Date: Fri, 16 Dec 2011 19:27:42 -0500
Subject: ftrace: Use bsearch to find record ip

Now that each set of pages in the function list are sorted by
ip, we can use bsearch to find a record within each set of pages.
This speeds up the ftrace_location() function by magnitudes.

For archs (like x86) that need to add a breakpoint at every function
that will be converted from a nop to a callback and vice versa,
the breakpoint callback needs to know if the breakpoint was for
ftrace or not. It requires finding the breakpoint ip within the
records. Doing a linear search is extremely inefficient. It is
a must to be able to do a fast binary search to find these locations.

Signed-off-by: Steven Rostedt <rostedt@goodmis.org>
---
 kernel/trace/ftrace.c | 38 +++++++++++++++++++++++---------------
 1 file changed, 23 insertions(+), 15 deletions(-)

(limited to 'kernel')

diff --git a/kernel/trace/ftrace.c b/kernel/trace/ftrace.c
index 2d6f8bcd1884..dcd3a814d39b 100644
--- a/kernel/trace/ftrace.c
+++ b/kernel/trace/ftrace.c
@@ -22,6 +22,7 @@
 #include <linux/hardirq.h>
 #include <linux/kthread.h>
 #include <linux/uaccess.h>
+#include <linux/bsearch.h>
 #include <linux/module.h>
 #include <linux/ftrace.h>
 #include <linux/sysctl.h>
@@ -1300,6 +1301,19 @@ ftrace_ops_test(struct ftrace_ops *ops, unsigned long ip)
 		}				\
 	}
 
+
+static int ftrace_cmp_recs(const void *a, const void *b)
+{
+	const struct dyn_ftrace *reca = a;
+	const struct dyn_ftrace *recb = b;
+
+	if (reca->ip > recb->ip)
+		return 1;
+	if (reca->ip < recb->ip)
+		return -1;
+	return 0;
+}
+
 /**
  * ftrace_location - return true if the ip giving is a traced location
  * @ip: the instruction pointer to check
@@ -1313,11 +1327,17 @@ int ftrace_location(unsigned long ip)
 {
 	struct ftrace_page *pg;
 	struct dyn_ftrace *rec;
+	struct dyn_ftrace key;
 
-	do_for_each_ftrace_rec(pg, rec) {
-		if (rec->ip == ip)
+	key.ip = ip;
+
+	for (pg = ftrace_pages_start; pg; pg = pg->next) {
+		rec = bsearch(&key, pg->records, pg->index,
+			      sizeof(struct dyn_ftrace),
+			      ftrace_cmp_recs);
+		if (rec)
 			return 1;
-	} while_for_each_ftrace_rec();
+	}
 
 	return 0;
 }
@@ -3587,18 +3607,6 @@ static void ftrace_swap_recs(void *a, void *b, int size)
 	*recb = t;
 }
 
-static int ftrace_cmp_recs(const void *a, const void *b)
-{
-	const struct dyn_ftrace *reca = a;
-	const struct dyn_ftrace *recb = b;
-
-	if (reca->ip > recb->ip)
-		return 1;
-	if (reca->ip < recb->ip)
-		return -1;
-	return 0;
-}
-
 static int ftrace_process_locs(struct module *mod,
 			       unsigned long *start,
 			       unsigned long *end)
-- 
cgit v1.2.3-71-gd317


From c842e975520f8ab09e293cc92f51a1f396251fd5 Mon Sep 17 00:00:00 2001
From: Steven Rostedt <srostedt@redhat.com>
Date: Mon, 19 Dec 2011 18:44:44 -0500
Subject: ftrace: Fix ftrace hash record update with notrace

When disabling the "notrace" records, that means we want to trace them.
If the notrace_hash is zero, it means that we want to trace all
records. But to disable a zero notrace_hash means nothing.

The check for the notrace_hash count was incorrect with:

	if (hash && !hash->count)
		return

With the correct comment above it that states that we do nothing
if the notrace_hash has zero count. But !hash also means that
the notrace hash has zero count. I think this was done to
protect against dereferencing NULL. But if !hash is true, then
we go through the following loop without doing a single thing.

Fix it to:

	if (!hash || !hash->count)
		return;

Signed-off-by: Steven Rostedt <rostedt@goodmis.org>
---
 kernel/trace/ftrace.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

(limited to 'kernel')

diff --git a/kernel/trace/ftrace.c b/kernel/trace/ftrace.c
index dcd3a814d39b..a383d6c67bfa 100644
--- a/kernel/trace/ftrace.c
+++ b/kernel/trace/ftrace.c
@@ -1381,7 +1381,7 @@ static void __ftrace_hash_rec_update(struct ftrace_ops *ops,
 		 * If the notrace hash has no items,
 		 * then there's nothing to do.
 		 */
-		if (hash && !hash->count)
+		if (!hash || !hash->count)
 			return;
 	}
 
-- 
cgit v1.2.3-71-gd317


From 06a51d9307380c78bb5c92e68fc80ad2c7d7f890 Mon Sep 17 00:00:00 2001
From: Steven Rostedt <srostedt@redhat.com>
Date: Mon, 19 Dec 2011 19:07:36 -0500
Subject: ftrace: Create ftrace_hash_empty() helper routine

There are two types of hashes in the ftrace_ops; one type
is the filter_hash and the other is the notrace_hash. Either
one may be null, meaning it has no elements. But when elements
are added, the hash is allocated.

Throughout the code, a check needs to be made to see if a hash
exists or the hash has elements, but the check if the hash exists
is usually missing causing the possible "NULL pointer dereference bug".

Add a helper routine called "ftrace_hash_empty()" that returns
true if the hash doesn't exist or its count is zero. As they mean
the same thing.

Last-bug-reported-by: Jiri Olsa <jolsa@redhat.com>
Signed-off-by: Steven Rostedt <rostedt@goodmis.org>
---
 kernel/trace/ftrace.c | 28 +++++++++++++++++-----------
 1 file changed, 17 insertions(+), 11 deletions(-)

(limited to 'kernel')

diff --git a/kernel/trace/ftrace.c b/kernel/trace/ftrace.c
index a383d6c67bfa..e1ee07f81ca2 100644
--- a/kernel/trace/ftrace.c
+++ b/kernel/trace/ftrace.c
@@ -999,6 +999,11 @@ static struct ftrace_page *ftrace_new_pgs;
 static struct ftrace_page	*ftrace_pages_start;
 static struct ftrace_page	*ftrace_pages;
 
+static bool ftrace_hash_empty(struct ftrace_hash *hash)
+{
+	return !hash || !hash->count;
+}
+
 static struct ftrace_func_entry *
 ftrace_lookup_ip(struct ftrace_hash *hash, unsigned long ip)
 {
@@ -1007,7 +1012,7 @@ ftrace_lookup_ip(struct ftrace_hash *hash, unsigned long ip)
 	struct hlist_head *hhd;
 	struct hlist_node *n;
 
-	if (!hash->count)
+	if (ftrace_hash_empty(hash))
 		return NULL;
 
 	if (hash->size_bits > 0)
@@ -1151,7 +1156,7 @@ alloc_and_copy_ftrace_hash(int size_bits, struct ftrace_hash *hash)
 		return NULL;
 
 	/* Empty hash? */
-	if (!hash || !hash->count)
+	if (ftrace_hash_empty(hash))
 		return new_hash;
 
 	size = 1 << hash->size_bits;
@@ -1276,9 +1281,9 @@ ftrace_ops_test(struct ftrace_ops *ops, unsigned long ip)
 	filter_hash = rcu_dereference_raw(ops->filter_hash);
 	notrace_hash = rcu_dereference_raw(ops->notrace_hash);
 
-	if ((!filter_hash || !filter_hash->count ||
+	if ((ftrace_hash_empty(filter_hash) ||
 	     ftrace_lookup_ip(filter_hash, ip)) &&
-	    (!notrace_hash || !notrace_hash->count ||
+	    (ftrace_hash_empty(notrace_hash) ||
 	     !ftrace_lookup_ip(notrace_hash, ip)))
 		ret = 1;
 	else
@@ -1371,7 +1376,7 @@ static void __ftrace_hash_rec_update(struct ftrace_ops *ops,
 	if (filter_hash) {
 		hash = ops->filter_hash;
 		other_hash = ops->notrace_hash;
-		if (!hash || !hash->count)
+		if (ftrace_hash_empty(hash))
 			all = 1;
 	} else {
 		inc = !inc;
@@ -1381,7 +1386,7 @@ static void __ftrace_hash_rec_update(struct ftrace_ops *ops,
 		 * If the notrace hash has no items,
 		 * then there's nothing to do.
 		 */
-		if (!hash || !hash->count)
+		if (ftrace_hash_empty(hash))
 			return;
 	}
 
@@ -1398,8 +1403,8 @@ static void __ftrace_hash_rec_update(struct ftrace_ops *ops,
 			if (!other_hash || !ftrace_lookup_ip(other_hash, rec->ip))
 				match = 1;
 		} else {
-			in_hash = hash && !!ftrace_lookup_ip(hash, rec->ip);
-			in_other_hash = other_hash && !!ftrace_lookup_ip(other_hash, rec->ip);
+			in_hash = !!ftrace_lookup_ip(hash, rec->ip);
+			in_other_hash = !!ftrace_lookup_ip(other_hash, rec->ip);
 
 			/*
 			 *
@@ -1407,7 +1412,7 @@ static void __ftrace_hash_rec_update(struct ftrace_ops *ops,
 			if (filter_hash && in_hash && !in_other_hash)
 				match = 1;
 			else if (!filter_hash && in_hash &&
-				 (in_other_hash || !other_hash->count))
+				 (in_other_hash || ftrace_hash_empty(other_hash)))
 				match = 1;
 		}
 		if (!match)
@@ -1950,7 +1955,7 @@ static int ops_traces_mod(struct ftrace_ops *ops)
 	struct ftrace_hash *hash;
 
 	hash = ops->filter_hash;
-	return !!(!hash || !hash->count);
+	return ftrace_hash_empty(hash);
 }
 
 static int ftrace_update_code(struct module *mod)
@@ -2320,7 +2325,8 @@ static void *t_start(struct seq_file *m, loff_t *pos)
 	 * off, we can short cut and just print out that all
 	 * functions are enabled.
 	 */
-	if (iter->flags & FTRACE_ITER_FILTER && !ops->filter_hash->count) {
+	if (iter->flags & FTRACE_ITER_FILTER &&
+	    ftrace_hash_empty(ops->filter_hash)) {
 		if (*pos > 0)
 			return t_hash_start(m, pos);
 		iter->flags |= FTRACE_ITER_PRINTALL;
-- 
cgit v1.2.3-71-gd317


From fc13cb0ce45296f331263a6034aa1814203e1ac3 Mon Sep 17 00:00:00 2001
From: Steven Rostedt <srostedt@redhat.com>
Date: Mon, 19 Dec 2011 14:41:25 -0500
Subject: ftrace: Allow other users of function tracing to use the output
 listing

The function tracer is set up to allow any other subsystem (like perf)
to use it. Ftrace already has a way to list what functions are enabled
by the global_ops. It would be very helpful to let other users of
the function tracer to be able to use the same code.

Signed-off-by: Steven Rostedt <rostedt@goodmis.org>
---
 include/linux/ftrace.h | 35 +++++++++++++++++++++++++++++++++++
 kernel/trace/ftrace.c  | 41 +++++++++++++++++++++++++----------------
 2 files changed, 60 insertions(+), 16 deletions(-)

(limited to 'kernel')

diff --git a/include/linux/ftrace.h b/include/linux/ftrace.h
index 31b9fd7aedcd..aa7559f0a224 100644
--- a/include/linux/ftrace.h
+++ b/include/linux/ftrace.h
@@ -202,6 +202,14 @@ enum {
 	FTRACE_UPDATE_MAKE_NOP,
 };
 
+enum {
+	FTRACE_ITER_FILTER	= (1 << 0),
+	FTRACE_ITER_NOTRACE	= (1 << 1),
+	FTRACE_ITER_PRINTALL	= (1 << 2),
+	FTRACE_ITER_HASH	= (1 << 3),
+	FTRACE_ITER_ENABLED	= (1 << 4),
+};
+
 void arch_ftrace_update_code(int command);
 
 struct ftrace_rec_iter;
@@ -217,6 +225,15 @@ int ftrace_location(unsigned long ip);
 
 extern ftrace_func_t ftrace_trace_function;
 
+int ftrace_regex_open(struct ftrace_ops *ops, int flag,
+		  struct inode *inode, struct file *file);
+ssize_t ftrace_filter_write(struct file *file, const char __user *ubuf,
+			    size_t cnt, loff_t *ppos);
+ssize_t ftrace_notrace_write(struct file *file, const char __user *ubuf,
+			     size_t cnt, loff_t *ppos);
+loff_t ftrace_regex_lseek(struct file *file, loff_t offset, int origin);
+int ftrace_regex_release(struct inode *inode, struct file *file);
+
 /* defined in arch */
 extern int ftrace_ip_converted(unsigned long ip);
 extern int ftrace_dyn_arch_init(void *data);
@@ -311,6 +328,24 @@ static inline int ftrace_text_reserved(void *start, void *end)
 {
 	return 0;
 }
+
+/*
+ * Again users of functions that have ftrace_ops may not
+ * have them defined when ftrace is not enabled, but these
+ * functions may still be called. Use a macro instead of inline.
+ */
+#define ftrace_regex_open(ops, flag, inod, file) ({ -ENODEV; })
+
+static inline ssize_t ftrace_filter_write(struct file *file, const char __user *ubuf,
+			    size_t cnt, loff_t *ppos) { return -ENODEV; }
+static inline ssize_t ftrace_notrace_write(struct file *file, const char __user *ubuf,
+			     size_t cnt, loff_t *ppos) { return -ENODEV; }
+static inline loff_t ftrace_regex_lseek(struct file *file, loff_t offset, int origin)
+{
+	return -ENODEV;
+}
+static inline int
+ftrace_regex_release(struct inode *inode, struct file *file) { return -ENODEV; }
 #endif /* CONFIG_DYNAMIC_FTRACE */
 
 /* totally disable ftrace - can not re-enable after this */
diff --git a/kernel/trace/ftrace.c b/kernel/trace/ftrace.c
index e1ee07f81ca2..5b105c5ddc0c 100644
--- a/kernel/trace/ftrace.c
+++ b/kernel/trace/ftrace.c
@@ -2134,14 +2134,6 @@ static int __init ftrace_dyn_table_alloc(unsigned long num_to_init)
 	return 0;
 }
 
-enum {
-	FTRACE_ITER_FILTER	= (1 << 0),
-	FTRACE_ITER_NOTRACE	= (1 << 1),
-	FTRACE_ITER_PRINTALL	= (1 << 2),
-	FTRACE_ITER_HASH	= (1 << 3),
-	FTRACE_ITER_ENABLED	= (1 << 4),
-};
-
 #define FTRACE_BUFF_MAX (KSYM_SYMBOL_LEN+4) /* room for wildcards */
 
 struct ftrace_iterator {
@@ -2249,7 +2241,7 @@ static void *
 t_next(struct seq_file *m, void *v, loff_t *pos)
 {
 	struct ftrace_iterator *iter = m->private;
-	struct ftrace_ops *ops = &global_ops;
+	struct ftrace_ops *ops = iter->ops;
 	struct dyn_ftrace *rec = NULL;
 
 	if (unlikely(ftrace_disabled))
@@ -2305,7 +2297,7 @@ static void reset_iter_read(struct ftrace_iterator *iter)
 static void *t_start(struct seq_file *m, loff_t *pos)
 {
 	struct ftrace_iterator *iter = m->private;
-	struct ftrace_ops *ops = &global_ops;
+	struct ftrace_ops *ops = iter->ops;
 	void *p = NULL;
 	loff_t l;
 
@@ -2414,6 +2406,7 @@ ftrace_avail_open(struct inode *inode, struct file *file)
 		return -ENOMEM;
 
 	iter->pg = ftrace_pages_start;
+	iter->ops = &global_ops;
 
 	ret = seq_open(file, &show_ftrace_seq_ops);
 	if (!ret) {
@@ -2442,6 +2435,7 @@ ftrace_enabled_open(struct inode *inode, struct file *file)
 
 	iter->pg = ftrace_pages_start;
 	iter->flags = FTRACE_ITER_ENABLED;
+	iter->ops = &global_ops;
 
 	ret = seq_open(file, &show_ftrace_seq_ops);
 	if (!ret) {
@@ -2462,7 +2456,23 @@ static void ftrace_filter_reset(struct ftrace_hash *hash)
 	mutex_unlock(&ftrace_lock);
 }
 
-static int
+/**
+ * ftrace_regex_open - initialize function tracer filter files
+ * @ops: The ftrace_ops that hold the hash filters
+ * @flag: The type of filter to process
+ * @inode: The inode, usually passed in to your open routine
+ * @file: The file, usually passed in to your open routine
+ *
+ * ftrace_regex_open() initializes the filter files for the
+ * @ops. Depending on @flag it may process the filter hash or
+ * the notrace hash of @ops. With this called from the open
+ * routine, you can use ftrace_filter_write() for the write
+ * routine if @flag has FTRACE_ITER_FILTER set, or
+ * ftrace_notrace_write() if @flag has FTRACE_ITER_NOTRACE set.
+ * ftrace_regex_lseek() should be used as the lseek routine, and
+ * release must call ftrace_regex_release().
+ */
+int
 ftrace_regex_open(struct ftrace_ops *ops, int flag,
 		  struct inode *inode, struct file *file)
 {
@@ -2542,7 +2552,7 @@ ftrace_notrace_open(struct inode *inode, struct file *file)
 				 inode, file);
 }
 
-static loff_t
+loff_t
 ftrace_regex_lseek(struct file *file, loff_t offset, int origin)
 {
 	loff_t ret;
@@ -3095,14 +3105,14 @@ out_unlock:
 	return ret;
 }
 
-static ssize_t
+ssize_t
 ftrace_filter_write(struct file *file, const char __user *ubuf,
 		    size_t cnt, loff_t *ppos)
 {
 	return ftrace_regex_write(file, ubuf, cnt, ppos, 1);
 }
 
-static ssize_t
+ssize_t
 ftrace_notrace_write(struct file *file, const char __user *ubuf,
 		     size_t cnt, loff_t *ppos)
 {
@@ -3292,8 +3302,7 @@ static void __init set_ftrace_early_filters(void)
 #endif /* CONFIG_FUNCTION_GRAPH_TRACER */
 }
 
-static int
-ftrace_regex_release(struct inode *inode, struct file *file)
+int ftrace_regex_release(struct inode *inode, struct file *file)
 {
 	struct seq_file *m = (struct seq_file *)file->private_data;
 	struct ftrace_iterator *iter;
-- 
cgit v1.2.3-71-gd317


From 69a3083c4a7df0322d97bb2b43a33cb12af8131a Mon Sep 17 00:00:00 2001
From: Steven Rostedt <srostedt@redhat.com>
Date: Mon, 19 Dec 2011 15:21:16 -0500
Subject: ftrace: Decouple hash items from showing filtered functions

The set_ftrace_filter shows "hashed" functions, which are functions
that are added with operations to them (like traceon and traceoff).

As other subsystems may be able to show what functions they are
using for function tracing, the hash items should no longer
be shown just because the FILTER flag is set. As they have nothing
to do with other subsystems filters.

Signed-off-by: Steven Rostedt <rostedt@goodmis.org>
---
 include/linux/ftrace.h |  5 +++--
 kernel/trace/ftrace.c  | 16 ++++++++--------
 2 files changed, 11 insertions(+), 10 deletions(-)

(limited to 'kernel')

diff --git a/include/linux/ftrace.h b/include/linux/ftrace.h
index aa7559f0a224..d1ff0de18970 100644
--- a/include/linux/ftrace.h
+++ b/include/linux/ftrace.h
@@ -206,8 +206,9 @@ enum {
 	FTRACE_ITER_FILTER	= (1 << 0),
 	FTRACE_ITER_NOTRACE	= (1 << 1),
 	FTRACE_ITER_PRINTALL	= (1 << 2),
-	FTRACE_ITER_HASH	= (1 << 3),
-	FTRACE_ITER_ENABLED	= (1 << 4),
+	FTRACE_ITER_DO_HASH	= (1 << 3),
+	FTRACE_ITER_HASH	= (1 << 4),
+	FTRACE_ITER_ENABLED	= (1 << 5),
 };
 
 void arch_ftrace_update_code(int command);
diff --git a/kernel/trace/ftrace.c b/kernel/trace/ftrace.c
index 5b105c5ddc0c..5728d9aa632e 100644
--- a/kernel/trace/ftrace.c
+++ b/kernel/trace/ftrace.c
@@ -2198,6 +2198,9 @@ static void *t_hash_start(struct seq_file *m, loff_t *pos)
 	void *p = NULL;
 	loff_t l;
 
+	if (!(iter->flags & FTRACE_ITER_DO_HASH))
+		return NULL;
+
 	if (iter->func_pos > *pos)
 		return NULL;
 
@@ -2343,12 +2346,8 @@ static void *t_start(struct seq_file *m, loff_t *pos)
 			break;
 	}
 
-	if (!p) {
-		if (iter->flags & FTRACE_ITER_FILTER)
-			return t_hash_start(m, pos);
-
-		return NULL;
-	}
+	if (!p)
+		return t_hash_start(m, pos);
 
 	return iter;
 }
@@ -2541,8 +2540,9 @@ ftrace_regex_open(struct ftrace_ops *ops, int flag,
 static int
 ftrace_filter_open(struct inode *inode, struct file *file)
 {
-	return ftrace_regex_open(&global_ops, FTRACE_ITER_FILTER,
-				 inode, file);
+	return ftrace_regex_open(&global_ops,
+			FTRACE_ITER_FILTER | FTRACE_ITER_DO_HASH,
+			inode, file);
 }
 
 static int
-- 
cgit v1.2.3-71-gd317


From d2d45c7a03a2b1a14159cbb665e9dd60991a7d4f Mon Sep 17 00:00:00 2001
From: Steven Rostedt <srostedt@redhat.com>
Date: Mon, 19 Dec 2011 14:44:09 -0500
Subject: tracing: Have stack_tracer use a separate list of functions

The stack_tracer is used to look at every function and check
if the current stack is bigger than the last recorded max stack size.
When a new max is found, then it saves that stack off.

Currently the stack tracer is limited by the global_ops of
the function tracer. As the stack tracer has nothing to do with
the ftrace function tracer, except that it uses it as its internal
engine, the stack tracer should have its own list.

A new file is added to the tracing debugfs directory called:

  stack_trace_filter

that can be used to select which functions you want to check the stack
on.

Signed-off-by: Steven Rostedt <rostedt@goodmis.org>
---
 kernel/trace/trace_stack.c | 19 ++++++++++++++++++-
 1 file changed, 18 insertions(+), 1 deletion(-)

(limited to 'kernel')

diff --git a/kernel/trace/trace_stack.c b/kernel/trace/trace_stack.c
index 77575b386d97..0398b7c7afd6 100644
--- a/kernel/trace/trace_stack.c
+++ b/kernel/trace/trace_stack.c
@@ -133,7 +133,6 @@ stack_trace_call(unsigned long ip, unsigned long parent_ip)
 static struct ftrace_ops trace_ops __read_mostly =
 {
 	.func = stack_trace_call,
-	.flags = FTRACE_OPS_FL_GLOBAL,
 };
 
 static ssize_t
@@ -311,6 +310,21 @@ static const struct file_operations stack_trace_fops = {
 	.release	= seq_release,
 };
 
+static int
+stack_trace_filter_open(struct inode *inode, struct file *file)
+{
+	return ftrace_regex_open(&trace_ops, FTRACE_ITER_FILTER,
+				 inode, file);
+}
+
+static const struct file_operations stack_trace_filter_fops = {
+	.open = stack_trace_filter_open,
+	.read = seq_read,
+	.write = ftrace_filter_write,
+	.llseek = ftrace_regex_lseek,
+	.release = ftrace_regex_release,
+};
+
 int
 stack_trace_sysctl(struct ctl_table *table, int write,
 		   void __user *buffer, size_t *lenp,
@@ -358,6 +372,9 @@ static __init int stack_trace_init(void)
 	trace_create_file("stack_trace", 0444, d_tracer,
 			NULL, &stack_trace_fops);
 
+	trace_create_file("stack_trace_filter", 0444, d_tracer,
+			NULL, &stack_trace_filter_fops);
+
 	if (stack_tracer_enabled)
 		register_ftrace_function(&trace_ops);
 
-- 
cgit v1.2.3-71-gd317


From 2a85a37f168d2b4d74d493b578af4dc9032be92e Mon Sep 17 00:00:00 2001
From: Steven Rostedt <srostedt@redhat.com>
Date: Mon, 19 Dec 2011 21:57:44 -0500
Subject: ftrace: Allow access to the boot time function enabling

Change set_ftrace_early_filter() to ftrace_set_early_filter()
and make it a global function. This will allow other subsystems
in the kernel to be able to enable function tracing at start
up and reuse the ftrace function parsing code.

Signed-off-by: Steven Rostedt <rostedt@goodmis.org>
---
 include/linux/ftrace.h | 3 +++
 kernel/trace/ftrace.c  | 8 ++++----
 2 files changed, 7 insertions(+), 4 deletions(-)

(limited to 'kernel')

diff --git a/include/linux/ftrace.h b/include/linux/ftrace.h
index d1ff0de18970..41df6f501656 100644
--- a/include/linux/ftrace.h
+++ b/include/linux/ftrace.h
@@ -235,6 +235,9 @@ ssize_t ftrace_notrace_write(struct file *file, const char __user *ubuf,
 loff_t ftrace_regex_lseek(struct file *file, loff_t offset, int origin);
 int ftrace_regex_release(struct inode *inode, struct file *file);
 
+void __init
+ftrace_set_early_filter(struct ftrace_ops *ops, char *buf, int enable);
+
 /* defined in arch */
 extern int ftrace_ip_converted(unsigned long ip);
 extern int ftrace_dyn_arch_init(void *data);
diff --git a/kernel/trace/ftrace.c b/kernel/trace/ftrace.c
index 5728d9aa632e..683d559a0eef 100644
--- a/kernel/trace/ftrace.c
+++ b/kernel/trace/ftrace.c
@@ -3279,8 +3279,8 @@ static void __init set_ftrace_early_graph(char *buf)
 }
 #endif /* CONFIG_FUNCTION_GRAPH_TRACER */
 
-static void __init
-set_ftrace_early_filter(struct ftrace_ops *ops, char *buf, int enable)
+void __init
+ftrace_set_early_filter(struct ftrace_ops *ops, char *buf, int enable)
 {
 	char *func;
 
@@ -3293,9 +3293,9 @@ set_ftrace_early_filter(struct ftrace_ops *ops, char *buf, int enable)
 static void __init set_ftrace_early_filters(void)
 {
 	if (ftrace_filter_buf[0])
-		set_ftrace_early_filter(&global_ops, ftrace_filter_buf, 1);
+		ftrace_set_early_filter(&global_ops, ftrace_filter_buf, 1);
 	if (ftrace_notrace_buf[0])
-		set_ftrace_early_filter(&global_ops, ftrace_notrace_buf, 0);
+		ftrace_set_early_filter(&global_ops, ftrace_notrace_buf, 0);
 #ifdef CONFIG_FUNCTION_GRAPH_TRACER
 	if (ftrace_graph_buf[0])
 		set_ftrace_early_graph(ftrace_graph_buf);
-- 
cgit v1.2.3-71-gd317


From 762e1207889b3451c50d365b741af6f9ce958886 Mon Sep 17 00:00:00 2001
From: Steven Rostedt <srostedt@redhat.com>
Date: Mon, 19 Dec 2011 22:01:00 -0500
Subject: tracing: Have stack tracing set filtered functions at boot

Add stacktrace_filter= to the kernel command line that lets
the user pick specific functions to check the stack on.

Signed-off-by: Steven Rostedt <rostedt@goodmis.org>
---
 Documentation/kernel-parameters.txt |  8 ++++++++
 kernel/trace/trace_stack.c          | 11 +++++++++++
 2 files changed, 19 insertions(+)

(limited to 'kernel')

diff --git a/Documentation/kernel-parameters.txt b/Documentation/kernel-parameters.txt
index fd5c913c33c1..fde2ae06539a 100644
--- a/Documentation/kernel-parameters.txt
+++ b/Documentation/kernel-parameters.txt
@@ -2435,6 +2435,14 @@ bytes respectively. Such letter suffixes can also be entirely omitted.
 	stacktrace	[FTRACE]
 			Enabled the stack tracer on boot up.
 
+	stacktrace_filter=[function-list]
+			[FTRACE] Limit the functions that the stack tracer
+			will trace at boot up. function-list is a comma separated
+			list of functions. This list can be changed at run
+			time by the stack_trace_filter file in the debugfs
+			tracing directory. Note, this enables stack tracing
+			and the stacktrace above is not needed.
+
 	sti=		[PARISC,HW]
 			Format: <num>
 			Set the STI (builtin display/keyboard on the HP-PARISC
diff --git a/kernel/trace/trace_stack.c b/kernel/trace/trace_stack.c
index 0398b7c7afd6..d4545f49242e 100644
--- a/kernel/trace/trace_stack.c
+++ b/kernel/trace/trace_stack.c
@@ -13,6 +13,9 @@
 #include <linux/sysctl.h>
 #include <linux/init.h>
 #include <linux/fs.h>
+
+#include <asm/setup.h>
+
 #include "trace.h"
 
 #define STACK_TRACE_ENTRIES 500
@@ -352,8 +355,13 @@ stack_trace_sysctl(struct ctl_table *table, int write,
 	return ret;
 }
 
+static char stack_trace_filter_buf[COMMAND_LINE_SIZE+1] __initdata;
+
 static __init int enable_stacktrace(char *str)
 {
+	if (strncmp(str, "_filter=", 8) == 0)
+		strncpy(stack_trace_filter_buf, str+8, COMMAND_LINE_SIZE);
+
 	stack_tracer_enabled = 1;
 	last_stack_tracer_enabled = 1;
 	return 1;
@@ -375,6 +383,9 @@ static __init int stack_trace_init(void)
 	trace_create_file("stack_trace_filter", 0444, d_tracer,
 			NULL, &stack_trace_filter_fops);
 
+	if (stack_trace_filter_buf[0])
+		ftrace_set_early_filter(&trace_ops, stack_trace_filter_buf, 1);
+
 	if (stack_tracer_enabled)
 		register_ftrace_function(&trace_ops);
 
-- 
cgit v1.2.3-71-gd317


From 38b78eb855409a05f9d370228bec1955e6878e08 Mon Sep 17 00:00:00 2001
From: Tejun Heo <tj@kernel.org>
Date: Thu, 15 Dec 2011 14:31:35 -0800
Subject: tracing: Factorize filter creation

There are four places where new filter for a given filter string is
created, which involves several different steps.  This patch factors
those steps into create_[system_]filter() functions which in turn make
use of create_filter_{start|finish}() for common parts.

The only functional change is that if replace_filter_string() is
requested and fails, creation fails without any side effect instead of
being ignored.

Note that system filter is now installed after the processing is
complete which makes freeing before and then restoring filter string
on error unncessary.

-v2: Rebased to resolve conflict with 49aa29513e and updated both
     create_filter() functions to always set *filterp instead of
     requiring the caller to clear it to %NULL on entry.

Link: http://lkml.kernel.org/r/1323988305-1469-2-git-send-email-tj@kernel.org

Signed-off-by: Tejun Heo <tj@kernel.org>
Signed-off-by: Steven Rostedt <rostedt@goodmis.org>
---
 kernel/trace/trace_events_filter.c | 283 +++++++++++++++++++------------------
 1 file changed, 142 insertions(+), 141 deletions(-)

(limited to 'kernel')

diff --git a/kernel/trace/trace_events_filter.c b/kernel/trace/trace_events_filter.c
index f04cc3136bd3..24aee7127451 100644
--- a/kernel/trace/trace_events_filter.c
+++ b/kernel/trace/trace_events_filter.c
@@ -1738,11 +1738,121 @@ static int replace_system_preds(struct event_subsystem *system,
 	return -ENOMEM;
 }
 
+static int create_filter_start(char *filter_str, bool set_str,
+			       struct filter_parse_state **psp,
+			       struct event_filter **filterp)
+{
+	struct event_filter *filter;
+	struct filter_parse_state *ps = NULL;
+	int err = 0;
+
+	WARN_ON_ONCE(*psp || *filterp);
+
+	/* allocate everything, and if any fails, free all and fail */
+	filter = __alloc_filter();
+	if (filter && set_str)
+		err = replace_filter_string(filter, filter_str);
+
+	ps = kzalloc(sizeof(*ps), GFP_KERNEL);
+
+	if (!filter || !ps || err) {
+		kfree(ps);
+		__free_filter(filter);
+		return -ENOMEM;
+	}
+
+	/* we're committed to creating a new filter */
+	*filterp = filter;
+	*psp = ps;
+
+	parse_init(ps, filter_ops, filter_str);
+	err = filter_parse(ps);
+	if (err && set_str)
+		append_filter_err(ps, filter);
+	return err;
+}
+
+static void create_filter_finish(struct filter_parse_state *ps)
+{
+	if (ps) {
+		filter_opstack_clear(ps);
+		postfix_clear(ps);
+		kfree(ps);
+	}
+}
+
+/**
+ * create_filter - create a filter for a ftrace_event_call
+ * @call: ftrace_event_call to create a filter for
+ * @filter_str: filter string
+ * @set_str: remember @filter_str and enable detailed error in filter
+ * @filterp: out param for created filter (always updated on return)
+ *
+ * Creates a filter for @call with @filter_str.  If @set_str is %true,
+ * @filter_str is copied and recorded in the new filter.
+ *
+ * On success, returns 0 and *@filterp points to the new filter.  On
+ * failure, returns -errno and *@filterp may point to %NULL or to a new
+ * filter.  In the latter case, the returned filter contains error
+ * information if @set_str is %true and the caller is responsible for
+ * freeing it.
+ */
+static int create_filter(struct ftrace_event_call *call,
+			 char *filter_str, bool set_str,
+			 struct event_filter **filterp)
+{
+	struct event_filter *filter = NULL;
+	struct filter_parse_state *ps = NULL;
+	int err;
+
+	err = create_filter_start(filter_str, set_str, &ps, &filter);
+	if (!err) {
+		err = replace_preds(call, filter, ps, filter_str, false);
+		if (err && set_str)
+			append_filter_err(ps, filter);
+	}
+	create_filter_finish(ps);
+
+	*filterp = filter;
+	return err;
+}
+
+/**
+ * create_system_filter - create a filter for an event_subsystem
+ * @system: event_subsystem to create a filter for
+ * @filter_str: filter string
+ * @filterp: out param for created filter (always updated on return)
+ *
+ * Identical to create_filter() except that it creates a subsystem filter
+ * and always remembers @filter_str.
+ */
+static int create_system_filter(struct event_subsystem *system,
+				char *filter_str, struct event_filter **filterp)
+{
+	struct event_filter *filter = NULL;
+	struct filter_parse_state *ps = NULL;
+	int err;
+
+	err = create_filter_start(filter_str, true, &ps, &filter);
+	if (!err) {
+		err = replace_system_preds(system, ps, filter_str);
+		if (!err) {
+			/* System filters just show a default message */
+			kfree(filter->filter_string);
+			filter->filter_string = NULL;
+		} else {
+			append_filter_err(ps, filter);
+		}
+	}
+	create_filter_finish(ps);
+
+	*filterp = filter;
+	return err;
+}
+
 int apply_event_filter(struct ftrace_event_call *call, char *filter_string)
 {
-	struct filter_parse_state *ps;
 	struct event_filter *filter;
-	struct event_filter *tmp;
 	int err = 0;
 
 	mutex_lock(&event_mutex);
@@ -1759,49 +1869,30 @@ int apply_event_filter(struct ftrace_event_call *call, char *filter_string)
 		goto out_unlock;
 	}
 
-	err = -ENOMEM;
-	ps = kzalloc(sizeof(*ps), GFP_KERNEL);
-	if (!ps)
-		goto out_unlock;
-
-	filter = __alloc_filter();
-	if (!filter) {
-		kfree(ps);
-		goto out_unlock;
-	}
-
-	replace_filter_string(filter, filter_string);
-
-	parse_init(ps, filter_ops, filter_string);
-	err = filter_parse(ps);
-	if (err) {
-		append_filter_err(ps, filter);
-		goto out;
-	}
+	err = create_filter(call, filter_string, true, &filter);
 
-	err = replace_preds(call, filter, ps, filter_string, false);
-	if (err) {
-		filter_disable(call);
-		append_filter_err(ps, filter);
-	} else
-		call->flags |= TRACE_EVENT_FL_FILTERED;
-out:
 	/*
 	 * Always swap the call filter with the new filter
 	 * even if there was an error. If there was an error
 	 * in the filter, we disable the filter and show the error
 	 * string
 	 */
-	tmp = call->filter;
-	rcu_assign_pointer(call->filter, filter);
-	if (tmp) {
-		/* Make sure the call is done with the filter */
-		synchronize_sched();
-		__free_filter(tmp);
+	if (filter) {
+		struct event_filter *tmp = call->filter;
+
+		if (!err)
+			call->flags |= TRACE_EVENT_FL_FILTERED;
+		else
+			filter_disable(call);
+
+		rcu_assign_pointer(call->filter, filter);
+
+		if (tmp) {
+			/* Make sure the call is done with the filter */
+			synchronize_sched();
+			__free_filter(tmp);
+		}
 	}
-	filter_opstack_clear(ps);
-	postfix_clear(ps);
-	kfree(ps);
 out_unlock:
 	mutex_unlock(&event_mutex);
 
@@ -1811,7 +1902,6 @@ out_unlock:
 int apply_subsystem_event_filter(struct event_subsystem *system,
 				 char *filter_string)
 {
-	struct filter_parse_state *ps;
 	struct event_filter *filter;
 	int err = 0;
 
@@ -1835,48 +1925,19 @@ int apply_subsystem_event_filter(struct event_subsystem *system,
 		goto out_unlock;
 	}
 
-	err = -ENOMEM;
-	ps = kzalloc(sizeof(*ps), GFP_KERNEL);
-	if (!ps)
-		goto out_unlock;
-
-	filter = __alloc_filter();
-	if (!filter)
-		goto out;
-
-	/* System filters just show a default message */
-	kfree(filter->filter_string);
-	filter->filter_string = NULL;
-
-	/*
-	 * No event actually uses the system filter
-	 * we can free it without synchronize_sched().
-	 */
-	__free_filter(system->filter);
-	system->filter = filter;
-
-	parse_init(ps, filter_ops, filter_string);
-	err = filter_parse(ps);
-	if (err)
-		goto err_filter;
-
-	err = replace_system_preds(system, ps, filter_string);
-	if (err)
-		goto err_filter;
-
-out:
-	filter_opstack_clear(ps);
-	postfix_clear(ps);
-	kfree(ps);
+	err = create_system_filter(system, filter_string, &filter);
+	if (filter) {
+		/*
+		 * No event actually uses the system filter
+		 * we can free it without synchronize_sched().
+		 */
+		__free_filter(system->filter);
+		system->filter = filter;
+	}
 out_unlock:
 	mutex_unlock(&event_mutex);
 
 	return err;
-
-err_filter:
-	replace_filter_string(filter, filter_string);
-	append_filter_err(ps, system->filter);
-	goto out;
 }
 
 #ifdef CONFIG_PERF_EVENTS
@@ -1894,7 +1955,6 @@ int ftrace_profile_set_filter(struct perf_event *event, int event_id,
 {
 	int err;
 	struct event_filter *filter;
-	struct filter_parse_state *ps;
 	struct ftrace_event_call *call;
 
 	mutex_lock(&event_mutex);
@@ -1909,33 +1969,10 @@ int ftrace_profile_set_filter(struct perf_event *event, int event_id,
 	if (event->filter)
 		goto out_unlock;
 
-	filter = __alloc_filter();
-	if (!filter) {
-		err = PTR_ERR(filter);
-		goto out_unlock;
-	}
-
-	err = -ENOMEM;
-	ps = kzalloc(sizeof(*ps), GFP_KERNEL);
-	if (!ps)
-		goto free_filter;
-
-	parse_init(ps, filter_ops, filter_str);
-	err = filter_parse(ps);
-	if (err)
-		goto free_ps;
-
-	err = replace_preds(call, filter, ps, filter_str, false);
+	err = create_filter(call, filter_str, false, &filter);
 	if (!err)
 		event->filter = filter;
-
-free_ps:
-	filter_opstack_clear(ps);
-	postfix_clear(ps);
-	kfree(ps);
-
-free_filter:
-	if (err)
+	else
 		__free_filter(filter);
 
 out_unlock:
@@ -1954,43 +1991,6 @@ out_unlock:
 #define CREATE_TRACE_POINTS
 #include "trace_events_filter_test.h"
 
-static int test_get_filter(char *filter_str, struct ftrace_event_call *call,
-			   struct event_filter **pfilter)
-{
-	struct event_filter *filter;
-	struct filter_parse_state *ps;
-	int err = -ENOMEM;
-
-	filter = __alloc_filter();
-	if (!filter)
-		goto out;
-
-	ps = kzalloc(sizeof(*ps), GFP_KERNEL);
-	if (!ps)
-		goto free_filter;
-
-	parse_init(ps, filter_ops, filter_str);
-	err = filter_parse(ps);
-	if (err)
-		goto free_ps;
-
-	err = replace_preds(call, filter, ps, filter_str, false);
-	if (!err)
-		*pfilter = filter;
-
- free_ps:
-	filter_opstack_clear(ps);
-	postfix_clear(ps);
-	kfree(ps);
-
- free_filter:
-	if (err)
-		__free_filter(filter);
-
- out:
-	return err;
-}
-
 #define DATA_REC(m, va, vb, vc, vd, ve, vf, vg, vh, nvisit) \
 { \
 	.filter = FILTER, \
@@ -2109,12 +2109,13 @@ static __init int ftrace_test_event_filter(void)
 		struct test_filter_data_t *d = &test_filter_data[i];
 		int err;
 
-		err = test_get_filter(d->filter, &event_ftrace_test_filter,
-				      &filter);
+		err = create_filter(&event_ftrace_test_filter, d->filter,
+				    false, &filter);
 		if (err) {
 			printk(KERN_INFO
 			       "Failed to get filter for '%s', err %d\n",
 			       d->filter, err);
+			__free_filter(filter);
 			break;
 		}
 
-- 
cgit v1.2.3-71-gd317


From 7e381b0eb1e1a9805c37335562e8dc02e7d7848c Mon Sep 17 00:00:00 2001
From: Frederic Weisbecker <fweisbec@gmail.com>
Date: Wed, 21 Dec 2011 20:03:19 +0100
Subject: cgroup: Drop task_lock(parent) on cgroup_fork()

We don't need to hold the parent task_lock() on the
parent in cgroup_fork() because we are already synchronized
against the two places that may change the parent css_set
concurrently:

- cgroup_exit(), but the parent obviously can't exit concurrently
- cgroup migration: we are synchronized against threadgroup_lock()

So we can safely remove the task_lock() there.

Signed-off-by: Frederic Weisbecker <fweisbec@gmail.com>
Reviewed-by: Li Zefan <lizf@cn.fujitsu.com>
Signed-off-by: Tejun Heo <tj@kernel.org>
Cc: Containers <containers@lists.linux-foundation.org>
Cc: Cgroups <cgroups@vger.kernel.org>
Cc: KAMEZAWA Hiroyuki <kamezawa.hiroyu@jp.fujitsu.com>
Cc: Oleg Nesterov <oleg@redhat.com>
Cc: Andrew Morton <akpm@linux-foundation.org>
Cc: Paul Menage <paul@paulmenage.org>
Cc: Mandeep Singh Baines <msb@chromium.org>
---
 kernel/cgroup.c | 23 +++++++++++++++++------
 1 file changed, 17 insertions(+), 6 deletions(-)

(limited to 'kernel')

diff --git a/kernel/cgroup.c b/kernel/cgroup.c
index bc3caff138d8..dae50d0d8e4b 100644
--- a/kernel/cgroup.c
+++ b/kernel/cgroup.c
@@ -4556,20 +4556,31 @@ static const struct file_operations proc_cgroupstats_operations = {
  *
  * A pointer to the shared css_set was automatically copied in
  * fork.c by dup_task_struct().  However, we ignore that copy, since
- * it was not made under the protection of RCU or cgroup_mutex, so
- * might no longer be a valid cgroup pointer.  cgroup_attach_task() might
- * have already changed current->cgroups, allowing the previously
- * referenced cgroup group to be removed and freed.
+ * it was not made under the protection of RCU, cgroup_mutex or
+ * threadgroup_change_begin(), so it might no longer be a valid
+ * cgroup pointer.  cgroup_attach_task() might have already changed
+ * current->cgroups, allowing the previously referenced cgroup
+ * group to be removed and freed.
+ *
+ * Outside the pointer validity we also need to process the css_set
+ * inheritance between threadgoup_change_begin() and
+ * threadgoup_change_end(), this way there is no leak in any process
+ * wide migration performed by cgroup_attach_proc() that could otherwise
+ * miss a thread because it is too early or too late in the fork stage.
  *
  * At the point that cgroup_fork() is called, 'current' is the parent
  * task, and the passed argument 'child' points to the child task.
  */
 void cgroup_fork(struct task_struct *child)
 {
-	task_lock(current);
+	/*
+	 * We don't need to task_lock() current because current->cgroups
+	 * can't be changed concurrently here. The parent obviously hasn't
+	 * exited and called cgroup_exit(), and we are synchronized against
+	 * cgroup migration through threadgroup_change_begin().
+	 */
 	child->cgroups = current->cgroups;
 	get_css_set(child->cgroups);
-	task_unlock(current);
 	INIT_LIST_HEAD(&child->cg_list);
 }
 
-- 
cgit v1.2.3-71-gd317


From c84cdf75ccb2845f690579e838f13f7e744e3d23 Mon Sep 17 00:00:00 2001
From: Frederic Weisbecker <fweisbec@gmail.com>
Date: Wed, 21 Dec 2011 20:03:18 +0100
Subject: cgroup: Remove unnecessary task_lock before fetching css_set on
 migration

When we fetch the css_set of the tasks on cgroup migration, we don't need
anymore to synchronize against cgroup_exit() that could swap the old one
with init_css_set. Now that we are using threadgroup_lock() during
the migrations, we don't need to worry about it anymore.

Signed-off-by: Frederic Weisbecker <fweisbec@gmail.com>
Reviewed-by: Mandeep Singh Baines <msb@chromium.org>
Reviewed-by: Li Zefan <lizf@cn.fujitsu.com>
Signed-off-by: Tejun Heo <tj@kernel.org>
Cc: Containers <containers@lists.linux-foundation.org>
Cc: Cgroups <cgroups@vger.kernel.org>
Cc: KAMEZAWA Hiroyuki <kamezawa.hiroyu@jp.fujitsu.com>
Cc: Oleg Nesterov <oleg@redhat.com>
Cc: Andrew Morton <akpm@linux-foundation.org>
Cc: Paul Menage <paul@paulmenage.org>
---
 kernel/cgroup.c | 20 ++++++++++----------
 1 file changed, 10 insertions(+), 10 deletions(-)

(limited to 'kernel')

diff --git a/kernel/cgroup.c b/kernel/cgroup.c
index dae50d0d8e4b..4936d8886b4f 100644
--- a/kernel/cgroup.c
+++ b/kernel/cgroup.c
@@ -1850,14 +1850,14 @@ static int cgroup_task_migrate(struct cgroup *cgrp, struct cgroup *oldcgrp,
 	struct css_set *newcg;
 
 	/*
-	 * get old css_set. we need to take task_lock and refcount it, because
-	 * an exiting task can change its css_set to init_css_set and drop its
-	 * old one without taking cgroup_mutex.
+	 * get old css_set. We are synchronized through threadgroup_lock()
+	 * against PF_EXITING setting such that we can't race against
+	 * cgroup_exit() changing the css_set to init_css_set and dropping the
+	 * old one.
 	 */
-	task_lock(tsk);
+	WARN_ON_ONCE(tsk->flags & PF_EXITING);
 	oldcg = tsk->cgroups;
 	get_css_set(oldcg);
-	task_unlock(tsk);
 
 	/* locate or allocate a new css_set for this task. */
 	if (guarantee) {
@@ -1879,9 +1879,7 @@ static int cgroup_task_migrate(struct cgroup *cgrp, struct cgroup *oldcgrp,
 	}
 	put_css_set(oldcg);
 
-	/* @tsk can't exit as its threadgroup is locked */
 	task_lock(tsk);
-	WARN_ON_ONCE(tsk->flags & PF_EXITING);
 	rcu_assign_pointer(tsk->cgroups, newcg);
 	task_unlock(tsk);
 
@@ -2182,11 +2180,13 @@ int cgroup_attach_proc(struct cgroup *cgrp, struct task_struct *leader)
 		/* nothing to do if this task is already in the cgroup */
 		if (tc->cgrp == cgrp)
 			continue;
-		/* get old css_set pointer */
-		task_lock(tc->task);
+		/*
+		 * get old css_set pointer. threadgroup is locked so this is
+		 * safe against concurrent cgroup_exit() changing this to
+		 * init_css_set.
+		 */
 		oldcg = tc->task->cgroups;
 		get_css_set(oldcg);
-		task_unlock(tc->task);
 		/* see if the new one for us is already in the list? */
 		if (css_set_check_fetched(cgrp, tc->task, oldcg, &newcg_list)) {
 			/* was already there, nothing to do. */
-- 
cgit v1.2.3-71-gd317


From 8a25a2fd126c621f44f3aeaef80d51f00fc11639 Mon Sep 17 00:00:00 2001
From: Kay Sievers <kay.sievers@vrfy.org>
Date: Wed, 21 Dec 2011 14:29:42 -0800
Subject: cpu: convert 'cpu' and 'machinecheck' sysdev_class to a regular
 subsystem

This moves the 'cpu sysdev_class' over to a regular 'cpu' subsystem
and converts the devices to regular devices. The sysdev drivers are
implemented as subsystem interfaces now.

After all sysdev classes are ported to regular driver core entities, the
sysdev implementation will be entirely removed from the kernel.

Userspace relies on events and generic sysfs subsystem infrastructure
from sysdev devices, which are made available with this conversion.

Cc: Haavard Skinnemoen <hskinnemoen@gmail.com>
Cc: Hans-Christian Egtvedt <egtvedt@samfundet.no>
Cc: Tony Luck <tony.luck@intel.com>
Cc: Fenghua Yu <fenghua.yu@intel.com>
Cc: Arnd Bergmann <arnd@arndb.de>
Cc: Benjamin Herrenschmidt <benh@kernel.crashing.org>
Cc: Paul Mackerras <paulus@samba.org>
Cc: Martin Schwidefsky <schwidefsky@de.ibm.com>
Cc: Heiko Carstens <heiko.carstens@de.ibm.com>
Cc: Paul Mundt <lethal@linux-sh.org>
Cc: "David S. Miller" <davem@davemloft.net>
Cc: Chris Metcalf <cmetcalf@tilera.com>
Cc: Thomas Gleixner <tglx@linutronix.de>
Cc: Ingo Molnar <mingo@redhat.com>
Cc: "H. Peter Anvin" <hpa@zytor.com>
Cc: Borislav Petkov <bp@amd64.org>
Cc: Tigran Aivazian <tigran@aivazian.fsnet.co.uk>
Cc: Len Brown <lenb@kernel.org>
Cc: Zhang Rui <rui.zhang@intel.com>
Cc: Dave Jones <davej@redhat.com>
Cc: Peter Zijlstra <peterz@infradead.org>
Cc: Russell King <rmk+kernel@arm.linux.org.uk>
Cc: Andrew Morton <akpm@linux-foundation.org>
Cc: Arjan van de Ven <arjan@linux.intel.com>
Cc: "Rafael J. Wysocki" <rjw@sisk.pl>
Cc: "Srivatsa S. Bhat" <srivatsa.bhat@linux.vnet.ibm.com>
Signed-off-by: Kay Sievers <kay.sievers@vrfy.org>
Signed-off-by: Greg Kroah-Hartman <gregkh@suse.de>
---
 arch/avr32/kernel/cpu.c                         |  74 +++----
 arch/ia64/kernel/err_inject.c                   |  52 ++---
 arch/ia64/kernel/topology.c                     |  10 +-
 arch/powerpc/include/asm/spu.h                  |  12 +-
 arch/powerpc/include/asm/topology.h             |  10 +-
 arch/powerpc/kernel/cacheinfo.c                 |  10 +-
 arch/powerpc/kernel/smp.c                       |   2 +-
 arch/powerpc/kernel/sysfs.c                     | 257 ++++++++++++------------
 arch/powerpc/mm/numa.c                          |   8 +-
 arch/powerpc/platforms/cell/cbe_thermal.c       | 144 ++++++-------
 arch/powerpc/platforms/cell/spu_base.c          |  61 +++---
 arch/powerpc/platforms/pseries/pseries_energy.c |  71 ++++---
 arch/powerpc/sysdev/ppc4xx_cpm.c                |   6 +-
 arch/s390/kernel/smp.c                          |  76 +++----
 arch/s390/kernel/topology.c                     |   6 +-
 arch/sh/kernel/cpu/sh4/sq.c                     |  24 ++-
 arch/sparc/kernel/sysfs.c                       | 122 +++++------
 arch/tile/kernel/sysfs.c                        |  61 +++---
 arch/x86/include/asm/mce.h                      |   2 +-
 arch/x86/kernel/cpu/intel_cacheinfo.c           |  25 ++-
 arch/x86/kernel/cpu/mcheck/mce-internal.h       |   4 +-
 arch/x86/kernel/cpu/mcheck/mce.c                | 128 ++++++------
 arch/x86/kernel/cpu/mcheck/mce_amd.c            |  11 +-
 arch/x86/kernel/cpu/mcheck/therm_throt.c        |  63 +++---
 arch/x86/kernel/microcode_core.c                |  58 +++---
 drivers/acpi/processor_driver.c                 |   6 +-
 drivers/acpi/processor_thermal.c                |   1 -
 drivers/base/cpu.c                              | 146 +++++++-------
 drivers/base/node.c                             |   8 +-
 drivers/base/topology.c                         |  51 +++--
 drivers/cpufreq/cpufreq.c                       |  79 ++++----
 drivers/cpufreq/cpufreq_stats.c                 |   1 -
 drivers/cpuidle/cpuidle.c                       |  12 +-
 drivers/cpuidle/cpuidle.h                       |  10 +-
 drivers/cpuidle/sysfs.c                         |  74 ++++---
 drivers/s390/char/sclp_config.c                 |   8 +-
 include/linux/cpu.h                             |  18 +-
 kernel/sched.c                                  |  40 ++--
 38 files changed, 874 insertions(+), 877 deletions(-)

(limited to 'kernel')

diff --git a/arch/avr32/kernel/cpu.c b/arch/avr32/kernel/cpu.c
index e84faffbbeca..2233be71e2e8 100644
--- a/arch/avr32/kernel/cpu.c
+++ b/arch/avr32/kernel/cpu.c
@@ -6,7 +6,7 @@
  * published by the Free Software Foundation.
  */
 #include <linux/init.h>
-#include <linux/sysdev.h>
+#include <linux/device.h>
 #include <linux/seq_file.h>
 #include <linux/cpu.h>
 #include <linux/module.h>
@@ -26,16 +26,16 @@ static DEFINE_PER_CPU(struct cpu, cpu_devices);
  * XXX: If/when a SMP-capable implementation of AVR32 will ever be
  * made, we must make sure that the code executes on the correct CPU.
  */
-static ssize_t show_pc0event(struct sys_device *dev,
-			struct sysdev_attribute *attr, char *buf)
+static ssize_t show_pc0event(struct device *dev,
+			struct device_attribute *attr, char *buf)
 {
 	unsigned long pccr;
 
 	pccr = sysreg_read(PCCR);
 	return sprintf(buf, "0x%lx\n", (pccr >> 12) & 0x3f);
 }
-static ssize_t store_pc0event(struct sys_device *dev,
-			struct sysdev_attribute *attr, const char *buf,
+static ssize_t store_pc0event(struct device *dev,
+			struct device_attribute *attr, const char *buf,
 			      size_t count)
 {
 	unsigned long val;
@@ -48,16 +48,16 @@ static ssize_t store_pc0event(struct sys_device *dev,
 	sysreg_write(PCCR, val);
 	return count;
 }
-static ssize_t show_pc0count(struct sys_device *dev,
-			struct sysdev_attribute *attr, char *buf)
+static ssize_t show_pc0count(struct device *dev,
+			struct device_attribute *attr, char *buf)
 {
 	unsigned long pcnt0;
 
 	pcnt0 = sysreg_read(PCNT0);
 	return sprintf(buf, "%lu\n", pcnt0);
 }
-static ssize_t store_pc0count(struct sys_device *dev,
-				struct sysdev_attribute *attr,
+static ssize_t store_pc0count(struct device *dev,
+				struct device_attribute *attr,
 				const char *buf, size_t count)
 {
 	unsigned long val;
@@ -71,16 +71,16 @@ static ssize_t store_pc0count(struct sys_device *dev,
 	return count;
 }
 
-static ssize_t show_pc1event(struct sys_device *dev,
-				struct sysdev_attribute *attr, char *buf)
+static ssize_t show_pc1event(struct device *dev,
+				struct device_attribute *attr, char *buf)
 {
 	unsigned long pccr;
 
 	pccr = sysreg_read(PCCR);
 	return sprintf(buf, "0x%lx\n", (pccr >> 18) & 0x3f);
 }
-static ssize_t store_pc1event(struct sys_device *dev,
-			      struct sysdev_attribute *attr, const char *buf,
+static ssize_t store_pc1event(struct device *dev,
+			      struct device_attribute *attr, const char *buf,
 			      size_t count)
 {
 	unsigned long val;
@@ -93,16 +93,16 @@ static ssize_t store_pc1event(struct sys_device *dev,
 	sysreg_write(PCCR, val);
 	return count;
 }
-static ssize_t show_pc1count(struct sys_device *dev,
-				struct sysdev_attribute *attr, char *buf)
+static ssize_t show_pc1count(struct device *dev,
+				struct device_attribute *attr, char *buf)
 {
 	unsigned long pcnt1;
 
 	pcnt1 = sysreg_read(PCNT1);
 	return sprintf(buf, "%lu\n", pcnt1);
 }
-static ssize_t store_pc1count(struct sys_device *dev,
-				struct sysdev_attribute *attr, const char *buf,
+static ssize_t store_pc1count(struct device *dev,
+				struct device_attribute *attr, const char *buf,
 			      size_t count)
 {
 	unsigned long val;
@@ -116,16 +116,16 @@ static ssize_t store_pc1count(struct sys_device *dev,
 	return count;
 }
 
-static ssize_t show_pccycles(struct sys_device *dev,
-				struct sysdev_attribute *attr, char *buf)
+static ssize_t show_pccycles(struct device *dev,
+				struct device_attribute *attr, char *buf)
 {
 	unsigned long pccnt;
 
 	pccnt = sysreg_read(PCCNT);
 	return sprintf(buf, "%lu\n", pccnt);
 }
-static ssize_t store_pccycles(struct sys_device *dev,
-				struct sysdev_attribute *attr, const char *buf,
+static ssize_t store_pccycles(struct device *dev,
+				struct device_attribute *attr, const char *buf,
 			      size_t count)
 {
 	unsigned long val;
@@ -139,16 +139,16 @@ static ssize_t store_pccycles(struct sys_device *dev,
 	return count;
 }
 
-static ssize_t show_pcenable(struct sys_device *dev,
-			struct sysdev_attribute *attr, char *buf)
+static ssize_t show_pcenable(struct device *dev,
+			struct device_attribute *attr, char *buf)
 {
 	unsigned long pccr;
 
 	pccr = sysreg_read(PCCR);
 	return sprintf(buf, "%c\n", (pccr & 1)?'1':'0');
 }
-static ssize_t store_pcenable(struct sys_device *dev,
-			      struct sysdev_attribute *attr, const char *buf,
+static ssize_t store_pcenable(struct device *dev,
+			      struct device_attribute *attr, const char *buf,
 			      size_t count)
 {
 	unsigned long pccr, val;
@@ -167,12 +167,12 @@ static ssize_t store_pcenable(struct sys_device *dev,
 	return count;
 }
 
-static SYSDEV_ATTR(pc0event, 0600, show_pc0event, store_pc0event);
-static SYSDEV_ATTR(pc0count, 0600, show_pc0count, store_pc0count);
-static SYSDEV_ATTR(pc1event, 0600, show_pc1event, store_pc1event);
-static SYSDEV_ATTR(pc1count, 0600, show_pc1count, store_pc1count);
-static SYSDEV_ATTR(pccycles, 0600, show_pccycles, store_pccycles);
-static SYSDEV_ATTR(pcenable, 0600, show_pcenable, store_pcenable);
+static DEVICE_ATTR(pc0event, 0600, show_pc0event, store_pc0event);
+static DEVICE_ATTR(pc0count, 0600, show_pc0count, store_pc0count);
+static DEVICE_ATTR(pc1event, 0600, show_pc1event, store_pc1event);
+static DEVICE_ATTR(pc1count, 0600, show_pc1count, store_pc1count);
+static DEVICE_ATTR(pccycles, 0600, show_pccycles, store_pccycles);
+static DEVICE_ATTR(pcenable, 0600, show_pcenable, store_pcenable);
 
 #endif /* CONFIG_PERFORMANCE_COUNTERS */
 
@@ -186,12 +186,12 @@ static int __init topology_init(void)
 		register_cpu(c, cpu);
 
 #ifdef CONFIG_PERFORMANCE_COUNTERS
-		sysdev_create_file(&c->sysdev, &attr_pc0event);
-		sysdev_create_file(&c->sysdev, &attr_pc0count);
-		sysdev_create_file(&c->sysdev, &attr_pc1event);
-		sysdev_create_file(&c->sysdev, &attr_pc1count);
-		sysdev_create_file(&c->sysdev, &attr_pccycles);
-		sysdev_create_file(&c->sysdev, &attr_pcenable);
+		device_create_file(&c->dev, &dev_attr_pc0event);
+		device_create_file(&c->dev, &dev_attr_pc0count);
+		device_create_file(&c->dev, &dev_attr_pc1event);
+		device_create_file(&c->dev, &dev_attr_pc1count);
+		device_create_file(&c->dev, &dev_attr_pccycles);
+		device_create_file(&c->dev, &dev_attr_pcenable);
 #endif
 	}
 
diff --git a/arch/ia64/kernel/err_inject.c b/arch/ia64/kernel/err_inject.c
index c539c689493b..2d67317a1ec2 100644
--- a/arch/ia64/kernel/err_inject.c
+++ b/arch/ia64/kernel/err_inject.c
@@ -24,7 +24,7 @@
  * Copyright (C) 2006, Intel Corp.  All rights reserved.
  *
  */
-#include <linux/sysdev.h>
+#include <linux/device.h>
 #include <linux/init.h>
 #include <linux/mm.h>
 #include <linux/cpu.h>
@@ -35,10 +35,10 @@
 #define ERR_DATA_BUFFER_SIZE 3 		// Three 8-byte;
 
 #define define_one_ro(name) 						\
-static SYSDEV_ATTR(name, 0444, show_##name, NULL)
+static DEVICE_ATTR(name, 0444, show_##name, NULL)
 
 #define define_one_rw(name) 						\
-static SYSDEV_ATTR(name, 0644, show_##name, store_##name)
+static DEVICE_ATTR(name, 0644, show_##name, store_##name)
 
 static u64 call_start[NR_CPUS];
 static u64 phys_addr[NR_CPUS];
@@ -55,7 +55,7 @@ static u64 resources[NR_CPUS];
 
 #define show(name) 							\
 static ssize_t 								\
-show_##name(struct sys_device *dev, struct sysdev_attribute *attr,	\
+show_##name(struct device *dev, struct device_attribute *attr,	\
 		char *buf)						\
 {									\
 	u32 cpu=dev->id;						\
@@ -64,7 +64,7 @@ show_##name(struct sys_device *dev, struct sysdev_attribute *attr,	\
 
 #define store(name)							\
 static ssize_t 								\
-store_##name(struct sys_device *dev, struct sysdev_attribute *attr,	\
+store_##name(struct device *dev, struct device_attribute *attr,	\
 					const char *buf, size_t size)	\
 {									\
 	unsigned int cpu=dev->id;					\
@@ -78,7 +78,7 @@ show(call_start)
  * processor. The cpu number in driver is only used for storing data.
  */
 static ssize_t
-store_call_start(struct sys_device *dev, struct sysdev_attribute *attr,
+store_call_start(struct device *dev, struct device_attribute *attr,
 		const char *buf, size_t size)
 {
 	unsigned int cpu=dev->id;
@@ -127,7 +127,7 @@ show(err_type_info)
 store(err_type_info)
 
 static ssize_t
-show_virtual_to_phys(struct sys_device *dev, struct sysdev_attribute *attr,
+show_virtual_to_phys(struct device *dev, struct device_attribute *attr,
 			char *buf)
 {
 	unsigned int cpu=dev->id;
@@ -135,7 +135,7 @@ show_virtual_to_phys(struct sys_device *dev, struct sysdev_attribute *attr,
 }
 
 static ssize_t
-store_virtual_to_phys(struct sys_device *dev, struct sysdev_attribute *attr,
+store_virtual_to_phys(struct device *dev, struct device_attribute *attr,
 			const char *buf, size_t size)
 {
 	unsigned int cpu=dev->id;
@@ -159,8 +159,8 @@ show(err_struct_info)
 store(err_struct_info)
 
 static ssize_t
-show_err_data_buffer(struct sys_device *dev,
-			struct sysdev_attribute *attr, char *buf)
+show_err_data_buffer(struct device *dev,
+			struct device_attribute *attr, char *buf)
 {
 	unsigned int cpu=dev->id;
 
@@ -171,8 +171,8 @@ show_err_data_buffer(struct sys_device *dev,
 }
 
 static ssize_t
-store_err_data_buffer(struct sys_device *dev,
-			struct sysdev_attribute *attr,
+store_err_data_buffer(struct device *dev,
+			struct device_attribute *attr,
 			const char *buf, size_t size)
 {
 	unsigned int cpu=dev->id;
@@ -209,14 +209,14 @@ define_one_ro(capabilities);
 define_one_ro(resources);
 
 static struct attribute *default_attrs[] = {
-	&attr_call_start.attr,
-	&attr_virtual_to_phys.attr,
-	&attr_err_type_info.attr,
-	&attr_err_struct_info.attr,
-	&attr_err_data_buffer.attr,
-	&attr_status.attr,
-	&attr_capabilities.attr,
-	&attr_resources.attr,
+	&dev_attr_call_start.attr,
+	&dev_attr_virtual_to_phys.attr,
+	&dev_attr_err_type_info.attr,
+	&dev_attr_err_struct_info.attr,
+	&dev_attr_err_data_buffer.attr,
+	&dev_attr_status.attr,
+	&dev_attr_capabilities.attr,
+	&dev_attr_resources.attr,
 	NULL
 };
 
@@ -225,12 +225,12 @@ static struct attribute_group err_inject_attr_group = {
 	.name = "err_inject"
 };
 /* Add/Remove err_inject interface for CPU device */
-static int __cpuinit err_inject_add_dev(struct sys_device * sys_dev)
+static int __cpuinit err_inject_add_dev(struct device * sys_dev)
 {
 	return sysfs_create_group(&sys_dev->kobj, &err_inject_attr_group);
 }
 
-static int __cpuinit err_inject_remove_dev(struct sys_device * sys_dev)
+static int __cpuinit err_inject_remove_dev(struct device * sys_dev)
 {
 	sysfs_remove_group(&sys_dev->kobj, &err_inject_attr_group);
 	return 0;
@@ -239,9 +239,9 @@ static int __cpuinit err_inject_cpu_callback(struct notifier_block *nfb,
 		unsigned long action, void *hcpu)
 {
 	unsigned int cpu = (unsigned long)hcpu;
-	struct sys_device *sys_dev;
+	struct device *sys_dev;
 
-	sys_dev = get_cpu_sysdev(cpu);
+	sys_dev = get_cpu_device(cpu);
 	switch (action) {
 	case CPU_ONLINE:
 	case CPU_ONLINE_FROZEN:
@@ -283,13 +283,13 @@ static void __exit
 err_inject_exit(void)
 {
 	int i;
-	struct sys_device *sys_dev;
+	struct device *sys_dev;
 
 #ifdef ERR_INJ_DEBUG
 	printk(KERN_INFO "Exit error injection driver.\n");
 #endif
 	for_each_online_cpu(i) {
-		sys_dev = get_cpu_sysdev(i);
+		sys_dev = get_cpu_device(i);
 		sysfs_remove_group(&sys_dev->kobj, &err_inject_attr_group);
 	}
 	unregister_hotcpu_notifier(&err_inject_cpu_notifier);
diff --git a/arch/ia64/kernel/topology.c b/arch/ia64/kernel/topology.c
index 9be1f11a01d9..9deb21dbf629 100644
--- a/arch/ia64/kernel/topology.c
+++ b/arch/ia64/kernel/topology.c
@@ -350,7 +350,7 @@ static int __cpuinit cpu_cache_sysfs_init(unsigned int cpu)
 }
 
 /* Add cache interface for CPU device */
-static int __cpuinit cache_add_dev(struct sys_device * sys_dev)
+static int __cpuinit cache_add_dev(struct device * sys_dev)
 {
 	unsigned int cpu = sys_dev->id;
 	unsigned long i, j;
@@ -400,7 +400,7 @@ static int __cpuinit cache_add_dev(struct sys_device * sys_dev)
 }
 
 /* Remove cache interface for CPU device */
-static int __cpuinit cache_remove_dev(struct sys_device * sys_dev)
+static int __cpuinit cache_remove_dev(struct device * sys_dev)
 {
 	unsigned int cpu = sys_dev->id;
 	unsigned long i;
@@ -428,9 +428,9 @@ static int __cpuinit cache_cpu_callback(struct notifier_block *nfb,
 		unsigned long action, void *hcpu)
 {
 	unsigned int cpu = (unsigned long)hcpu;
-	struct sys_device *sys_dev;
+	struct device *sys_dev;
 
-	sys_dev = get_cpu_sysdev(cpu);
+	sys_dev = get_cpu_device(cpu);
 	switch (action) {
 	case CPU_ONLINE:
 	case CPU_ONLINE_FROZEN:
@@ -454,7 +454,7 @@ static int __init cache_sysfs_init(void)
 	int i;
 
 	for_each_online_cpu(i) {
-		struct sys_device *sys_dev = get_cpu_sysdev((unsigned int)i);
+		struct device *sys_dev = get_cpu_device((unsigned int)i);
 		cache_add_dev(sys_dev);
 	}
 
diff --git a/arch/powerpc/include/asm/spu.h b/arch/powerpc/include/asm/spu.h
index 4e360bd4a35a..fff921345ddc 100644
--- a/arch/powerpc/include/asm/spu.h
+++ b/arch/powerpc/include/asm/spu.h
@@ -25,7 +25,7 @@
 #ifdef __KERNEL__
 
 #include <linux/workqueue.h>
-#include <linux/sysdev.h>
+#include <linux/device.h>
 #include <linux/mutex.h>
 
 #define LS_SIZE (256 * 1024)
@@ -166,7 +166,7 @@ struct spu {
 	/* beat only */
 	u64 shadow_int_mask_RW[3];
 
-	struct sys_device sysdev;
+	struct device dev;
 
 	int has_mem_affinity;
 	struct list_head aff_list;
@@ -270,11 +270,11 @@ struct spufs_calls {
 int register_spu_syscalls(struct spufs_calls *calls);
 void unregister_spu_syscalls(struct spufs_calls *calls);
 
-int spu_add_sysdev_attr(struct sysdev_attribute *attr);
-void spu_remove_sysdev_attr(struct sysdev_attribute *attr);
+int spu_add_dev_attr(struct device_attribute *attr);
+void spu_remove_dev_attr(struct device_attribute *attr);
 
-int spu_add_sysdev_attr_group(struct attribute_group *attrs);
-void spu_remove_sysdev_attr_group(struct attribute_group *attrs);
+int spu_add_dev_attr_group(struct attribute_group *attrs);
+void spu_remove_dev_attr_group(struct attribute_group *attrs);
 
 int spu_handle_mm_fault(struct mm_struct *mm, unsigned long ea,
 		unsigned long dsisr, unsigned *flt);
diff --git a/arch/powerpc/include/asm/topology.h b/arch/powerpc/include/asm/topology.h
index 1e104af08483..c97185885c6d 100644
--- a/arch/powerpc/include/asm/topology.h
+++ b/arch/powerpc/include/asm/topology.h
@@ -3,7 +3,7 @@
 #ifdef __KERNEL__
 
 
-struct sys_device;
+struct device;
 struct device_node;
 
 #ifdef CONFIG_NUMA
@@ -86,19 +86,19 @@ extern int __node_distance(int, int);
 
 extern void __init dump_numa_cpu_topology(void);
 
-extern int sysfs_add_device_to_node(struct sys_device *dev, int nid);
-extern void sysfs_remove_device_from_node(struct sys_device *dev, int nid);
+extern int sysfs_add_device_to_node(struct device *dev, int nid);
+extern void sysfs_remove_device_from_node(struct device *dev, int nid);
 
 #else
 
 static inline void dump_numa_cpu_topology(void) {}
 
-static inline int sysfs_add_device_to_node(struct sys_device *dev, int nid)
+static inline int sysfs_add_device_to_node(struct device *dev, int nid)
 {
 	return 0;
 }
 
-static inline void sysfs_remove_device_from_node(struct sys_device *dev,
+static inline void sysfs_remove_device_from_node(struct device *dev,
 						int nid)
 {
 }
diff --git a/arch/powerpc/kernel/cacheinfo.c b/arch/powerpc/kernel/cacheinfo.c
index a3c684b4c862..92c6b008dd2b 100644
--- a/arch/powerpc/kernel/cacheinfo.c
+++ b/arch/powerpc/kernel/cacheinfo.c
@@ -451,15 +451,15 @@ out:
 static struct cache_dir *__cpuinit cacheinfo_create_cache_dir(unsigned int cpu_id)
 {
 	struct cache_dir *cache_dir;
-	struct sys_device *sysdev;
+	struct device *dev;
 	struct kobject *kobj = NULL;
 
-	sysdev = get_cpu_sysdev(cpu_id);
-	WARN_ONCE(!sysdev, "no sysdev for CPU %i\n", cpu_id);
-	if (!sysdev)
+	dev = get_cpu_device(cpu_id);
+	WARN_ONCE(!dev, "no dev for CPU %i\n", cpu_id);
+	if (!dev)
 		goto err;
 
-	kobj = kobject_create_and_add("cache", &sysdev->kobj);
+	kobj = kobject_create_and_add("cache", &dev->kobj);
 	if (!kobj)
 		goto err;
 
diff --git a/arch/powerpc/kernel/smp.c b/arch/powerpc/kernel/smp.c
index 25ddbfc7dd36..da08240353fa 100644
--- a/arch/powerpc/kernel/smp.c
+++ b/arch/powerpc/kernel/smp.c
@@ -27,7 +27,7 @@
 #include <linux/spinlock.h>
 #include <linux/cache.h>
 #include <linux/err.h>
-#include <linux/sysdev.h>
+#include <linux/device.h>
 #include <linux/cpu.h>
 #include <linux/notifier.h>
 #include <linux/topology.h>
diff --git a/arch/powerpc/kernel/sysfs.c b/arch/powerpc/kernel/sysfs.c
index ce035c1905f0..f396ef27916b 100644
--- a/arch/powerpc/kernel/sysfs.c
+++ b/arch/powerpc/kernel/sysfs.c
@@ -1,4 +1,4 @@
-#include <linux/sysdev.h>
+#include <linux/device.h>
 #include <linux/cpu.h>
 #include <linux/smp.h>
 #include <linux/percpu.h>
@@ -37,12 +37,12 @@ static DEFINE_PER_CPU(struct cpu, cpu_devices);
 /* Time in microseconds we delay before sleeping in the idle loop */
 DEFINE_PER_CPU(long, smt_snooze_delay) = { 100 };
 
-static ssize_t store_smt_snooze_delay(struct sys_device *dev,
-				      struct sysdev_attribute *attr,
+static ssize_t store_smt_snooze_delay(struct device *dev,
+				      struct device_attribute *attr,
 				      const char *buf,
 				      size_t count)
 {
-	struct cpu *cpu = container_of(dev, struct cpu, sysdev);
+	struct cpu *cpu = container_of(dev, struct cpu, dev);
 	ssize_t ret;
 	long snooze;
 
@@ -50,21 +50,21 @@ static ssize_t store_smt_snooze_delay(struct sys_device *dev,
 	if (ret != 1)
 		return -EINVAL;
 
-	per_cpu(smt_snooze_delay, cpu->sysdev.id) = snooze;
+	per_cpu(smt_snooze_delay, cpu->dev.id) = snooze;
 
 	return count;
 }
 
-static ssize_t show_smt_snooze_delay(struct sys_device *dev,
-				     struct sysdev_attribute *attr,
+static ssize_t show_smt_snooze_delay(struct device *dev,
+				     struct device_attribute *attr,
 				     char *buf)
 {
-	struct cpu *cpu = container_of(dev, struct cpu, sysdev);
+	struct cpu *cpu = container_of(dev, struct cpu, dev);
 
-	return sprintf(buf, "%ld\n", per_cpu(smt_snooze_delay, cpu->sysdev.id));
+	return sprintf(buf, "%ld\n", per_cpu(smt_snooze_delay, cpu->dev.id));
 }
 
-static SYSDEV_ATTR(smt_snooze_delay, 0644, show_smt_snooze_delay,
+static DEVICE_ATTR(smt_snooze_delay, 0644, show_smt_snooze_delay,
 		   store_smt_snooze_delay);
 
 static int __init setup_smt_snooze_delay(char *str)
@@ -117,25 +117,25 @@ static void write_##NAME(void *val) \
 	ppc_enable_pmcs(); \
 	mtspr(ADDRESS, *(unsigned long *)val);	\
 } \
-static ssize_t show_##NAME(struct sys_device *dev, \
-			struct sysdev_attribute *attr, \
+static ssize_t show_##NAME(struct device *dev, \
+			struct device_attribute *attr, \
 			char *buf) \
 { \
-	struct cpu *cpu = container_of(dev, struct cpu, sysdev); \
+	struct cpu *cpu = container_of(dev, struct cpu, dev); \
 	unsigned long val; \
-	smp_call_function_single(cpu->sysdev.id, read_##NAME, &val, 1);	\
+	smp_call_function_single(cpu->dev.id, read_##NAME, &val, 1);	\
 	return sprintf(buf, "%lx\n", val); \
 } \
 static ssize_t __used \
-	store_##NAME(struct sys_device *dev, struct sysdev_attribute *attr, \
+	store_##NAME(struct device *dev, struct device_attribute *attr, \
 			const char *buf, size_t count) \
 { \
-	struct cpu *cpu = container_of(dev, struct cpu, sysdev); \
+	struct cpu *cpu = container_of(dev, struct cpu, dev); \
 	unsigned long val; \
 	int ret = sscanf(buf, "%lx", &val); \
 	if (ret != 1) \
 		return -EINVAL; \
-	smp_call_function_single(cpu->sysdev.id, write_##NAME, &val, 1); \
+	smp_call_function_single(cpu->dev.id, write_##NAME, &val, 1); \
 	return count; \
 }
 
@@ -178,22 +178,22 @@ SYSFS_PMCSETUP(purr, SPRN_PURR);
 SYSFS_PMCSETUP(spurr, SPRN_SPURR);
 SYSFS_PMCSETUP(dscr, SPRN_DSCR);
 
-static SYSDEV_ATTR(mmcra, 0600, show_mmcra, store_mmcra);
-static SYSDEV_ATTR(spurr, 0600, show_spurr, NULL);
-static SYSDEV_ATTR(dscr, 0600, show_dscr, store_dscr);
-static SYSDEV_ATTR(purr, 0600, show_purr, store_purr);
+static DEVICE_ATTR(mmcra, 0600, show_mmcra, store_mmcra);
+static DEVICE_ATTR(spurr, 0600, show_spurr, NULL);
+static DEVICE_ATTR(dscr, 0600, show_dscr, store_dscr);
+static DEVICE_ATTR(purr, 0600, show_purr, store_purr);
 
 unsigned long dscr_default = 0;
 EXPORT_SYMBOL(dscr_default);
 
-static ssize_t show_dscr_default(struct sysdev_class *class,
-		struct sysdev_class_attribute *attr, char *buf)
+static ssize_t show_dscr_default(struct device *dev,
+		struct device_attribute *attr, char *buf)
 {
 	return sprintf(buf, "%lx\n", dscr_default);
 }
 
-static ssize_t __used store_dscr_default(struct sysdev_class *class,
-		struct sysdev_class_attribute *attr, const char *buf,
+static ssize_t __used store_dscr_default(struct device *dev,
+		struct device_attribute *attr, const char *buf,
 		size_t count)
 {
 	unsigned long val;
@@ -207,15 +207,14 @@ static ssize_t __used store_dscr_default(struct sysdev_class *class,
 	return count;
 }
 
-static SYSDEV_CLASS_ATTR(dscr_default, 0600,
+static DEVICE_ATTR(dscr_default, 0600,
 		show_dscr_default, store_dscr_default);
 
 static void sysfs_create_dscr_default(void)
 {
 	int err = 0;
 	if (cpu_has_feature(CPU_FTR_DSCR))
-		err = sysfs_create_file(&cpu_sysdev_class.kset.kobj,
-			&attr_dscr_default.attr);
+		err = device_create_file(cpu_subsys.dev_root, &dev_attr_dscr_default);
 }
 #endif /* CONFIG_PPC64 */
 
@@ -259,72 +258,72 @@ SYSFS_PMCSETUP(tsr3, SPRN_PA6T_TSR3);
 #endif /* HAS_PPC_PMC_PA6T */
 
 #ifdef HAS_PPC_PMC_IBM
-static struct sysdev_attribute ibm_common_attrs[] = {
-	_SYSDEV_ATTR(mmcr0, 0600, show_mmcr0, store_mmcr0),
-	_SYSDEV_ATTR(mmcr1, 0600, show_mmcr1, store_mmcr1),
+static struct device_attribute ibm_common_attrs[] = {
+	__ATTR(mmcr0, 0600, show_mmcr0, store_mmcr0),
+	__ATTR(mmcr1, 0600, show_mmcr1, store_mmcr1),
 };
 #endif /* HAS_PPC_PMC_G4 */
 
 #ifdef HAS_PPC_PMC_G4
-static struct sysdev_attribute g4_common_attrs[] = {
-	_SYSDEV_ATTR(mmcr0, 0600, show_mmcr0, store_mmcr0),
-	_SYSDEV_ATTR(mmcr1, 0600, show_mmcr1, store_mmcr1),
-	_SYSDEV_ATTR(mmcr2, 0600, show_mmcr2, store_mmcr2),
+static struct device_attribute g4_common_attrs[] = {
+	__ATTR(mmcr0, 0600, show_mmcr0, store_mmcr0),
+	__ATTR(mmcr1, 0600, show_mmcr1, store_mmcr1),
+	__ATTR(mmcr2, 0600, show_mmcr2, store_mmcr2),
 };
 #endif /* HAS_PPC_PMC_G4 */
 
-static struct sysdev_attribute classic_pmc_attrs[] = {
-	_SYSDEV_ATTR(pmc1, 0600, show_pmc1, store_pmc1),
-	_SYSDEV_ATTR(pmc2, 0600, show_pmc2, store_pmc2),
-	_SYSDEV_ATTR(pmc3, 0600, show_pmc3, store_pmc3),
-	_SYSDEV_ATTR(pmc4, 0600, show_pmc4, store_pmc4),
-	_SYSDEV_ATTR(pmc5, 0600, show_pmc5, store_pmc5),
-	_SYSDEV_ATTR(pmc6, 0600, show_pmc6, store_pmc6),
+static struct device_attribute classic_pmc_attrs[] = {
+	__ATTR(pmc1, 0600, show_pmc1, store_pmc1),
+	__ATTR(pmc2, 0600, show_pmc2, store_pmc2),
+	__ATTR(pmc3, 0600, show_pmc3, store_pmc3),
+	__ATTR(pmc4, 0600, show_pmc4, store_pmc4),
+	__ATTR(pmc5, 0600, show_pmc5, store_pmc5),
+	__ATTR(pmc6, 0600, show_pmc6, store_pmc6),
 #ifdef CONFIG_PPC64
-	_SYSDEV_ATTR(pmc7, 0600, show_pmc7, store_pmc7),
-	_SYSDEV_ATTR(pmc8, 0600, show_pmc8, store_pmc8),
+	__ATTR(pmc7, 0600, show_pmc7, store_pmc7),
+	__ATTR(pmc8, 0600, show_pmc8, store_pmc8),
 #endif
 };
 
 #ifdef HAS_PPC_PMC_PA6T
-static struct sysdev_attribute pa6t_attrs[] = {
-	_SYSDEV_ATTR(mmcr0, 0600, show_mmcr0, store_mmcr0),
-	_SYSDEV_ATTR(mmcr1, 0600, show_mmcr1, store_mmcr1),
-	_SYSDEV_ATTR(pmc0, 0600, show_pa6t_pmc0, store_pa6t_pmc0),
-	_SYSDEV_ATTR(pmc1, 0600, show_pa6t_pmc1, store_pa6t_pmc1),
-	_SYSDEV_ATTR(pmc2, 0600, show_pa6t_pmc2, store_pa6t_pmc2),
-	_SYSDEV_ATTR(pmc3, 0600, show_pa6t_pmc3, store_pa6t_pmc3),
-	_SYSDEV_ATTR(pmc4, 0600, show_pa6t_pmc4, store_pa6t_pmc4),
-	_SYSDEV_ATTR(pmc5, 0600, show_pa6t_pmc5, store_pa6t_pmc5),
+static struct device_attribute pa6t_attrs[] = {
+	__ATTR(mmcr0, 0600, show_mmcr0, store_mmcr0),
+	__ATTR(mmcr1, 0600, show_mmcr1, store_mmcr1),
+	__ATTR(pmc0, 0600, show_pa6t_pmc0, store_pa6t_pmc0),
+	__ATTR(pmc1, 0600, show_pa6t_pmc1, store_pa6t_pmc1),
+	__ATTR(pmc2, 0600, show_pa6t_pmc2, store_pa6t_pmc2),
+	__ATTR(pmc3, 0600, show_pa6t_pmc3, store_pa6t_pmc3),
+	__ATTR(pmc4, 0600, show_pa6t_pmc4, store_pa6t_pmc4),
+	__ATTR(pmc5, 0600, show_pa6t_pmc5, store_pa6t_pmc5),
 #ifdef CONFIG_DEBUG_KERNEL
-	_SYSDEV_ATTR(hid0, 0600, show_hid0, store_hid0),
-	_SYSDEV_ATTR(hid1, 0600, show_hid1, store_hid1),
-	_SYSDEV_ATTR(hid4, 0600, show_hid4, store_hid4),
-	_SYSDEV_ATTR(hid5, 0600, show_hid5, store_hid5),
-	_SYSDEV_ATTR(ima0, 0600, show_ima0, store_ima0),
-	_SYSDEV_ATTR(ima1, 0600, show_ima1, store_ima1),
-	_SYSDEV_ATTR(ima2, 0600, show_ima2, store_ima2),
-	_SYSDEV_ATTR(ima3, 0600, show_ima3, store_ima3),
-	_SYSDEV_ATTR(ima4, 0600, show_ima4, store_ima4),
-	_SYSDEV_ATTR(ima5, 0600, show_ima5, store_ima5),
-	_SYSDEV_ATTR(ima6, 0600, show_ima6, store_ima6),
-	_SYSDEV_ATTR(ima7, 0600, show_ima7, store_ima7),
-	_SYSDEV_ATTR(ima8, 0600, show_ima8, store_ima8),
-	_SYSDEV_ATTR(ima9, 0600, show_ima9, store_ima9),
-	_SYSDEV_ATTR(imaat, 0600, show_imaat, store_imaat),
-	_SYSDEV_ATTR(btcr, 0600, show_btcr, store_btcr),
-	_SYSDEV_ATTR(pccr, 0600, show_pccr, store_pccr),
-	_SYSDEV_ATTR(rpccr, 0600, show_rpccr, store_rpccr),
-	_SYSDEV_ATTR(der, 0600, show_der, store_der),
-	_SYSDEV_ATTR(mer, 0600, show_mer, store_mer),
-	_SYSDEV_ATTR(ber, 0600, show_ber, store_ber),
-	_SYSDEV_ATTR(ier, 0600, show_ier, store_ier),
-	_SYSDEV_ATTR(sier, 0600, show_sier, store_sier),
-	_SYSDEV_ATTR(siar, 0600, show_siar, store_siar),
-	_SYSDEV_ATTR(tsr0, 0600, show_tsr0, store_tsr0),
-	_SYSDEV_ATTR(tsr1, 0600, show_tsr1, store_tsr1),
-	_SYSDEV_ATTR(tsr2, 0600, show_tsr2, store_tsr2),
-	_SYSDEV_ATTR(tsr3, 0600, show_tsr3, store_tsr3),
+	__ATTR(hid0, 0600, show_hid0, store_hid0),
+	__ATTR(hid1, 0600, show_hid1, store_hid1),
+	__ATTR(hid4, 0600, show_hid4, store_hid4),
+	__ATTR(hid5, 0600, show_hid5, store_hid5),
+	__ATTR(ima0, 0600, show_ima0, store_ima0),
+	__ATTR(ima1, 0600, show_ima1, store_ima1),
+	__ATTR(ima2, 0600, show_ima2, store_ima2),
+	__ATTR(ima3, 0600, show_ima3, store_ima3),
+	__ATTR(ima4, 0600, show_ima4, store_ima4),
+	__ATTR(ima5, 0600, show_ima5, store_ima5),
+	__ATTR(ima6, 0600, show_ima6, store_ima6),
+	__ATTR(ima7, 0600, show_ima7, store_ima7),
+	__ATTR(ima8, 0600, show_ima8, store_ima8),
+	__ATTR(ima9, 0600, show_ima9, store_ima9),
+	__ATTR(imaat, 0600, show_imaat, store_imaat),
+	__ATTR(btcr, 0600, show_btcr, store_btcr),
+	__ATTR(pccr, 0600, show_pccr, store_pccr),
+	__ATTR(rpccr, 0600, show_rpccr, store_rpccr),
+	__ATTR(der, 0600, show_der, store_der),
+	__ATTR(mer, 0600, show_mer, store_mer),
+	__ATTR(ber, 0600, show_ber, store_ber),
+	__ATTR(ier, 0600, show_ier, store_ier),
+	__ATTR(sier, 0600, show_sier, store_sier),
+	__ATTR(siar, 0600, show_siar, store_siar),
+	__ATTR(tsr0, 0600, show_tsr0, store_tsr0),
+	__ATTR(tsr1, 0600, show_tsr1, store_tsr1),
+	__ATTR(tsr2, 0600, show_tsr2, store_tsr2),
+	__ATTR(tsr3, 0600, show_tsr3, store_tsr3),
 #endif /* CONFIG_DEBUG_KERNEL */
 };
 #endif /* HAS_PPC_PMC_PA6T */
@@ -333,14 +332,14 @@ static struct sysdev_attribute pa6t_attrs[] = {
 static void __cpuinit register_cpu_online(unsigned int cpu)
 {
 	struct cpu *c = &per_cpu(cpu_devices, cpu);
-	struct sys_device *s = &c->sysdev;
-	struct sysdev_attribute *attrs, *pmc_attrs;
+	struct device *s = &c->dev;
+	struct device_attribute *attrs, *pmc_attrs;
 	int i, nattrs;
 
 #ifdef CONFIG_PPC64
 	if (!firmware_has_feature(FW_FEATURE_ISERIES) &&
 			cpu_has_feature(CPU_FTR_SMT))
-		sysdev_create_file(s, &attr_smt_snooze_delay);
+		device_create_file(s, &dev_attr_smt_snooze_delay);
 #endif
 
 	/* PMC stuff */
@@ -348,14 +347,14 @@ static void __cpuinit register_cpu_online(unsigned int cpu)
 #ifdef HAS_PPC_PMC_IBM
 	case PPC_PMC_IBM:
 		attrs = ibm_common_attrs;
-		nattrs = sizeof(ibm_common_attrs) / sizeof(struct sysdev_attribute);
+		nattrs = sizeof(ibm_common_attrs) / sizeof(struct device_attribute);
 		pmc_attrs = classic_pmc_attrs;
 		break;
 #endif /* HAS_PPC_PMC_IBM */
 #ifdef HAS_PPC_PMC_G4
 	case PPC_PMC_G4:
 		attrs = g4_common_attrs;
-		nattrs = sizeof(g4_common_attrs) / sizeof(struct sysdev_attribute);
+		nattrs = sizeof(g4_common_attrs) / sizeof(struct device_attribute);
 		pmc_attrs = classic_pmc_attrs;
 		break;
 #endif /* HAS_PPC_PMC_G4 */
@@ -363,7 +362,7 @@ static void __cpuinit register_cpu_online(unsigned int cpu)
 	case PPC_PMC_PA6T:
 		/* PA Semi starts counting at PMC0 */
 		attrs = pa6t_attrs;
-		nattrs = sizeof(pa6t_attrs) / sizeof(struct sysdev_attribute);
+		nattrs = sizeof(pa6t_attrs) / sizeof(struct device_attribute);
 		pmc_attrs = NULL;
 		break;
 #endif /* HAS_PPC_PMC_PA6T */
@@ -374,24 +373,24 @@ static void __cpuinit register_cpu_online(unsigned int cpu)
 	}
 
 	for (i = 0; i < nattrs; i++)
-		sysdev_create_file(s, &attrs[i]);
+		device_create_file(s, &attrs[i]);
 
 	if (pmc_attrs)
 		for (i = 0; i < cur_cpu_spec->num_pmcs; i++)
-			sysdev_create_file(s, &pmc_attrs[i]);
+			device_create_file(s, &pmc_attrs[i]);
 
 #ifdef CONFIG_PPC64
 	if (cpu_has_feature(CPU_FTR_MMCRA))
-		sysdev_create_file(s, &attr_mmcra);
+		device_create_file(s, &dev_attr_mmcra);
 
 	if (cpu_has_feature(CPU_FTR_PURR))
-		sysdev_create_file(s, &attr_purr);
+		device_create_file(s, &dev_attr_purr);
 
 	if (cpu_has_feature(CPU_FTR_SPURR))
-		sysdev_create_file(s, &attr_spurr);
+		device_create_file(s, &dev_attr_spurr);
 
 	if (cpu_has_feature(CPU_FTR_DSCR))
-		sysdev_create_file(s, &attr_dscr);
+		device_create_file(s, &dev_attr_dscr);
 #endif /* CONFIG_PPC64 */
 
 	cacheinfo_cpu_online(cpu);
@@ -401,8 +400,8 @@ static void __cpuinit register_cpu_online(unsigned int cpu)
 static void unregister_cpu_online(unsigned int cpu)
 {
 	struct cpu *c = &per_cpu(cpu_devices, cpu);
-	struct sys_device *s = &c->sysdev;
-	struct sysdev_attribute *attrs, *pmc_attrs;
+	struct device *s = &c->dev;
+	struct device_attribute *attrs, *pmc_attrs;
 	int i, nattrs;
 
 	BUG_ON(!c->hotpluggable);
@@ -410,7 +409,7 @@ static void unregister_cpu_online(unsigned int cpu)
 #ifdef CONFIG_PPC64
 	if (!firmware_has_feature(FW_FEATURE_ISERIES) &&
 			cpu_has_feature(CPU_FTR_SMT))
-		sysdev_remove_file(s, &attr_smt_snooze_delay);
+		device_remove_file(s, &dev_attr_smt_snooze_delay);
 #endif
 
 	/* PMC stuff */
@@ -418,14 +417,14 @@ static void unregister_cpu_online(unsigned int cpu)
 #ifdef HAS_PPC_PMC_IBM
 	case PPC_PMC_IBM:
 		attrs = ibm_common_attrs;
-		nattrs = sizeof(ibm_common_attrs) / sizeof(struct sysdev_attribute);
+		nattrs = sizeof(ibm_common_attrs) / sizeof(struct device_attribute);
 		pmc_attrs = classic_pmc_attrs;
 		break;
 #endif /* HAS_PPC_PMC_IBM */
 #ifdef HAS_PPC_PMC_G4
 	case PPC_PMC_G4:
 		attrs = g4_common_attrs;
-		nattrs = sizeof(g4_common_attrs) / sizeof(struct sysdev_attribute);
+		nattrs = sizeof(g4_common_attrs) / sizeof(struct device_attribute);
 		pmc_attrs = classic_pmc_attrs;
 		break;
 #endif /* HAS_PPC_PMC_G4 */
@@ -433,7 +432,7 @@ static void unregister_cpu_online(unsigned int cpu)
 	case PPC_PMC_PA6T:
 		/* PA Semi starts counting at PMC0 */
 		attrs = pa6t_attrs;
-		nattrs = sizeof(pa6t_attrs) / sizeof(struct sysdev_attribute);
+		nattrs = sizeof(pa6t_attrs) / sizeof(struct device_attribute);
 		pmc_attrs = NULL;
 		break;
 #endif /* HAS_PPC_PMC_PA6T */
@@ -444,24 +443,24 @@ static void unregister_cpu_online(unsigned int cpu)
 	}
 
 	for (i = 0; i < nattrs; i++)
-		sysdev_remove_file(s, &attrs[i]);
+		device_remove_file(s, &attrs[i]);
 
 	if (pmc_attrs)
 		for (i = 0; i < cur_cpu_spec->num_pmcs; i++)
-			sysdev_remove_file(s, &pmc_attrs[i]);
+			device_remove_file(s, &pmc_attrs[i]);
 
 #ifdef CONFIG_PPC64
 	if (cpu_has_feature(CPU_FTR_MMCRA))
-		sysdev_remove_file(s, &attr_mmcra);
+		device_remove_file(s, &dev_attr_mmcra);
 
 	if (cpu_has_feature(CPU_FTR_PURR))
-		sysdev_remove_file(s, &attr_purr);
+		device_remove_file(s, &dev_attr_purr);
 
 	if (cpu_has_feature(CPU_FTR_SPURR))
-		sysdev_remove_file(s, &attr_spurr);
+		device_remove_file(s, &dev_attr_spurr);
 
 	if (cpu_has_feature(CPU_FTR_DSCR))
-		sysdev_remove_file(s, &attr_dscr);
+		device_remove_file(s, &dev_attr_dscr);
 #endif /* CONFIG_PPC64 */
 
 	cacheinfo_cpu_offline(cpu);
@@ -513,70 +512,70 @@ static struct notifier_block __cpuinitdata sysfs_cpu_nb = {
 
 static DEFINE_MUTEX(cpu_mutex);
 
-int cpu_add_sysdev_attr(struct sysdev_attribute *attr)
+int cpu_add_dev_attr(struct device_attribute *attr)
 {
 	int cpu;
 
 	mutex_lock(&cpu_mutex);
 
 	for_each_possible_cpu(cpu) {
-		sysdev_create_file(get_cpu_sysdev(cpu), attr);
+		device_create_file(get_cpu_device(cpu), attr);
 	}
 
 	mutex_unlock(&cpu_mutex);
 	return 0;
 }
-EXPORT_SYMBOL_GPL(cpu_add_sysdev_attr);
+EXPORT_SYMBOL_GPL(cpu_add_dev_attr);
 
-int cpu_add_sysdev_attr_group(struct attribute_group *attrs)
+int cpu_add_dev_attr_group(struct attribute_group *attrs)
 {
 	int cpu;
-	struct sys_device *sysdev;
+	struct device *dev;
 	int ret;
 
 	mutex_lock(&cpu_mutex);
 
 	for_each_possible_cpu(cpu) {
-		sysdev = get_cpu_sysdev(cpu);
-		ret = sysfs_create_group(&sysdev->kobj, attrs);
+		dev = get_cpu_device(cpu);
+		ret = sysfs_create_group(&dev->kobj, attrs);
 		WARN_ON(ret != 0);
 	}
 
 	mutex_unlock(&cpu_mutex);
 	return 0;
 }
-EXPORT_SYMBOL_GPL(cpu_add_sysdev_attr_group);
+EXPORT_SYMBOL_GPL(cpu_add_dev_attr_group);
 
 
-void cpu_remove_sysdev_attr(struct sysdev_attribute *attr)
+void cpu_remove_dev_attr(struct device_attribute *attr)
 {
 	int cpu;
 
 	mutex_lock(&cpu_mutex);
 
 	for_each_possible_cpu(cpu) {
-		sysdev_remove_file(get_cpu_sysdev(cpu), attr);
+		device_remove_file(get_cpu_device(cpu), attr);
 	}
 
 	mutex_unlock(&cpu_mutex);
 }
-EXPORT_SYMBOL_GPL(cpu_remove_sysdev_attr);
+EXPORT_SYMBOL_GPL(cpu_remove_dev_attr);
 
-void cpu_remove_sysdev_attr_group(struct attribute_group *attrs)
+void cpu_remove_dev_attr_group(struct attribute_group *attrs)
 {
 	int cpu;
-	struct sys_device *sysdev;
+	struct device *dev;
 
 	mutex_lock(&cpu_mutex);
 
 	for_each_possible_cpu(cpu) {
-		sysdev = get_cpu_sysdev(cpu);
-		sysfs_remove_group(&sysdev->kobj, attrs);
+		dev = get_cpu_device(cpu);
+		sysfs_remove_group(&dev->kobj, attrs);
 	}
 
 	mutex_unlock(&cpu_mutex);
 }
-EXPORT_SYMBOL_GPL(cpu_remove_sysdev_attr_group);
+EXPORT_SYMBOL_GPL(cpu_remove_dev_attr_group);
 
 
 /* NUMA stuff */
@@ -590,7 +589,7 @@ static void register_nodes(void)
 		register_one_node(i);
 }
 
-int sysfs_add_device_to_node(struct sys_device *dev, int nid)
+int sysfs_add_device_to_node(struct device *dev, int nid)
 {
 	struct node *node = &node_devices[nid];
 	return sysfs_create_link(&node->sysdev.kobj, &dev->kobj,
@@ -598,7 +597,7 @@ int sysfs_add_device_to_node(struct sys_device *dev, int nid)
 }
 EXPORT_SYMBOL_GPL(sysfs_add_device_to_node);
 
-void sysfs_remove_device_from_node(struct sys_device *dev, int nid)
+void sysfs_remove_device_from_node(struct device *dev, int nid)
 {
 	struct node *node = &node_devices[nid];
 	sysfs_remove_link(&node->sysdev.kobj, kobject_name(&dev->kobj));
@@ -614,14 +613,14 @@ static void register_nodes(void)
 #endif
 
 /* Only valid if CPU is present. */
-static ssize_t show_physical_id(struct sys_device *dev,
-				struct sysdev_attribute *attr, char *buf)
+static ssize_t show_physical_id(struct device *dev,
+				struct device_attribute *attr, char *buf)
 {
-	struct cpu *cpu = container_of(dev, struct cpu, sysdev);
+	struct cpu *cpu = container_of(dev, struct cpu, dev);
 
-	return sprintf(buf, "%d\n", get_hard_smp_processor_id(cpu->sysdev.id));
+	return sprintf(buf, "%d\n", get_hard_smp_processor_id(cpu->dev.id));
 }
-static SYSDEV_ATTR(physical_id, 0444, show_physical_id, NULL);
+static DEVICE_ATTR(physical_id, 0444, show_physical_id, NULL);
 
 static int __init topology_init(void)
 {
@@ -646,7 +645,7 @@ static int __init topology_init(void)
 		if (cpu_online(cpu) || c->hotpluggable) {
 			register_cpu(c, cpu);
 
-			sysdev_create_file(&c->sysdev, &attr_physical_id);
+			device_create_file(&c->dev, &dev_attr_physical_id);
 		}
 
 		if (cpu_online(cpu))
diff --git a/arch/powerpc/mm/numa.c b/arch/powerpc/mm/numa.c
index c7dd4dec4df8..f2b03a863430 100644
--- a/arch/powerpc/mm/numa.c
+++ b/arch/powerpc/mm/numa.c
@@ -1452,7 +1452,7 @@ int arch_update_cpu_topology(void)
 {
 	int cpu, nid, old_nid;
 	unsigned int associativity[VPHN_ASSOC_BUFSIZE] = {0};
-	struct sys_device *sysdev;
+	struct device *dev;
 
 	for_each_cpu(cpu,&cpu_associativity_changes_mask) {
 		vphn_get_associativity(cpu, associativity);
@@ -1473,9 +1473,9 @@ int arch_update_cpu_topology(void)
 		register_cpu_under_node(cpu, nid);
 		put_online_cpus();
 
-		sysdev = get_cpu_sysdev(cpu);
-		if (sysdev)
-			kobject_uevent(&sysdev->kobj, KOBJ_CHANGE);
+		dev = get_cpu_device(cpu);
+		if (dev)
+			kobject_uevent(&dev->kobj, KOBJ_CHANGE);
 	}
 
 	return 1;
diff --git a/arch/powerpc/platforms/cell/cbe_thermal.c b/arch/powerpc/platforms/cell/cbe_thermal.c
index 4d4c8c169124..94560db788bf 100644
--- a/arch/powerpc/platforms/cell/cbe_thermal.c
+++ b/arch/powerpc/platforms/cell/cbe_thermal.c
@@ -46,7 +46,7 @@
  */
 
 #include <linux/module.h>
-#include <linux/sysdev.h>
+#include <linux/device.h>
 #include <linux/kernel.h>
 #include <linux/cpu.h>
 #include <asm/spu.h>
@@ -59,8 +59,8 @@
 #define TEMP_MIN 65
 #define TEMP_MAX 125
 
-#define SYSDEV_PREFIX_ATTR(_prefix,_name,_mode)			\
-struct sysdev_attribute attr_ ## _prefix ## _ ## _name = {	\
+#define DEVICE_PREFIX_ATTR(_prefix,_name,_mode)			\
+struct device_attribute attr_ ## _prefix ## _ ## _name = {	\
 	.attr = { .name = __stringify(_name), .mode = _mode },	\
 	.show	= _prefix ## _show_ ## _name,			\
 	.store	= _prefix ## _store_ ## _name,			\
@@ -76,36 +76,36 @@ static inline u8 temp_to_reg(u8 temp)
 	return ((temp - TEMP_MIN) >> 1) & 0x3f;
 }
 
-static struct cbe_pmd_regs __iomem *get_pmd_regs(struct sys_device *sysdev)
+static struct cbe_pmd_regs __iomem *get_pmd_regs(struct device *dev)
 {
 	struct spu *spu;
 
-	spu = container_of(sysdev, struct spu, sysdev);
+	spu = container_of(dev, struct spu, dev);
 
 	return cbe_get_pmd_regs(spu_devnode(spu));
 }
 
 /* returns the value for a given spu in a given register */
-static u8 spu_read_register_value(struct sys_device *sysdev, union spe_reg __iomem *reg)
+static u8 spu_read_register_value(struct device *dev, union spe_reg __iomem *reg)
 {
 	union spe_reg value;
 	struct spu *spu;
 
-	spu = container_of(sysdev, struct spu, sysdev);
+	spu = container_of(dev, struct spu, dev);
 	value.val = in_be64(&reg->val);
 
 	return value.spe[spu->spe_id];
 }
 
-static ssize_t spu_show_temp(struct sys_device *sysdev, struct sysdev_attribute *attr,
+static ssize_t spu_show_temp(struct device *dev, struct device_attribute *attr,
 			char *buf)
 {
 	u8 value;
 	struct cbe_pmd_regs __iomem *pmd_regs;
 
-	pmd_regs = get_pmd_regs(sysdev);
+	pmd_regs = get_pmd_regs(dev);
 
-	value = spu_read_register_value(sysdev, &pmd_regs->ts_ctsr1);
+	value = spu_read_register_value(dev, &pmd_regs->ts_ctsr1);
 
 	return sprintf(buf, "%d\n", reg_to_temp(value));
 }
@@ -147,48 +147,48 @@ static ssize_t store_throttle(struct cbe_pmd_regs __iomem *pmd_regs, const char
 	return size;
 }
 
-static ssize_t spu_show_throttle_end(struct sys_device *sysdev,
-			struct sysdev_attribute *attr, char *buf)
+static ssize_t spu_show_throttle_end(struct device *dev,
+			struct device_attribute *attr, char *buf)
 {
-	return show_throttle(get_pmd_regs(sysdev), buf, 0);
+	return show_throttle(get_pmd_regs(dev), buf, 0);
 }
 
-static ssize_t spu_show_throttle_begin(struct sys_device *sysdev,
-			struct sysdev_attribute *attr, char *buf)
+static ssize_t spu_show_throttle_begin(struct device *dev,
+			struct device_attribute *attr, char *buf)
 {
-	return show_throttle(get_pmd_regs(sysdev), buf, 8);
+	return show_throttle(get_pmd_regs(dev), buf, 8);
 }
 
-static ssize_t spu_show_throttle_full_stop(struct sys_device *sysdev,
-			struct sysdev_attribute *attr, char *buf)
+static ssize_t spu_show_throttle_full_stop(struct device *dev,
+			struct device_attribute *attr, char *buf)
 {
-	return show_throttle(get_pmd_regs(sysdev), buf, 16);
+	return show_throttle(get_pmd_regs(dev), buf, 16);
 }
 
-static ssize_t spu_store_throttle_end(struct sys_device *sysdev,
-			struct sysdev_attribute *attr, const char *buf, size_t size)
+static ssize_t spu_store_throttle_end(struct device *dev,
+			struct device_attribute *attr, const char *buf, size_t size)
 {
-	return store_throttle(get_pmd_regs(sysdev), buf, size, 0);
+	return store_throttle(get_pmd_regs(dev), buf, size, 0);
 }
 
-static ssize_t spu_store_throttle_begin(struct sys_device *sysdev,
-			struct sysdev_attribute *attr, const char *buf, size_t size)
+static ssize_t spu_store_throttle_begin(struct device *dev,
+			struct device_attribute *attr, const char *buf, size_t size)
 {
-	return store_throttle(get_pmd_regs(sysdev), buf, size, 8);
+	return store_throttle(get_pmd_regs(dev), buf, size, 8);
 }
 
-static ssize_t spu_store_throttle_full_stop(struct sys_device *sysdev,
-			struct sysdev_attribute *attr, const char *buf, size_t size)
+static ssize_t spu_store_throttle_full_stop(struct device *dev,
+			struct device_attribute *attr, const char *buf, size_t size)
 {
-	return store_throttle(get_pmd_regs(sysdev), buf, size, 16);
+	return store_throttle(get_pmd_regs(dev), buf, size, 16);
 }
 
-static ssize_t ppe_show_temp(struct sys_device *sysdev, char *buf, int pos)
+static ssize_t ppe_show_temp(struct device *dev, char *buf, int pos)
 {
 	struct cbe_pmd_regs __iomem *pmd_regs;
 	u64 value;
 
-	pmd_regs = cbe_get_cpu_pmd_regs(sysdev->id);
+	pmd_regs = cbe_get_cpu_pmd_regs(dev->id);
 	value = in_be64(&pmd_regs->ts_ctsr2);
 
 	value = (value >> pos) & 0x3f;
@@ -199,64 +199,64 @@ static ssize_t ppe_show_temp(struct sys_device *sysdev, char *buf, int pos)
 
 /* shows the temperature of the DTS on the PPE,
  * located near the linear thermal sensor */
-static ssize_t ppe_show_temp0(struct sys_device *sysdev,
-			struct sysdev_attribute *attr, char *buf)
+static ssize_t ppe_show_temp0(struct device *dev,
+			struct device_attribute *attr, char *buf)
 {
-	return ppe_show_temp(sysdev, buf, 32);
+	return ppe_show_temp(dev, buf, 32);
 }
 
 /* shows the temperature of the second DTS on the PPE */
-static ssize_t ppe_show_temp1(struct sys_device *sysdev,
-			struct sysdev_attribute *attr, char *buf)
+static ssize_t ppe_show_temp1(struct device *dev,
+			struct device_attribute *attr, char *buf)
 {
-	return ppe_show_temp(sysdev, buf, 0);
+	return ppe_show_temp(dev, buf, 0);
 }
 
-static ssize_t ppe_show_throttle_end(struct sys_device *sysdev,
-			struct sysdev_attribute *attr, char *buf)
+static ssize_t ppe_show_throttle_end(struct device *dev,
+			struct device_attribute *attr, char *buf)
 {
-	return show_throttle(cbe_get_cpu_pmd_regs(sysdev->id), buf, 32);
+	return show_throttle(cbe_get_cpu_pmd_regs(dev->id), buf, 32);
 }
 
-static ssize_t ppe_show_throttle_begin(struct sys_device *sysdev,
-			struct sysdev_attribute *attr, char *buf)
+static ssize_t ppe_show_throttle_begin(struct device *dev,
+			struct device_attribute *attr, char *buf)
 {
-	return show_throttle(cbe_get_cpu_pmd_regs(sysdev->id), buf, 40);
+	return show_throttle(cbe_get_cpu_pmd_regs(dev->id), buf, 40);
 }
 
-static ssize_t ppe_show_throttle_full_stop(struct sys_device *sysdev,
-			struct sysdev_attribute *attr, char *buf)
+static ssize_t ppe_show_throttle_full_stop(struct device *dev,
+			struct device_attribute *attr, char *buf)
 {
-	return show_throttle(cbe_get_cpu_pmd_regs(sysdev->id), buf, 48);
+	return show_throttle(cbe_get_cpu_pmd_regs(dev->id), buf, 48);
 }
 
-static ssize_t ppe_store_throttle_end(struct sys_device *sysdev,
-			struct sysdev_attribute *attr, const char *buf, size_t size)
+static ssize_t ppe_store_throttle_end(struct device *dev,
+			struct device_attribute *attr, const char *buf, size_t size)
 {
-	return store_throttle(cbe_get_cpu_pmd_regs(sysdev->id), buf, size, 32);
+	return store_throttle(cbe_get_cpu_pmd_regs(dev->id), buf, size, 32);
 }
 
-static ssize_t ppe_store_throttle_begin(struct sys_device *sysdev,
-			struct sysdev_attribute *attr, const char *buf, size_t size)
+static ssize_t ppe_store_throttle_begin(struct device *dev,
+			struct device_attribute *attr, const char *buf, size_t size)
 {
-	return store_throttle(cbe_get_cpu_pmd_regs(sysdev->id), buf, size, 40);
+	return store_throttle(cbe_get_cpu_pmd_regs(dev->id), buf, size, 40);
 }
 
-static ssize_t ppe_store_throttle_full_stop(struct sys_device *sysdev,
-			struct sysdev_attribute *attr, const char *buf, size_t size)
+static ssize_t ppe_store_throttle_full_stop(struct device *dev,
+			struct device_attribute *attr, const char *buf, size_t size)
 {
-	return store_throttle(cbe_get_cpu_pmd_regs(sysdev->id), buf, size, 48);
+	return store_throttle(cbe_get_cpu_pmd_regs(dev->id), buf, size, 48);
 }
 
 
-static struct sysdev_attribute attr_spu_temperature = {
+static struct device_attribute attr_spu_temperature = {
 	.attr = {.name = "temperature", .mode = 0400 },
 	.show = spu_show_temp,
 };
 
-static SYSDEV_PREFIX_ATTR(spu, throttle_end, 0600);
-static SYSDEV_PREFIX_ATTR(spu, throttle_begin, 0600);
-static SYSDEV_PREFIX_ATTR(spu, throttle_full_stop, 0600);
+static DEVICE_PREFIX_ATTR(spu, throttle_end, 0600);
+static DEVICE_PREFIX_ATTR(spu, throttle_begin, 0600);
+static DEVICE_PREFIX_ATTR(spu, throttle_full_stop, 0600);
 
 
 static struct attribute *spu_attributes[] = {
@@ -272,19 +272,19 @@ static struct attribute_group spu_attribute_group = {
 	.attrs	= spu_attributes,
 };
 
-static struct sysdev_attribute attr_ppe_temperature0 = {
+static struct device_attribute attr_ppe_temperature0 = {
 	.attr = {.name = "temperature0", .mode = 0400 },
 	.show = ppe_show_temp0,
 };
 
-static struct sysdev_attribute attr_ppe_temperature1 = {
+static struct device_attribute attr_ppe_temperature1 = {
 	.attr = {.name = "temperature1", .mode = 0400 },
 	.show = ppe_show_temp1,
 };
 
-static SYSDEV_PREFIX_ATTR(ppe, throttle_end, 0600);
-static SYSDEV_PREFIX_ATTR(ppe, throttle_begin, 0600);
-static SYSDEV_PREFIX_ATTR(ppe, throttle_full_stop, 0600);
+static DEVICE_PREFIX_ATTR(ppe, throttle_end, 0600);
+static DEVICE_PREFIX_ATTR(ppe, throttle_begin, 0600);
+static DEVICE_PREFIX_ATTR(ppe, throttle_full_stop, 0600);
 
 static struct attribute *ppe_attributes[] = {
 	&attr_ppe_temperature0.attr,
@@ -307,7 +307,7 @@ static int __init init_default_values(void)
 {
 	int cpu;
 	struct cbe_pmd_regs __iomem *pmd_regs;
-	struct sys_device *sysdev;
+	struct device *dev;
 	union ppe_spe_reg tpr;
 	union spe_reg str1;
 	u64 str2;
@@ -349,14 +349,14 @@ static int __init init_default_values(void)
 
 	for_each_possible_cpu (cpu) {
 		pr_debug("processing cpu %d\n", cpu);
-		sysdev = get_cpu_sysdev(cpu);
+		dev = get_cpu_device(cpu);
 
-		if (!sysdev) {
-			pr_info("invalid sysdev pointer for cbe_thermal\n");
+		if (!dev) {
+			pr_info("invalid dev pointer for cbe_thermal\n");
 			return -EINVAL;
 		}
 
-		pmd_regs = cbe_get_cpu_pmd_regs(sysdev->id);
+		pmd_regs = cbe_get_cpu_pmd_regs(dev->id);
 
 		if (!pmd_regs) {
 			pr_info("invalid CBE regs pointer for cbe_thermal\n");
@@ -379,8 +379,8 @@ static int __init thermal_init(void)
 	int rc = init_default_values();
 
 	if (rc == 0) {
-		spu_add_sysdev_attr_group(&spu_attribute_group);
-		cpu_add_sysdev_attr_group(&ppe_attribute_group);
+		spu_add_dev_attr_group(&spu_attribute_group);
+		cpu_add_dev_attr_group(&ppe_attribute_group);
 	}
 
 	return rc;
@@ -389,8 +389,8 @@ module_init(thermal_init);
 
 static void __exit thermal_exit(void)
 {
-	spu_remove_sysdev_attr_group(&spu_attribute_group);
-	cpu_remove_sysdev_attr_group(&ppe_attribute_group);
+	spu_remove_dev_attr_group(&spu_attribute_group);
+	cpu_remove_dev_attr_group(&ppe_attribute_group);
 }
 module_exit(thermal_exit);
 
diff --git a/arch/powerpc/platforms/cell/spu_base.c b/arch/powerpc/platforms/cell/spu_base.c
index 3675da73623f..1708fb7aba35 100644
--- a/arch/powerpc/platforms/cell/spu_base.c
+++ b/arch/powerpc/platforms/cell/spu_base.c
@@ -522,31 +522,32 @@ void spu_init_channels(struct spu *spu)
 }
 EXPORT_SYMBOL_GPL(spu_init_channels);
 
-static struct sysdev_class spu_sysdev_class = {
+static struct bus_type spu_subsys = {
 	.name = "spu",
+	.dev_name = "spu",
 };
 
-int spu_add_sysdev_attr(struct sysdev_attribute *attr)
+int spu_add_dev_attr(struct device_attribute *attr)
 {
 	struct spu *spu;
 
 	mutex_lock(&spu_full_list_mutex);
 	list_for_each_entry(spu, &spu_full_list, full_list)
-		sysdev_create_file(&spu->sysdev, attr);
+		device_create_file(&spu->dev, attr);
 	mutex_unlock(&spu_full_list_mutex);
 
 	return 0;
 }
-EXPORT_SYMBOL_GPL(spu_add_sysdev_attr);
+EXPORT_SYMBOL_GPL(spu_add_dev_attr);
 
-int spu_add_sysdev_attr_group(struct attribute_group *attrs)
+int spu_add_dev_attr_group(struct attribute_group *attrs)
 {
 	struct spu *spu;
 	int rc = 0;
 
 	mutex_lock(&spu_full_list_mutex);
 	list_for_each_entry(spu, &spu_full_list, full_list) {
-		rc = sysfs_create_group(&spu->sysdev.kobj, attrs);
+		rc = sysfs_create_group(&spu->dev.kobj, attrs);
 
 		/* we're in trouble here, but try unwinding anyway */
 		if (rc) {
@@ -555,7 +556,7 @@ int spu_add_sysdev_attr_group(struct attribute_group *attrs)
 
 			list_for_each_entry_continue_reverse(spu,
 					&spu_full_list, full_list)
-				sysfs_remove_group(&spu->sysdev.kobj, attrs);
+				sysfs_remove_group(&spu->dev.kobj, attrs);
 			break;
 		}
 	}
@@ -564,45 +565,45 @@ int spu_add_sysdev_attr_group(struct attribute_group *attrs)
 
 	return rc;
 }
-EXPORT_SYMBOL_GPL(spu_add_sysdev_attr_group);
+EXPORT_SYMBOL_GPL(spu_add_dev_attr_group);
 
 
-void spu_remove_sysdev_attr(struct sysdev_attribute *attr)
+void spu_remove_dev_attr(struct device_attribute *attr)
 {
 	struct spu *spu;
 
 	mutex_lock(&spu_full_list_mutex);
 	list_for_each_entry(spu, &spu_full_list, full_list)
-		sysdev_remove_file(&spu->sysdev, attr);
+		device_remove_file(&spu->dev, attr);
 	mutex_unlock(&spu_full_list_mutex);
 }
-EXPORT_SYMBOL_GPL(spu_remove_sysdev_attr);
+EXPORT_SYMBOL_GPL(spu_remove_dev_attr);
 
-void spu_remove_sysdev_attr_group(struct attribute_group *attrs)
+void spu_remove_dev_attr_group(struct attribute_group *attrs)
 {
 	struct spu *spu;
 
 	mutex_lock(&spu_full_list_mutex);
 	list_for_each_entry(spu, &spu_full_list, full_list)
-		sysfs_remove_group(&spu->sysdev.kobj, attrs);
+		sysfs_remove_group(&spu->dev.kobj, attrs);
 	mutex_unlock(&spu_full_list_mutex);
 }
-EXPORT_SYMBOL_GPL(spu_remove_sysdev_attr_group);
+EXPORT_SYMBOL_GPL(spu_remove_dev_attr_group);
 
-static int spu_create_sysdev(struct spu *spu)
+static int spu_create_dev(struct spu *spu)
 {
 	int ret;
 
-	spu->sysdev.id = spu->number;
-	spu->sysdev.cls = &spu_sysdev_class;
-	ret = sysdev_register(&spu->sysdev);
+	spu->dev.id = spu->number;
+	spu->dev.bus = &spu_subsys;
+	ret = device_register(&spu->dev);
 	if (ret) {
 		printk(KERN_ERR "Can't register SPU %d with sysfs\n",
 				spu->number);
 		return ret;
 	}
 
-	sysfs_add_device_to_node(&spu->sysdev, spu->node);
+	sysfs_add_device_to_node(&spu->dev, spu->node);
 
 	return 0;
 }
@@ -638,7 +639,7 @@ static int __init create_spu(void *data)
 	if (ret)
 		goto out_destroy;
 
-	ret = spu_create_sysdev(spu);
+	ret = spu_create_dev(spu);
 	if (ret)
 		goto out_free_irqs;
 
@@ -695,10 +696,10 @@ static unsigned long long spu_acct_time(struct spu *spu,
 }
 
 
-static ssize_t spu_stat_show(struct sys_device *sysdev,
-				struct sysdev_attribute *attr, char *buf)
+static ssize_t spu_stat_show(struct device *dev,
+				struct device_attribute *attr, char *buf)
 {
-	struct spu *spu = container_of(sysdev, struct spu, sysdev);
+	struct spu *spu = container_of(dev, struct spu, dev);
 
 	return sprintf(buf, "%s %llu %llu %llu %llu "
 		      "%llu %llu %llu %llu %llu %llu %llu %llu\n",
@@ -717,7 +718,7 @@ static ssize_t spu_stat_show(struct sys_device *sysdev,
 		spu->stats.libassist);
 }
 
-static SYSDEV_ATTR(stat, 0644, spu_stat_show, NULL);
+static DEVICE_ATTR(stat, 0644, spu_stat_show, NULL);
 
 #ifdef CONFIG_KEXEC
 
@@ -816,8 +817,8 @@ static int __init init_spu_base(void)
 	if (!spu_management_ops)
 		goto out;
 
-	/* create sysdev class for spus */
-	ret = sysdev_class_register(&spu_sysdev_class);
+	/* create system subsystem for spus */
+	ret = subsys_system_register(&spu_subsys, NULL);
 	if (ret)
 		goto out;
 
@@ -826,7 +827,7 @@ static int __init init_spu_base(void)
 	if (ret < 0) {
 		printk(KERN_WARNING "%s: Error initializing spus\n",
 			__func__);
-		goto out_unregister_sysdev_class;
+		goto out_unregister_subsys;
 	}
 
 	if (ret > 0)
@@ -836,15 +837,15 @@ static int __init init_spu_base(void)
 	xmon_register_spus(&spu_full_list);
 	crash_register_spus(&spu_full_list);
 	mutex_unlock(&spu_full_list_mutex);
-	spu_add_sysdev_attr(&attr_stat);
+	spu_add_dev_attr(&dev_attr_stat);
 	register_syscore_ops(&spu_syscore_ops);
 
 	spu_init_affinity();
 
 	return 0;
 
- out_unregister_sysdev_class:
-	sysdev_class_unregister(&spu_sysdev_class);
+ out_unregister_subsys:
+	bus_unregister(&spu_subsys);
  out:
 	return ret;
 }
diff --git a/arch/powerpc/platforms/pseries/pseries_energy.c b/arch/powerpc/platforms/pseries/pseries_energy.c
index c8b3c69fe891..af281dce510a 100644
--- a/arch/powerpc/platforms/pseries/pseries_energy.c
+++ b/arch/powerpc/platforms/pseries/pseries_energy.c
@@ -15,7 +15,7 @@
 #include <linux/errno.h>
 #include <linux/init.h>
 #include <linux/seq_file.h>
-#include <linux/sysdev.h>
+#include <linux/device.h>
 #include <linux/cpu.h>
 #include <linux/of.h>
 #include <asm/cputhreads.h>
@@ -184,7 +184,7 @@ static ssize_t get_best_energy_list(char *page, int activate)
 	return s-page;
 }
 
-static ssize_t get_best_energy_data(struct sys_device *dev,
+static ssize_t get_best_energy_data(struct device *dev,
 					char *page, int activate)
 {
 	int rc;
@@ -207,26 +207,26 @@ static ssize_t get_best_energy_data(struct sys_device *dev,
 
 /* Wrapper functions */
 
-static ssize_t cpu_activate_hint_list_show(struct sysdev_class *class,
-			struct sysdev_class_attribute *attr, char *page)
+static ssize_t cpu_activate_hint_list_show(struct device *dev,
+			struct device_attribute *attr, char *page)
 {
 	return get_best_energy_list(page, 1);
 }
 
-static ssize_t cpu_deactivate_hint_list_show(struct sysdev_class *class,
-			struct sysdev_class_attribute *attr, char *page)
+static ssize_t cpu_deactivate_hint_list_show(struct device *dev,
+			struct device_attribute *attr, char *page)
 {
 	return get_best_energy_list(page, 0);
 }
 
-static ssize_t percpu_activate_hint_show(struct sys_device *dev,
-			struct sysdev_attribute *attr, char *page)
+static ssize_t percpu_activate_hint_show(struct device *dev,
+			struct device_attribute *attr, char *page)
 {
 	return get_best_energy_data(dev, page, 1);
 }
 
-static ssize_t percpu_deactivate_hint_show(struct sys_device *dev,
-			struct sysdev_attribute *attr, char *page)
+static ssize_t percpu_deactivate_hint_show(struct device *dev,
+			struct device_attribute *attr, char *page)
 {
 	return get_best_energy_data(dev, page, 0);
 }
@@ -241,48 +241,48 @@ static ssize_t percpu_deactivate_hint_show(struct sys_device *dev,
  *	Per-cpu value of the hint
  */
 
-struct sysdev_class_attribute attr_cpu_activate_hint_list =
-		_SYSDEV_CLASS_ATTR(pseries_activate_hint_list, 0444,
+struct device_attribute attr_cpu_activate_hint_list =
+		__ATTR(pseries_activate_hint_list, 0444,
 		cpu_activate_hint_list_show, NULL);
 
-struct sysdev_class_attribute attr_cpu_deactivate_hint_list =
-		_SYSDEV_CLASS_ATTR(pseries_deactivate_hint_list, 0444,
+struct device_attribute attr_cpu_deactivate_hint_list =
+		__ATTR(pseries_deactivate_hint_list, 0444,
 		cpu_deactivate_hint_list_show, NULL);
 
-struct sysdev_attribute attr_percpu_activate_hint =
-		_SYSDEV_ATTR(pseries_activate_hint, 0444,
+struct device_attribute attr_percpu_activate_hint =
+		__ATTR(pseries_activate_hint, 0444,
 		percpu_activate_hint_show, NULL);
 
-struct sysdev_attribute attr_percpu_deactivate_hint =
-		_SYSDEV_ATTR(pseries_deactivate_hint, 0444,
+struct device_attribute attr_percpu_deactivate_hint =
+		__ATTR(pseries_deactivate_hint, 0444,
 		percpu_deactivate_hint_show, NULL);
 
 static int __init pseries_energy_init(void)
 {
 	int cpu, err;
-	struct sys_device *cpu_sys_dev;
+	struct device *cpu_dev;
 
 	if (!check_for_h_best_energy()) {
 		printk(KERN_INFO "Hypercall H_BEST_ENERGY not supported\n");
 		return 0;
 	}
 	/* Create the sysfs files */
-	err = sysfs_create_file(&cpu_sysdev_class.kset.kobj,
-				&attr_cpu_activate_hint_list.attr);
+	err = device_create_file(cpu_subsys.dev_root,
+				&attr_cpu_activate_hint_list);
 	if (!err)
-		err = sysfs_create_file(&cpu_sysdev_class.kset.kobj,
-				&attr_cpu_deactivate_hint_list.attr);
+		err = device_create_file(cpu_subsys.dev_root,
+				&attr_cpu_deactivate_hint_list);
 
 	if (err)
 		return err;
 	for_each_possible_cpu(cpu) {
-		cpu_sys_dev = get_cpu_sysdev(cpu);
-		err = sysfs_create_file(&cpu_sys_dev->kobj,
-				&attr_percpu_activate_hint.attr);
+		cpu_dev = get_cpu_device(cpu);
+		err = device_create_file(cpu_dev,
+				&attr_percpu_activate_hint);
 		if (err)
 			break;
-		err = sysfs_create_file(&cpu_sys_dev->kobj,
-				&attr_percpu_deactivate_hint.attr);
+		err = device_create_file(cpu_dev,
+				&attr_percpu_deactivate_hint);
 		if (err)
 			break;
 	}
@@ -298,23 +298,20 @@ static int __init pseries_energy_init(void)
 static void __exit pseries_energy_cleanup(void)
 {
 	int cpu;
-	struct sys_device *cpu_sys_dev;
+	struct device *cpu_dev;
 
 	if (!sysfs_entries)
 		return;
 
 	/* Remove the sysfs files */
-	sysfs_remove_file(&cpu_sysdev_class.kset.kobj,
-				&attr_cpu_activate_hint_list.attr);
-
-	sysfs_remove_file(&cpu_sysdev_class.kset.kobj,
-				&attr_cpu_deactivate_hint_list.attr);
+	device_remove_file(cpu_subsys.dev_root, &attr_cpu_activate_hint_list);
+	device_remove_file(cpu_subsys.dev_root, &attr_cpu_deactivate_hint_list);
 
 	for_each_possible_cpu(cpu) {
-		cpu_sys_dev = get_cpu_sysdev(cpu);
-		sysfs_remove_file(&cpu_sys_dev->kobj,
+		cpu_dev = get_cpu_device(cpu);
+		sysfs_remove_file(&cpu_dev->kobj,
 				&attr_percpu_activate_hint.attr);
-		sysfs_remove_file(&cpu_sys_dev->kobj,
+		sysfs_remove_file(&cpu_dev->kobj,
 				&attr_percpu_deactivate_hint.attr);
 	}
 }
diff --git a/arch/powerpc/sysdev/ppc4xx_cpm.c b/arch/powerpc/sysdev/ppc4xx_cpm.c
index 73b86cc5ea74..82e2cfe35c62 100644
--- a/arch/powerpc/sysdev/ppc4xx_cpm.c
+++ b/arch/powerpc/sysdev/ppc4xx_cpm.c
@@ -179,12 +179,12 @@ static struct kobj_attribute cpm_idle_attr =
 
 static void cpm_idle_config_sysfs(void)
 {
-	struct sys_device *sys_dev;
+	struct device *dev;
 	unsigned long ret;
 
-	sys_dev = get_cpu_sysdev(0);
+	dev = get_cpu_device(0);
 
-	ret = sysfs_create_file(&sys_dev->kobj,
+	ret = sysfs_create_file(&dev->kobj,
 				&cpm_idle_attr.attr);
 	if (ret)
 		printk(KERN_WARNING
diff --git a/arch/s390/kernel/smp.c b/arch/s390/kernel/smp.c
index 3ea872890da2..66cca03c0282 100644
--- a/arch/s390/kernel/smp.c
+++ b/arch/s390/kernel/smp.c
@@ -831,8 +831,8 @@ int setup_profiling_timer(unsigned int multiplier)
 }
 
 #ifdef CONFIG_HOTPLUG_CPU
-static ssize_t cpu_configure_show(struct sys_device *dev,
-				struct sysdev_attribute *attr, char *buf)
+static ssize_t cpu_configure_show(struct device *dev,
+				struct device_attribute *attr, char *buf)
 {
 	ssize_t count;
 
@@ -842,8 +842,8 @@ static ssize_t cpu_configure_show(struct sys_device *dev,
 	return count;
 }
 
-static ssize_t cpu_configure_store(struct sys_device *dev,
-				  struct sysdev_attribute *attr,
+static ssize_t cpu_configure_store(struct device *dev,
+				  struct device_attribute *attr,
 				  const char *buf, size_t count)
 {
 	int cpu = dev->id;
@@ -889,11 +889,11 @@ out:
 	put_online_cpus();
 	return rc ? rc : count;
 }
-static SYSDEV_ATTR(configure, 0644, cpu_configure_show, cpu_configure_store);
+static DEVICE_ATTR(configure, 0644, cpu_configure_show, cpu_configure_store);
 #endif /* CONFIG_HOTPLUG_CPU */
 
-static ssize_t cpu_polarization_show(struct sys_device *dev,
-				     struct sysdev_attribute *attr, char *buf)
+static ssize_t cpu_polarization_show(struct device *dev,
+				     struct device_attribute *attr, char *buf)
 {
 	int cpu = dev->id;
 	ssize_t count;
@@ -919,22 +919,22 @@ static ssize_t cpu_polarization_show(struct sys_device *dev,
 	mutex_unlock(&smp_cpu_state_mutex);
 	return count;
 }
-static SYSDEV_ATTR(polarization, 0444, cpu_polarization_show, NULL);
+static DEVICE_ATTR(polarization, 0444, cpu_polarization_show, NULL);
 
-static ssize_t show_cpu_address(struct sys_device *dev,
-				struct sysdev_attribute *attr, char *buf)
+static ssize_t show_cpu_address(struct device *dev,
+				struct device_attribute *attr, char *buf)
 {
 	return sprintf(buf, "%d\n", __cpu_logical_map[dev->id]);
 }
-static SYSDEV_ATTR(address, 0444, show_cpu_address, NULL);
+static DEVICE_ATTR(address, 0444, show_cpu_address, NULL);
 
 
 static struct attribute *cpu_common_attrs[] = {
 #ifdef CONFIG_HOTPLUG_CPU
-	&attr_configure.attr,
+	&dev_attr_configure.attr,
 #endif
-	&attr_address.attr,
-	&attr_polarization.attr,
+	&dev_attr_address.attr,
+	&dev_attr_polarization.attr,
 	NULL,
 };
 
@@ -942,8 +942,8 @@ static struct attribute_group cpu_common_attr_group = {
 	.attrs = cpu_common_attrs,
 };
 
-static ssize_t show_capability(struct sys_device *dev,
-				struct sysdev_attribute *attr, char *buf)
+static ssize_t show_capability(struct device *dev,
+				struct device_attribute *attr, char *buf)
 {
 	unsigned int capability;
 	int rc;
@@ -953,10 +953,10 @@ static ssize_t show_capability(struct sys_device *dev,
 		return rc;
 	return sprintf(buf, "%u\n", capability);
 }
-static SYSDEV_ATTR(capability, 0444, show_capability, NULL);
+static DEVICE_ATTR(capability, 0444, show_capability, NULL);
 
-static ssize_t show_idle_count(struct sys_device *dev,
-				struct sysdev_attribute *attr, char *buf)
+static ssize_t show_idle_count(struct device *dev,
+				struct device_attribute *attr, char *buf)
 {
 	struct s390_idle_data *idle;
 	unsigned long long idle_count;
@@ -976,10 +976,10 @@ repeat:
 		goto repeat;
 	return sprintf(buf, "%llu\n", idle_count);
 }
-static SYSDEV_ATTR(idle_count, 0444, show_idle_count, NULL);
+static DEVICE_ATTR(idle_count, 0444, show_idle_count, NULL);
 
-static ssize_t show_idle_time(struct sys_device *dev,
-				struct sysdev_attribute *attr, char *buf)
+static ssize_t show_idle_time(struct device *dev,
+				struct device_attribute *attr, char *buf)
 {
 	struct s390_idle_data *idle;
 	unsigned long long now, idle_time, idle_enter;
@@ -1001,12 +1001,12 @@ repeat:
 		goto repeat;
 	return sprintf(buf, "%llu\n", idle_time >> 12);
 }
-static SYSDEV_ATTR(idle_time_us, 0444, show_idle_time, NULL);
+static DEVICE_ATTR(idle_time_us, 0444, show_idle_time, NULL);
 
 static struct attribute *cpu_online_attrs[] = {
-	&attr_capability.attr,
-	&attr_idle_count.attr,
-	&attr_idle_time_us.attr,
+	&dev_attr_capability.attr,
+	&dev_attr_idle_count.attr,
+	&dev_attr_idle_time_us.attr,
 	NULL,
 };
 
@@ -1019,7 +1019,7 @@ static int __cpuinit smp_cpu_notify(struct notifier_block *self,
 {
 	unsigned int cpu = (unsigned int)(long)hcpu;
 	struct cpu *c = &per_cpu(cpu_devices, cpu);
-	struct sys_device *s = &c->sysdev;
+	struct device *s = &c->dev;
 	struct s390_idle_data *idle;
 	int err = 0;
 
@@ -1045,7 +1045,7 @@ static struct notifier_block __cpuinitdata smp_cpu_nb = {
 static int __devinit smp_add_present_cpu(int cpu)
 {
 	struct cpu *c = &per_cpu(cpu_devices, cpu);
-	struct sys_device *s = &c->sysdev;
+	struct device *s = &c->dev;
 	int rc;
 
 	c->hotpluggable = 1;
@@ -1098,8 +1098,8 @@ out:
 	return rc;
 }
 
-static ssize_t __ref rescan_store(struct sysdev_class *class,
-				  struct sysdev_class_attribute *attr,
+static ssize_t __ref rescan_store(struct device *dev,
+				  struct device_attribute *attr,
 				  const char *buf,
 				  size_t count)
 {
@@ -1108,11 +1108,11 @@ static ssize_t __ref rescan_store(struct sysdev_class *class,
 	rc = smp_rescan_cpus();
 	return rc ? rc : count;
 }
-static SYSDEV_CLASS_ATTR(rescan, 0200, NULL, rescan_store);
+static DEVICE_ATTR(rescan, 0200, NULL, rescan_store);
 #endif /* CONFIG_HOTPLUG_CPU */
 
-static ssize_t dispatching_show(struct sysdev_class *class,
-				struct sysdev_class_attribute *attr,
+static ssize_t dispatching_show(struct device *dev,
+				struct device_attribute *attr,
 				char *buf)
 {
 	ssize_t count;
@@ -1123,8 +1123,8 @@ static ssize_t dispatching_show(struct sysdev_class *class,
 	return count;
 }
 
-static ssize_t dispatching_store(struct sysdev_class *dev,
-				 struct sysdev_class_attribute *attr,
+static ssize_t dispatching_store(struct device *dev,
+				 struct device_attribute *attr,
 				 const char *buf,
 				 size_t count)
 {
@@ -1148,7 +1148,7 @@ out:
 	put_online_cpus();
 	return rc ? rc : count;
 }
-static SYSDEV_CLASS_ATTR(dispatching, 0644, dispatching_show,
+static DEVICE_ATTR(dispatching, 0644, dispatching_show,
 			 dispatching_store);
 
 static int __init topology_init(void)
@@ -1159,11 +1159,11 @@ static int __init topology_init(void)
 	register_cpu_notifier(&smp_cpu_nb);
 
 #ifdef CONFIG_HOTPLUG_CPU
-	rc = sysdev_class_create_file(&cpu_sysdev_class, &attr_rescan);
+	rc = device_create_file(cpu_subsys.dev_root, &dev_attr_rescan);
 	if (rc)
 		return rc;
 #endif
-	rc = sysdev_class_create_file(&cpu_sysdev_class, &attr_dispatching);
+	rc = device_create_file(cpu_subsys.dev_root, &dev_attr_dispatching);
 	if (rc)
 		return rc;
 	for_each_present_cpu(cpu) {
diff --git a/arch/s390/kernel/topology.c b/arch/s390/kernel/topology.c
index 77b8942b9a15..6dfc524c31aa 100644
--- a/arch/s390/kernel/topology.c
+++ b/arch/s390/kernel/topology.c
@@ -230,7 +230,7 @@ void store_topology(struct sysinfo_15_1_x *info)
 int arch_update_cpu_topology(void)
 {
 	struct sysinfo_15_1_x *info = tl_info;
-	struct sys_device *sysdev;
+	struct device *dev;
 	int cpu;
 
 	if (!MACHINE_HAS_TOPOLOGY) {
@@ -242,8 +242,8 @@ int arch_update_cpu_topology(void)
 	tl_to_cores(info);
 	update_cpu_core_map();
 	for_each_online_cpu(cpu) {
-		sysdev = get_cpu_sysdev(cpu);
-		kobject_uevent(&sysdev->kobj, KOBJ_CHANGE);
+		dev = get_cpu_device(cpu);
+		kobject_uevent(&dev->kobj, KOBJ_CHANGE);
 	}
 	return 1;
 }
diff --git a/arch/sh/kernel/cpu/sh4/sq.c b/arch/sh/kernel/cpu/sh4/sq.c
index f0907995b4c9..a8140f0bbf6c 100644
--- a/arch/sh/kernel/cpu/sh4/sq.c
+++ b/arch/sh/kernel/cpu/sh4/sq.c
@@ -13,7 +13,7 @@
 #include <linux/init.h>
 #include <linux/cpu.h>
 #include <linux/bitmap.h>
-#include <linux/sysdev.h>
+#include <linux/device.h>
 #include <linux/kernel.h>
 #include <linux/module.h>
 #include <linux/slab.h>
@@ -337,9 +337,9 @@ static struct kobj_type ktype_percpu_entry = {
 	.default_attrs	= sq_sysfs_attrs,
 };
 
-static int __devinit sq_sysdev_add(struct sys_device *sysdev)
+static int __devinit sq_dev_add(struct device *dev)
 {
-	unsigned int cpu = sysdev->id;
+	unsigned int cpu = dev->id;
 	struct kobject *kobj;
 	int error;
 
@@ -348,25 +348,27 @@ static int __devinit sq_sysdev_add(struct sys_device *sysdev)
 		return -ENOMEM;
 
 	kobj = sq_kobject[cpu];
-	error = kobject_init_and_add(kobj, &ktype_percpu_entry, &sysdev->kobj,
+	error = kobject_init_and_add(kobj, &ktype_percpu_entry, &dev->kobj,
 				     "%s", "sq");
 	if (!error)
 		kobject_uevent(kobj, KOBJ_ADD);
 	return error;
 }
 
-static int __devexit sq_sysdev_remove(struct sys_device *sysdev)
+static int __devexit sq_dev_remove(struct device *dev)
 {
-	unsigned int cpu = sysdev->id;
+	unsigned int cpu = dev->id;
 	struct kobject *kobj = sq_kobject[cpu];
 
 	kobject_put(kobj);
 	return 0;
 }
 
-static struct sysdev_driver sq_sysdev_driver = {
-	.add		= sq_sysdev_add,
-	.remove		= __devexit_p(sq_sysdev_remove),
+static struct subsys_interface sq_interface = {
+	.name		= "sq"
+	.subsys		= &cpu_subsys,
+	.add_dev	= sq_dev_add,
+	.remove_dev	= __devexit_p(sq_dev_remove),
 };
 
 static int __init sq_api_init(void)
@@ -386,7 +388,7 @@ static int __init sq_api_init(void)
 	if (unlikely(!sq_bitmap))
 		goto out;
 
-	ret = sysdev_driver_register(&cpu_sysdev_class, &sq_sysdev_driver);
+	ret = subsys_interface_register(&sq_interface);
 	if (unlikely(ret != 0))
 		goto out;
 
@@ -401,7 +403,7 @@ out:
 
 static void __exit sq_api_exit(void)
 {
-	sysdev_driver_unregister(&cpu_sysdev_class, &sq_sysdev_driver);
+	subsys_interface_unregister(&sq_interface);
 	kfree(sq_bitmap);
 	kmem_cache_destroy(sq_cache);
 }
diff --git a/arch/sparc/kernel/sysfs.c b/arch/sparc/kernel/sysfs.c
index 7408201d7efb..654e8aad3bbe 100644
--- a/arch/sparc/kernel/sysfs.c
+++ b/arch/sparc/kernel/sysfs.c
@@ -3,7 +3,7 @@
  * Copyright (C) 2007 David S. Miller <davem@davemloft.net>
  */
 #include <linux/sched.h>
-#include <linux/sysdev.h>
+#include <linux/device.h>
 #include <linux/cpu.h>
 #include <linux/smp.h>
 #include <linux/percpu.h>
@@ -16,13 +16,13 @@
 static DEFINE_PER_CPU(struct hv_mmu_statistics, mmu_stats) __attribute__((aligned(64)));
 
 #define SHOW_MMUSTAT_ULONG(NAME) \
-static ssize_t show_##NAME(struct sys_device *dev, \
-			struct sysdev_attribute *attr, char *buf) \
+static ssize_t show_##NAME(struct device *dev, \
+			struct device_attribute *attr, char *buf) \
 { \
 	struct hv_mmu_statistics *p = &per_cpu(mmu_stats, dev->id); \
 	return sprintf(buf, "%lu\n", p->NAME); \
 } \
-static SYSDEV_ATTR(NAME, 0444, show_##NAME, NULL)
+static DEVICE_ATTR(NAME, 0444, show_##NAME, NULL)
 
 SHOW_MMUSTAT_ULONG(immu_tsb_hits_ctx0_8k_tte);
 SHOW_MMUSTAT_ULONG(immu_tsb_ticks_ctx0_8k_tte);
@@ -58,38 +58,38 @@ SHOW_MMUSTAT_ULONG(dmmu_tsb_hits_ctxnon0_256mb_tte);
 SHOW_MMUSTAT_ULONG(dmmu_tsb_ticks_ctxnon0_256mb_tte);
 
 static struct attribute *mmu_stat_attrs[] = {
-	&attr_immu_tsb_hits_ctx0_8k_tte.attr,
-	&attr_immu_tsb_ticks_ctx0_8k_tte.attr,
-	&attr_immu_tsb_hits_ctx0_64k_tte.attr,
-	&attr_immu_tsb_ticks_ctx0_64k_tte.attr,
-	&attr_immu_tsb_hits_ctx0_4mb_tte.attr,
-	&attr_immu_tsb_ticks_ctx0_4mb_tte.attr,
-	&attr_immu_tsb_hits_ctx0_256mb_tte.attr,
-	&attr_immu_tsb_ticks_ctx0_256mb_tte.attr,
-	&attr_immu_tsb_hits_ctxnon0_8k_tte.attr,
-	&attr_immu_tsb_ticks_ctxnon0_8k_tte.attr,
-	&attr_immu_tsb_hits_ctxnon0_64k_tte.attr,
-	&attr_immu_tsb_ticks_ctxnon0_64k_tte.attr,
-	&attr_immu_tsb_hits_ctxnon0_4mb_tte.attr,
-	&attr_immu_tsb_ticks_ctxnon0_4mb_tte.attr,
-	&attr_immu_tsb_hits_ctxnon0_256mb_tte.attr,
-	&attr_immu_tsb_ticks_ctxnon0_256mb_tte.attr,
-	&attr_dmmu_tsb_hits_ctx0_8k_tte.attr,
-	&attr_dmmu_tsb_ticks_ctx0_8k_tte.attr,
-	&attr_dmmu_tsb_hits_ctx0_64k_tte.attr,
-	&attr_dmmu_tsb_ticks_ctx0_64k_tte.attr,
-	&attr_dmmu_tsb_hits_ctx0_4mb_tte.attr,
-	&attr_dmmu_tsb_ticks_ctx0_4mb_tte.attr,
-	&attr_dmmu_tsb_hits_ctx0_256mb_tte.attr,
-	&attr_dmmu_tsb_ticks_ctx0_256mb_tte.attr,
-	&attr_dmmu_tsb_hits_ctxnon0_8k_tte.attr,
-	&attr_dmmu_tsb_ticks_ctxnon0_8k_tte.attr,
-	&attr_dmmu_tsb_hits_ctxnon0_64k_tte.attr,
-	&attr_dmmu_tsb_ticks_ctxnon0_64k_tte.attr,
-	&attr_dmmu_tsb_hits_ctxnon0_4mb_tte.attr,
-	&attr_dmmu_tsb_ticks_ctxnon0_4mb_tte.attr,
-	&attr_dmmu_tsb_hits_ctxnon0_256mb_tte.attr,
-	&attr_dmmu_tsb_ticks_ctxnon0_256mb_tte.attr,
+	&dev_attr_immu_tsb_hits_ctx0_8k_tte.attr,
+	&dev_attr_immu_tsb_ticks_ctx0_8k_tte.attr,
+	&dev_attr_immu_tsb_hits_ctx0_64k_tte.attr,
+	&dev_attr_immu_tsb_ticks_ctx0_64k_tte.attr,
+	&dev_attr_immu_tsb_hits_ctx0_4mb_tte.attr,
+	&dev_attr_immu_tsb_ticks_ctx0_4mb_tte.attr,
+	&dev_attr_immu_tsb_hits_ctx0_256mb_tte.attr,
+	&dev_attr_immu_tsb_ticks_ctx0_256mb_tte.attr,
+	&dev_attr_immu_tsb_hits_ctxnon0_8k_tte.attr,
+	&dev_attr_immu_tsb_ticks_ctxnon0_8k_tte.attr,
+	&dev_attr_immu_tsb_hits_ctxnon0_64k_tte.attr,
+	&dev_attr_immu_tsb_ticks_ctxnon0_64k_tte.attr,
+	&dev_attr_immu_tsb_hits_ctxnon0_4mb_tte.attr,
+	&dev_attr_immu_tsb_ticks_ctxnon0_4mb_tte.attr,
+	&dev_attr_immu_tsb_hits_ctxnon0_256mb_tte.attr,
+	&dev_attr_immu_tsb_ticks_ctxnon0_256mb_tte.attr,
+	&dev_attr_dmmu_tsb_hits_ctx0_8k_tte.attr,
+	&dev_attr_dmmu_tsb_ticks_ctx0_8k_tte.attr,
+	&dev_attr_dmmu_tsb_hits_ctx0_64k_tte.attr,
+	&dev_attr_dmmu_tsb_ticks_ctx0_64k_tte.attr,
+	&dev_attr_dmmu_tsb_hits_ctx0_4mb_tte.attr,
+	&dev_attr_dmmu_tsb_ticks_ctx0_4mb_tte.attr,
+	&dev_attr_dmmu_tsb_hits_ctx0_256mb_tte.attr,
+	&dev_attr_dmmu_tsb_ticks_ctx0_256mb_tte.attr,
+	&dev_attr_dmmu_tsb_hits_ctxnon0_8k_tte.attr,
+	&dev_attr_dmmu_tsb_ticks_ctxnon0_8k_tte.attr,
+	&dev_attr_dmmu_tsb_hits_ctxnon0_64k_tte.attr,
+	&dev_attr_dmmu_tsb_ticks_ctxnon0_64k_tte.attr,
+	&dev_attr_dmmu_tsb_hits_ctxnon0_4mb_tte.attr,
+	&dev_attr_dmmu_tsb_ticks_ctxnon0_4mb_tte.attr,
+	&dev_attr_dmmu_tsb_hits_ctxnon0_256mb_tte.attr,
+	&dev_attr_dmmu_tsb_ticks_ctxnon0_256mb_tte.attr,
 	NULL,
 };
 
@@ -139,15 +139,15 @@ static unsigned long write_mmustat_enable(unsigned long val)
 	return sun4v_mmustat_conf(ra, &orig_ra);
 }
 
-static ssize_t show_mmustat_enable(struct sys_device *s,
-				struct sysdev_attribute *attr, char *buf)
+static ssize_t show_mmustat_enable(struct device *s,
+				struct device_attribute *attr, char *buf)
 {
 	unsigned long val = run_on_cpu(s->id, read_mmustat_enable, 0);
 	return sprintf(buf, "%lx\n", val);
 }
 
-static ssize_t store_mmustat_enable(struct sys_device *s,
-			struct sysdev_attribute *attr, const char *buf,
+static ssize_t store_mmustat_enable(struct device *s,
+			struct device_attribute *attr, const char *buf,
 			size_t count)
 {
 	unsigned long val, err;
@@ -163,39 +163,39 @@ static ssize_t store_mmustat_enable(struct sys_device *s,
 	return count;
 }
 
-static SYSDEV_ATTR(mmustat_enable, 0644, show_mmustat_enable, store_mmustat_enable);
+static DEVICE_ATTR(mmustat_enable, 0644, show_mmustat_enable, store_mmustat_enable);
 
 static int mmu_stats_supported;
 
-static int register_mmu_stats(struct sys_device *s)
+static int register_mmu_stats(struct device *s)
 {
 	if (!mmu_stats_supported)
 		return 0;
-	sysdev_create_file(s, &attr_mmustat_enable);
+	device_create_file(s, &dev_attr_mmustat_enable);
 	return sysfs_create_group(&s->kobj, &mmu_stat_group);
 }
 
 #ifdef CONFIG_HOTPLUG_CPU
-static void unregister_mmu_stats(struct sys_device *s)
+static void unregister_mmu_stats(struct device *s)
 {
 	if (!mmu_stats_supported)
 		return;
 	sysfs_remove_group(&s->kobj, &mmu_stat_group);
-	sysdev_remove_file(s, &attr_mmustat_enable);
+	device_remove_file(s, &dev_attr_mmustat_enable);
 }
 #endif
 
 #define SHOW_CPUDATA_ULONG_NAME(NAME, MEMBER) \
-static ssize_t show_##NAME(struct sys_device *dev, \
-		struct sysdev_attribute *attr, char *buf) \
+static ssize_t show_##NAME(struct device *dev, \
+		struct device_attribute *attr, char *buf) \
 { \
 	cpuinfo_sparc *c = &cpu_data(dev->id); \
 	return sprintf(buf, "%lu\n", c->MEMBER); \
 }
 
 #define SHOW_CPUDATA_UINT_NAME(NAME, MEMBER) \
-static ssize_t show_##NAME(struct sys_device *dev, \
-		struct sysdev_attribute *attr, char *buf) \
+static ssize_t show_##NAME(struct device *dev, \
+		struct device_attribute *attr, char *buf) \
 { \
 	cpuinfo_sparc *c = &cpu_data(dev->id); \
 	return sprintf(buf, "%u\n", c->MEMBER); \
@@ -209,14 +209,14 @@ SHOW_CPUDATA_UINT_NAME(l1_icache_line_size, icache_line_size);
 SHOW_CPUDATA_UINT_NAME(l2_cache_size, ecache_size);
 SHOW_CPUDATA_UINT_NAME(l2_cache_line_size, ecache_line_size);
 
-static struct sysdev_attribute cpu_core_attrs[] = {
-	_SYSDEV_ATTR(clock_tick,          0444, show_clock_tick, NULL),
-	_SYSDEV_ATTR(l1_dcache_size,      0444, show_l1_dcache_size, NULL),
-	_SYSDEV_ATTR(l1_dcache_line_size, 0444, show_l1_dcache_line_size, NULL),
-	_SYSDEV_ATTR(l1_icache_size,      0444, show_l1_icache_size, NULL),
-	_SYSDEV_ATTR(l1_icache_line_size, 0444, show_l1_icache_line_size, NULL),
-	_SYSDEV_ATTR(l2_cache_size,       0444, show_l2_cache_size, NULL),
-	_SYSDEV_ATTR(l2_cache_line_size,  0444, show_l2_cache_line_size, NULL),
+static struct device_attribute cpu_core_attrs[] = {
+	__ATTR(clock_tick,          0444, show_clock_tick, NULL),
+	__ATTR(l1_dcache_size,      0444, show_l1_dcache_size, NULL),
+	__ATTR(l1_dcache_line_size, 0444, show_l1_dcache_line_size, NULL),
+	__ATTR(l1_icache_size,      0444, show_l1_icache_size, NULL),
+	__ATTR(l1_icache_line_size, 0444, show_l1_icache_line_size, NULL),
+	__ATTR(l2_cache_size,       0444, show_l2_cache_size, NULL),
+	__ATTR(l2_cache_line_size,  0444, show_l2_cache_line_size, NULL),
 };
 
 static DEFINE_PER_CPU(struct cpu, cpu_devices);
@@ -224,11 +224,11 @@ static DEFINE_PER_CPU(struct cpu, cpu_devices);
 static void register_cpu_online(unsigned int cpu)
 {
 	struct cpu *c = &per_cpu(cpu_devices, cpu);
-	struct sys_device *s = &c->sysdev;
+	struct device *s = &c->dev;
 	int i;
 
 	for (i = 0; i < ARRAY_SIZE(cpu_core_attrs); i++)
-		sysdev_create_file(s, &cpu_core_attrs[i]);
+		device_create_file(s, &cpu_core_attrs[i]);
 
 	register_mmu_stats(s);
 }
@@ -237,12 +237,12 @@ static void register_cpu_online(unsigned int cpu)
 static void unregister_cpu_online(unsigned int cpu)
 {
 	struct cpu *c = &per_cpu(cpu_devices, cpu);
-	struct sys_device *s = &c->sysdev;
+	struct device *s = &c->dev;
 	int i;
 
 	unregister_mmu_stats(s);
 	for (i = 0; i < ARRAY_SIZE(cpu_core_attrs); i++)
-		sysdev_remove_file(s, &cpu_core_attrs[i]);
+		device_remove_file(s, &cpu_core_attrs[i]);
 }
 #endif
 
diff --git a/arch/tile/kernel/sysfs.c b/arch/tile/kernel/sysfs.c
index b671a86f4515..e7ce2a5161b8 100644
--- a/arch/tile/kernel/sysfs.c
+++ b/arch/tile/kernel/sysfs.c
@@ -14,7 +14,7 @@
  * /sys entry support.
  */
 
-#include <linux/sysdev.h>
+#include <linux/device.h>
 #include <linux/cpu.h>
 #include <linux/slab.h>
 #include <linux/smp.h>
@@ -31,55 +31,55 @@ static ssize_t get_hv_confstr(char *page, int query)
 	return n;
 }
 
-static ssize_t chip_width_show(struct sysdev_class *dev,
-			       struct sysdev_class_attribute *attr,
+static ssize_t chip_width_show(struct device *dev,
+			       struct device_attribute *attr,
 			       char *page)
 {
 	return sprintf(page, "%u\n", smp_width);
 }
-static SYSDEV_CLASS_ATTR(chip_width, 0444, chip_width_show, NULL);
+static DEVICE_ATTR(chip_width, 0444, chip_width_show, NULL);
 
-static ssize_t chip_height_show(struct sysdev_class *dev,
-				struct sysdev_class_attribute *attr,
+static ssize_t chip_height_show(struct device *dev,
+				struct device_attribute *attr,
 				char *page)
 {
 	return sprintf(page, "%u\n", smp_height);
 }
-static SYSDEV_CLASS_ATTR(chip_height, 0444, chip_height_show, NULL);
+static DEVICE_ATTR(chip_height, 0444, chip_height_show, NULL);
 
-static ssize_t chip_serial_show(struct sysdev_class *dev,
-				struct sysdev_class_attribute *attr,
+static ssize_t chip_serial_show(struct device *dev,
+				struct device_attribute *attr,
 				char *page)
 {
 	return get_hv_confstr(page, HV_CONFSTR_CHIP_SERIAL_NUM);
 }
-static SYSDEV_CLASS_ATTR(chip_serial, 0444, chip_serial_show, NULL);
+static DEVICE_ATTR(chip_serial, 0444, chip_serial_show, NULL);
 
-static ssize_t chip_revision_show(struct sysdev_class *dev,
-				  struct sysdev_class_attribute *attr,
+static ssize_t chip_revision_show(struct device *dev,
+				  struct device_attribute *attr,
 				  char *page)
 {
 	return get_hv_confstr(page, HV_CONFSTR_CHIP_REV);
 }
-static SYSDEV_CLASS_ATTR(chip_revision, 0444, chip_revision_show, NULL);
+static DEVICE_ATTR(chip_revision, 0444, chip_revision_show, NULL);
 
 
-static ssize_t type_show(struct sysdev_class *dev,
-			    struct sysdev_class_attribute *attr,
+static ssize_t type_show(struct device *dev,
+			    struct device_attribute *attr,
 			    char *page)
 {
 	return sprintf(page, "tilera\n");
 }
-static SYSDEV_CLASS_ATTR(type, 0444, type_show, NULL);
+static DEVICE_ATTR(type, 0444, type_show, NULL);
 
 #define HV_CONF_ATTR(name, conf)					\
-	static ssize_t name ## _show(struct sysdev_class *dev,		\
-				     struct sysdev_class_attribute *attr, \
+	static ssize_t name ## _show(struct device *dev,		\
+				     struct device_attribute *attr, \
 				     char *page)			\
 	{								\
 		return get_hv_confstr(page, conf);			\
 	}								\
-	static SYSDEV_CLASS_ATTR(name, 0444, name ## _show, NULL);
+	static DEVICE_ATTR(name, 0444, name ## _show, NULL);
 
 HV_CONF_ATTR(version,		HV_CONFSTR_HV_SW_VER)
 HV_CONF_ATTR(config_version,	HV_CONFSTR_HV_CONFIG_VER)
@@ -95,15 +95,15 @@ HV_CONF_ATTR(mezz_description,	HV_CONFSTR_MEZZ_DESC)
 HV_CONF_ATTR(switch_control,	HV_CONFSTR_SWITCH_CONTROL)
 
 static struct attribute *board_attrs[] = {
-	&attr_board_part.attr,
-	&attr_board_serial.attr,
-	&attr_board_revision.attr,
-	&attr_board_description.attr,
-	&attr_mezz_part.attr,
-	&attr_mezz_serial.attr,
-	&attr_mezz_revision.attr,
-	&attr_mezz_description.attr,
-	&attr_switch_control.attr,
+	&dev_attr_board_part.attr,
+	&dev_attr_board_serial.attr,
+	&dev_attr_board_revision.attr,
+	&dev_attr_board_description.attr,
+	&dev_attr_mezz_part.attr,
+	&dev_attr_mezz_serial.attr,
+	&dev_attr_mezz_revision.attr,
+	&dev_attr_mezz_description.attr,
+	&dev_attr_switch_control.attr,
 	NULL
 };
 
@@ -150,12 +150,11 @@ hvconfig_bin_read(struct file *filp, struct kobject *kobj,
 
 static int __init create_sysfs_entries(void)
 {
-	struct sysdev_class *cls = &cpu_sysdev_class;
 	int err = 0;
 
 #define create_cpu_attr(name)						\
 	if (!err)							\
-		err = sysfs_create_file(&cls->kset.kobj, &attr_##name.attr);
+		err = device_create_file(cpu_subsys.dev_root, &dev_attr_##name);
 	create_cpu_attr(chip_width);
 	create_cpu_attr(chip_height);
 	create_cpu_attr(chip_serial);
@@ -163,7 +162,7 @@ static int __init create_sysfs_entries(void)
 
 #define create_hv_attr(name)						\
 	if (!err)							\
-		err = sysfs_create_file(hypervisor_kobj, &attr_##name.attr);
+		err = sysfs_create_file(hypervisor_kobj, &dev_attr_##name);
 	create_hv_attr(type);
 	create_hv_attr(version);
 	create_hv_attr(config_version);
diff --git a/arch/x86/include/asm/mce.h b/arch/x86/include/asm/mce.h
index c9321f34e55b..0b05fb49c560 100644
--- a/arch/x86/include/asm/mce.h
+++ b/arch/x86/include/asm/mce.h
@@ -149,7 +149,7 @@ static inline void enable_p5_mce(void) {}
 
 void mce_setup(struct mce *m);
 void mce_log(struct mce *m);
-DECLARE_PER_CPU(struct sys_device, mce_sysdev);
+DECLARE_PER_CPU(struct device, mce_device);
 
 /*
  * Maximum banks number.
diff --git a/arch/x86/kernel/cpu/intel_cacheinfo.c b/arch/x86/kernel/cpu/intel_cacheinfo.c
index a3b0811693c9..6b45e5e7a901 100644
--- a/arch/x86/kernel/cpu/intel_cacheinfo.c
+++ b/arch/x86/kernel/cpu/intel_cacheinfo.c
@@ -844,8 +844,7 @@ static int __cpuinit detect_cache_attributes(unsigned int cpu)
 
 #include <linux/kobject.h>
 #include <linux/sysfs.h>
-
-extern struct sysdev_class cpu_sysdev_class; /* from drivers/base/cpu.c */
+#include <linux/cpu.h>
 
 /* pointer to kobject for cpuX/cache */
 static DEFINE_PER_CPU(struct kobject *, ici_cache_kobject);
@@ -1073,9 +1072,9 @@ err_out:
 static DECLARE_BITMAP(cache_dev_map, NR_CPUS);
 
 /* Add/Remove cache interface for CPU device */
-static int __cpuinit cache_add_dev(struct sys_device * sys_dev)
+static int __cpuinit cache_add_dev(struct device *dev)
 {
-	unsigned int cpu = sys_dev->id;
+	unsigned int cpu = dev->id;
 	unsigned long i, j;
 	struct _index_kobject *this_object;
 	struct _cpuid4_info   *this_leaf;
@@ -1087,7 +1086,7 @@ static int __cpuinit cache_add_dev(struct sys_device * sys_dev)
 
 	retval = kobject_init_and_add(per_cpu(ici_cache_kobject, cpu),
 				      &ktype_percpu_entry,
-				      &sys_dev->kobj, "%s", "cache");
+				      &dev->kobj, "%s", "cache");
 	if (retval < 0) {
 		cpuid4_cache_sysfs_exit(cpu);
 		return retval;
@@ -1124,9 +1123,9 @@ static int __cpuinit cache_add_dev(struct sys_device * sys_dev)
 	return 0;
 }
 
-static void __cpuinit cache_remove_dev(struct sys_device * sys_dev)
+static void __cpuinit cache_remove_dev(struct device *dev)
 {
-	unsigned int cpu = sys_dev->id;
+	unsigned int cpu = dev->id;
 	unsigned long i;
 
 	if (per_cpu(ici_cpuid4_info, cpu) == NULL)
@@ -1145,17 +1144,17 @@ static int __cpuinit cacheinfo_cpu_callback(struct notifier_block *nfb,
 					unsigned long action, void *hcpu)
 {
 	unsigned int cpu = (unsigned long)hcpu;
-	struct sys_device *sys_dev;
+	struct device *dev;
 
-	sys_dev = get_cpu_sysdev(cpu);
+	dev = get_cpu_device(cpu);
 	switch (action) {
 	case CPU_ONLINE:
 	case CPU_ONLINE_FROZEN:
-		cache_add_dev(sys_dev);
+		cache_add_dev(dev);
 		break;
 	case CPU_DEAD:
 	case CPU_DEAD_FROZEN:
-		cache_remove_dev(sys_dev);
+		cache_remove_dev(dev);
 		break;
 	}
 	return NOTIFY_OK;
@@ -1174,9 +1173,9 @@ static int __cpuinit cache_sysfs_init(void)
 
 	for_each_online_cpu(i) {
 		int err;
-		struct sys_device *sys_dev = get_cpu_sysdev(i);
+		struct device *dev = get_cpu_device(i);
 
-		err = cache_add_dev(sys_dev);
+		err = cache_add_dev(dev);
 		if (err)
 			return err;
 	}
diff --git a/arch/x86/kernel/cpu/mcheck/mce-internal.h b/arch/x86/kernel/cpu/mcheck/mce-internal.h
index fefcc69ee8b5..ed44c8a65858 100644
--- a/arch/x86/kernel/cpu/mcheck/mce-internal.h
+++ b/arch/x86/kernel/cpu/mcheck/mce-internal.h
@@ -1,4 +1,4 @@
-#include <linux/sysdev.h>
+#include <linux/device.h>
 #include <asm/mce.h>
 
 enum severity_level {
@@ -17,7 +17,7 @@ enum severity_level {
 struct mce_bank {
 	u64			ctl;			/* subevents to enable */
 	unsigned char init;				/* initialise bank? */
-	struct sysdev_attribute attr;			/* sysdev attribute */
+	struct device_attribute attr;			/* device attribute */
 	char			attrname[ATTR_LEN];	/* attribute name */
 };
 
diff --git a/arch/x86/kernel/cpu/mcheck/mce.c b/arch/x86/kernel/cpu/mcheck/mce.c
index 362056aefeb4..0156c6f85d7b 100644
--- a/arch/x86/kernel/cpu/mcheck/mce.c
+++ b/arch/x86/kernel/cpu/mcheck/mce.c
@@ -19,7 +19,7 @@
 #include <linux/kernel.h>
 #include <linux/percpu.h>
 #include <linux/string.h>
-#include <linux/sysdev.h>
+#include <linux/device.h>
 #include <linux/syscore_ops.h>
 #include <linux/delay.h>
 #include <linux/ctype.h>
@@ -1751,7 +1751,7 @@ static struct syscore_ops mce_syscore_ops = {
 };
 
 /*
- * mce_sysdev: Sysfs support
+ * mce_device: Sysfs support
  */
 
 static void mce_cpu_restart(void *data)
@@ -1787,27 +1787,28 @@ static void mce_enable_ce(void *all)
 		__mcheck_cpu_init_timer();
 }
 
-static struct sysdev_class mce_sysdev_class = {
+static struct bus_type mce_subsys = {
 	.name		= "machinecheck",
+	.dev_name	= "machinecheck",
 };
 
-DEFINE_PER_CPU(struct sys_device, mce_sysdev);
+DEFINE_PER_CPU(struct device, mce_device);
 
 __cpuinitdata
 void (*threshold_cpu_callback)(unsigned long action, unsigned int cpu);
 
-static inline struct mce_bank *attr_to_bank(struct sysdev_attribute *attr)
+static inline struct mce_bank *attr_to_bank(struct device_attribute *attr)
 {
 	return container_of(attr, struct mce_bank, attr);
 }
 
-static ssize_t show_bank(struct sys_device *s, struct sysdev_attribute *attr,
+static ssize_t show_bank(struct device *s, struct device_attribute *attr,
 			 char *buf)
 {
 	return sprintf(buf, "%llx\n", attr_to_bank(attr)->ctl);
 }
 
-static ssize_t set_bank(struct sys_device *s, struct sysdev_attribute *attr,
+static ssize_t set_bank(struct device *s, struct device_attribute *attr,
 			const char *buf, size_t size)
 {
 	u64 new;
@@ -1822,14 +1823,14 @@ static ssize_t set_bank(struct sys_device *s, struct sysdev_attribute *attr,
 }
 
 static ssize_t
-show_trigger(struct sys_device *s, struct sysdev_attribute *attr, char *buf)
+show_trigger(struct device *s, struct device_attribute *attr, char *buf)
 {
 	strcpy(buf, mce_helper);
 	strcat(buf, "\n");
 	return strlen(mce_helper) + 1;
 }
 
-static ssize_t set_trigger(struct sys_device *s, struct sysdev_attribute *attr,
+static ssize_t set_trigger(struct device *s, struct device_attribute *attr,
 				const char *buf, size_t siz)
 {
 	char *p;
@@ -1844,8 +1845,8 @@ static ssize_t set_trigger(struct sys_device *s, struct sysdev_attribute *attr,
 	return strlen(mce_helper) + !!p;
 }
 
-static ssize_t set_ignore_ce(struct sys_device *s,
-			     struct sysdev_attribute *attr,
+static ssize_t set_ignore_ce(struct device *s,
+			     struct device_attribute *attr,
 			     const char *buf, size_t size)
 {
 	u64 new;
@@ -1868,8 +1869,8 @@ static ssize_t set_ignore_ce(struct sys_device *s,
 	return size;
 }
 
-static ssize_t set_cmci_disabled(struct sys_device *s,
-				 struct sysdev_attribute *attr,
+static ssize_t set_cmci_disabled(struct device *s,
+				 struct device_attribute *attr,
 				 const char *buf, size_t size)
 {
 	u64 new;
@@ -1891,108 +1892,107 @@ static ssize_t set_cmci_disabled(struct sys_device *s,
 	return size;
 }
 
-static ssize_t store_int_with_restart(struct sys_device *s,
-				      struct sysdev_attribute *attr,
+static ssize_t store_int_with_restart(struct device *s,
+				      struct device_attribute *attr,
 				      const char *buf, size_t size)
 {
-	ssize_t ret = sysdev_store_int(s, attr, buf, size);
+	ssize_t ret = device_store_int(s, attr, buf, size);
 	mce_restart();
 	return ret;
 }
 
-static SYSDEV_ATTR(trigger, 0644, show_trigger, set_trigger);
-static SYSDEV_INT_ATTR(tolerant, 0644, tolerant);
-static SYSDEV_INT_ATTR(monarch_timeout, 0644, monarch_timeout);
-static SYSDEV_INT_ATTR(dont_log_ce, 0644, mce_dont_log_ce);
+static DEVICE_ATTR(trigger, 0644, show_trigger, set_trigger);
+static DEVICE_INT_ATTR(tolerant, 0644, tolerant);
+static DEVICE_INT_ATTR(monarch_timeout, 0644, monarch_timeout);
+static DEVICE_INT_ATTR(dont_log_ce, 0644, mce_dont_log_ce);
 
-static struct sysdev_ext_attribute attr_check_interval = {
-	_SYSDEV_ATTR(check_interval, 0644, sysdev_show_int,
-		     store_int_with_restart),
+static struct dev_ext_attribute dev_attr_check_interval = {
+	__ATTR(check_interval, 0644, device_show_int, store_int_with_restart),
 	&check_interval
 };
 
-static struct sysdev_ext_attribute attr_ignore_ce = {
-	_SYSDEV_ATTR(ignore_ce, 0644, sysdev_show_int, set_ignore_ce),
+static struct dev_ext_attribute dev_attr_ignore_ce = {
+	__ATTR(ignore_ce, 0644, device_show_int, set_ignore_ce),
 	&mce_ignore_ce
 };
 
-static struct sysdev_ext_attribute attr_cmci_disabled = {
-	_SYSDEV_ATTR(cmci_disabled, 0644, sysdev_show_int, set_cmci_disabled),
+static struct dev_ext_attribute dev_attr_cmci_disabled = {
+	__ATTR(cmci_disabled, 0644, device_show_int, set_cmci_disabled),
 	&mce_cmci_disabled
 };
 
-static struct sysdev_attribute *mce_sysdev_attrs[] = {
-	&attr_tolerant.attr,
-	&attr_check_interval.attr,
-	&attr_trigger,
-	&attr_monarch_timeout.attr,
-	&attr_dont_log_ce.attr,
-	&attr_ignore_ce.attr,
-	&attr_cmci_disabled.attr,
+static struct device_attribute *mce_device_attrs[] = {
+	&dev_attr_tolerant.attr,
+	&dev_attr_check_interval.attr,
+	&dev_attr_trigger,
+	&dev_attr_monarch_timeout.attr,
+	&dev_attr_dont_log_ce.attr,
+	&dev_attr_ignore_ce.attr,
+	&dev_attr_cmci_disabled.attr,
 	NULL
 };
 
-static cpumask_var_t mce_sysdev_initialized;
+static cpumask_var_t mce_device_initialized;
 
-/* Per cpu sysdev init. All of the cpus still share the same ctrl bank: */
-static __cpuinit int mce_sysdev_create(unsigned int cpu)
+/* Per cpu device init. All of the cpus still share the same ctrl bank: */
+static __cpuinit int mce_device_create(unsigned int cpu)
 {
-	struct sys_device *sysdev = &per_cpu(mce_sysdev, cpu);
+	struct device *dev = &per_cpu(mce_device, cpu);
 	int err;
 	int i, j;
 
 	if (!mce_available(&boot_cpu_data))
 		return -EIO;
 
-	memset(&sysdev->kobj, 0, sizeof(struct kobject));
-	sysdev->id  = cpu;
-	sysdev->cls = &mce_sysdev_class;
+	memset(&dev->kobj, 0, sizeof(struct kobject));
+	dev->id  = cpu;
+	dev->bus = &mce_subsys;
 
-	err = sysdev_register(sysdev);
+	err = device_register(dev);
 	if (err)
 		return err;
 
-	for (i = 0; mce_sysdev_attrs[i]; i++) {
-		err = sysdev_create_file(sysdev, mce_sysdev_attrs[i]);
+	for (i = 0; mce_device_attrs[i]; i++) {
+		err = device_create_file(dev, mce_device_attrs[i]);
 		if (err)
 			goto error;
 	}
 	for (j = 0; j < banks; j++) {
-		err = sysdev_create_file(sysdev, &mce_banks[j].attr);
+		err = device_create_file(dev, &mce_banks[j].attr);
 		if (err)
 			goto error2;
 	}
-	cpumask_set_cpu(cpu, mce_sysdev_initialized);
+	cpumask_set_cpu(cpu, mce_device_initialized);
 
 	return 0;
 error2:
 	while (--j >= 0)
-		sysdev_remove_file(sysdev, &mce_banks[j].attr);
+		device_remove_file(dev, &mce_banks[j].attr);
 error:
 	while (--i >= 0)
-		sysdev_remove_file(sysdev, mce_sysdev_attrs[i]);
+		device_remove_file(dev, mce_device_attrs[i]);
 
-	sysdev_unregister(sysdev);
+	device_unregister(dev);
 
 	return err;
 }
 
-static __cpuinit void mce_sysdev_remove(unsigned int cpu)
+static __cpuinit void mce_device_remove(unsigned int cpu)
 {
-	struct sys_device *sysdev = &per_cpu(mce_sysdev, cpu);
+	struct device *dev = &per_cpu(mce_device, cpu);
 	int i;
 
-	if (!cpumask_test_cpu(cpu, mce_sysdev_initialized))
+	if (!cpumask_test_cpu(cpu, mce_device_initialized))
 		return;
 
-	for (i = 0; mce_sysdev_attrs[i]; i++)
-		sysdev_remove_file(sysdev, mce_sysdev_attrs[i]);
+	for (i = 0; mce_device_attrs[i]; i++)
+		device_remove_file(dev, mce_device_attrs[i]);
 
 	for (i = 0; i < banks; i++)
-		sysdev_remove_file(sysdev, &mce_banks[i].attr);
+		device_remove_file(dev, &mce_banks[i].attr);
 
-	sysdev_unregister(sysdev);
-	cpumask_clear_cpu(cpu, mce_sysdev_initialized);
+	device_unregister(dev);
+	cpumask_clear_cpu(cpu, mce_device_initialized);
 }
 
 /* Make sure there are no machine checks on offlined CPUs. */
@@ -2042,7 +2042,7 @@ mce_cpu_callback(struct notifier_block *nfb, unsigned long action, void *hcpu)
 	switch (action) {
 	case CPU_ONLINE:
 	case CPU_ONLINE_FROZEN:
-		mce_sysdev_create(cpu);
+		mce_device_create(cpu);
 		if (threshold_cpu_callback)
 			threshold_cpu_callback(action, cpu);
 		break;
@@ -2050,7 +2050,7 @@ mce_cpu_callback(struct notifier_block *nfb, unsigned long action, void *hcpu)
 	case CPU_DEAD_FROZEN:
 		if (threshold_cpu_callback)
 			threshold_cpu_callback(action, cpu);
-		mce_sysdev_remove(cpu);
+		mce_device_remove(cpu);
 		break;
 	case CPU_DOWN_PREPARE:
 	case CPU_DOWN_PREPARE_FROZEN:
@@ -2084,7 +2084,7 @@ static __init void mce_init_banks(void)
 
 	for (i = 0; i < banks; i++) {
 		struct mce_bank *b = &mce_banks[i];
-		struct sysdev_attribute *a = &b->attr;
+		struct device_attribute *a = &b->attr;
 
 		sysfs_attr_init(&a->attr);
 		a->attr.name	= b->attrname;
@@ -2104,16 +2104,16 @@ static __init int mcheck_init_device(void)
 	if (!mce_available(&boot_cpu_data))
 		return -EIO;
 
-	zalloc_cpumask_var(&mce_sysdev_initialized, GFP_KERNEL);
+	zalloc_cpumask_var(&mce_device_initialized, GFP_KERNEL);
 
 	mce_init_banks();
 
-	err = sysdev_class_register(&mce_sysdev_class);
+	err = subsys_system_register(&mce_subsys, NULL);
 	if (err)
 		return err;
 
 	for_each_online_cpu(i) {
-		err = mce_sysdev_create(i);
+		err = mce_device_create(i);
 		if (err)
 			return err;
 	}
diff --git a/arch/x86/kernel/cpu/mcheck/mce_amd.c b/arch/x86/kernel/cpu/mcheck/mce_amd.c
index f5474218cffe..56d2aa1acd55 100644
--- a/arch/x86/kernel/cpu/mcheck/mce_amd.c
+++ b/arch/x86/kernel/cpu/mcheck/mce_amd.c
@@ -17,7 +17,6 @@
 #include <linux/notifier.h>
 #include <linux/kobject.h>
 #include <linux/percpu.h>
-#include <linux/sysdev.h>
 #include <linux/errno.h>
 #include <linux/sched.h>
 #include <linux/sysfs.h>
@@ -548,7 +547,7 @@ static __cpuinit int threshold_create_bank(unsigned int cpu, unsigned int bank)
 		if (!b)
 			goto out;
 
-		err = sysfs_create_link(&per_cpu(mce_sysdev, cpu).kobj,
+		err = sysfs_create_link(&per_cpu(mce_device, cpu).kobj,
 					b->kobj, name);
 		if (err)
 			goto out;
@@ -571,7 +570,7 @@ static __cpuinit int threshold_create_bank(unsigned int cpu, unsigned int bank)
 		goto out;
 	}
 
-	b->kobj = kobject_create_and_add(name, &per_cpu(mce_sysdev, cpu).kobj);
+	b->kobj = kobject_create_and_add(name, &per_cpu(mce_device, cpu).kobj);
 	if (!b->kobj)
 		goto out_free;
 
@@ -591,7 +590,7 @@ static __cpuinit int threshold_create_bank(unsigned int cpu, unsigned int bank)
 		if (i == cpu)
 			continue;
 
-		err = sysfs_create_link(&per_cpu(mce_sysdev, i).kobj,
+		err = sysfs_create_link(&per_cpu(mce_device, i).kobj,
 					b->kobj, name);
 		if (err)
 			goto out;
@@ -669,7 +668,7 @@ static void threshold_remove_bank(unsigned int cpu, int bank)
 #ifdef CONFIG_SMP
 	/* sibling symlink */
 	if (shared_bank[bank] && b->blocks->cpu != cpu) {
-		sysfs_remove_link(&per_cpu(mce_sysdev, cpu).kobj, name);
+		sysfs_remove_link(&per_cpu(mce_device, cpu).kobj, name);
 		per_cpu(threshold_banks, cpu)[bank] = NULL;
 
 		return;
@@ -681,7 +680,7 @@ static void threshold_remove_bank(unsigned int cpu, int bank)
 		if (i == cpu)
 			continue;
 
-		sysfs_remove_link(&per_cpu(mce_sysdev, i).kobj, name);
+		sysfs_remove_link(&per_cpu(mce_device, i).kobj, name);
 		per_cpu(threshold_banks, i)[bank] = NULL;
 	}
 
diff --git a/arch/x86/kernel/cpu/mcheck/therm_throt.c b/arch/x86/kernel/cpu/mcheck/therm_throt.c
index 787e06c84ea6..59e3f6ed265f 100644
--- a/arch/x86/kernel/cpu/mcheck/therm_throt.c
+++ b/arch/x86/kernel/cpu/mcheck/therm_throt.c
@@ -19,7 +19,6 @@
 #include <linux/kernel.h>
 #include <linux/percpu.h>
 #include <linux/export.h>
-#include <linux/sysdev.h>
 #include <linux/types.h>
 #include <linux/init.h>
 #include <linux/smp.h>
@@ -69,16 +68,16 @@ static atomic_t therm_throt_en	= ATOMIC_INIT(0);
 static u32 lvtthmr_init __read_mostly;
 
 #ifdef CONFIG_SYSFS
-#define define_therm_throt_sysdev_one_ro(_name)				\
-	static SYSDEV_ATTR(_name, 0444,					\
-			   therm_throt_sysdev_show_##_name,		\
+#define define_therm_throt_device_one_ro(_name)				\
+	static DEVICE_ATTR(_name, 0444,					\
+			   therm_throt_device_show_##_name,		\
 				   NULL)				\
 
-#define define_therm_throt_sysdev_show_func(event, name)		\
+#define define_therm_throt_device_show_func(event, name)		\
 									\
-static ssize_t therm_throt_sysdev_show_##event##_##name(		\
-			struct sys_device *dev,				\
-			struct sysdev_attribute *attr,			\
+static ssize_t therm_throt_device_show_##event##_##name(		\
+			struct device *dev,				\
+			struct device_attribute *attr,			\
 			char *buf)					\
 {									\
 	unsigned int cpu = dev->id;					\
@@ -95,20 +94,20 @@ static ssize_t therm_throt_sysdev_show_##event##_##name(		\
 	return ret;							\
 }
 
-define_therm_throt_sysdev_show_func(core_throttle, count);
-define_therm_throt_sysdev_one_ro(core_throttle_count);
+define_therm_throt_device_show_func(core_throttle, count);
+define_therm_throt_device_one_ro(core_throttle_count);
 
-define_therm_throt_sysdev_show_func(core_power_limit, count);
-define_therm_throt_sysdev_one_ro(core_power_limit_count);
+define_therm_throt_device_show_func(core_power_limit, count);
+define_therm_throt_device_one_ro(core_power_limit_count);
 
-define_therm_throt_sysdev_show_func(package_throttle, count);
-define_therm_throt_sysdev_one_ro(package_throttle_count);
+define_therm_throt_device_show_func(package_throttle, count);
+define_therm_throt_device_one_ro(package_throttle_count);
 
-define_therm_throt_sysdev_show_func(package_power_limit, count);
-define_therm_throt_sysdev_one_ro(package_power_limit_count);
+define_therm_throt_device_show_func(package_power_limit, count);
+define_therm_throt_device_one_ro(package_power_limit_count);
 
 static struct attribute *thermal_throttle_attrs[] = {
-	&attr_core_throttle_count.attr,
+	&dev_attr_core_throttle_count.attr,
 	NULL
 };
 
@@ -223,36 +222,36 @@ static int thresh_event_valid(int event)
 
 #ifdef CONFIG_SYSFS
 /* Add/Remove thermal_throttle interface for CPU device: */
-static __cpuinit int thermal_throttle_add_dev(struct sys_device *sys_dev,
+static __cpuinit int thermal_throttle_add_dev(struct device *dev,
 				unsigned int cpu)
 {
 	int err;
 	struct cpuinfo_x86 *c = &cpu_data(cpu);
 
-	err = sysfs_create_group(&sys_dev->kobj, &thermal_attr_group);
+	err = sysfs_create_group(&dev->kobj, &thermal_attr_group);
 	if (err)
 		return err;
 
 	if (cpu_has(c, X86_FEATURE_PLN))
-		err = sysfs_add_file_to_group(&sys_dev->kobj,
-					      &attr_core_power_limit_count.attr,
+		err = sysfs_add_file_to_group(&dev->kobj,
+					      &dev_attr_core_power_limit_count.attr,
 					      thermal_attr_group.name);
 	if (cpu_has(c, X86_FEATURE_PTS)) {
-		err = sysfs_add_file_to_group(&sys_dev->kobj,
-					      &attr_package_throttle_count.attr,
+		err = sysfs_add_file_to_group(&dev->kobj,
+					      &dev_attr_package_throttle_count.attr,
 					      thermal_attr_group.name);
 		if (cpu_has(c, X86_FEATURE_PLN))
-			err = sysfs_add_file_to_group(&sys_dev->kobj,
-					&attr_package_power_limit_count.attr,
+			err = sysfs_add_file_to_group(&dev->kobj,
+					&dev_attr_package_power_limit_count.attr,
 					thermal_attr_group.name);
 	}
 
 	return err;
 }
 
-static __cpuinit void thermal_throttle_remove_dev(struct sys_device *sys_dev)
+static __cpuinit void thermal_throttle_remove_dev(struct device *dev)
 {
-	sysfs_remove_group(&sys_dev->kobj, &thermal_attr_group);
+	sysfs_remove_group(&dev->kobj, &thermal_attr_group);
 }
 
 /* Mutex protecting device creation against CPU hotplug: */
@@ -265,16 +264,16 @@ thermal_throttle_cpu_callback(struct notifier_block *nfb,
 			      void *hcpu)
 {
 	unsigned int cpu = (unsigned long)hcpu;
-	struct sys_device *sys_dev;
+	struct device *dev;
 	int err = 0;
 
-	sys_dev = get_cpu_sysdev(cpu);
+	dev = get_cpu_device(cpu);
 
 	switch (action) {
 	case CPU_UP_PREPARE:
 	case CPU_UP_PREPARE_FROZEN:
 		mutex_lock(&therm_cpu_lock);
-		err = thermal_throttle_add_dev(sys_dev, cpu);
+		err = thermal_throttle_add_dev(dev, cpu);
 		mutex_unlock(&therm_cpu_lock);
 		WARN_ON(err);
 		break;
@@ -283,7 +282,7 @@ thermal_throttle_cpu_callback(struct notifier_block *nfb,
 	case CPU_DEAD:
 	case CPU_DEAD_FROZEN:
 		mutex_lock(&therm_cpu_lock);
-		thermal_throttle_remove_dev(sys_dev);
+		thermal_throttle_remove_dev(dev);
 		mutex_unlock(&therm_cpu_lock);
 		break;
 	}
@@ -310,7 +309,7 @@ static __init int thermal_throttle_init_device(void)
 #endif
 	/* connect live CPUs to sysfs */
 	for_each_online_cpu(cpu) {
-		err = thermal_throttle_add_dev(get_cpu_sysdev(cpu), cpu);
+		err = thermal_throttle_add_dev(get_cpu_device(cpu), cpu);
 		WARN_ON(err);
 	}
 #ifdef CONFIG_HOTPLUG_CPU
diff --git a/arch/x86/kernel/microcode_core.c b/arch/x86/kernel/microcode_core.c
index f2d2a664e797..cf88f2a16473 100644
--- a/arch/x86/kernel/microcode_core.c
+++ b/arch/x86/kernel/microcode_core.c
@@ -292,8 +292,8 @@ static int reload_for_cpu(int cpu)
 	return err;
 }
 
-static ssize_t reload_store(struct sys_device *dev,
-			    struct sysdev_attribute *attr,
+static ssize_t reload_store(struct device *dev,
+			    struct device_attribute *attr,
 			    const char *buf, size_t size)
 {
 	unsigned long val;
@@ -318,30 +318,30 @@ static ssize_t reload_store(struct sys_device *dev,
 	return ret;
 }
 
-static ssize_t version_show(struct sys_device *dev,
-			struct sysdev_attribute *attr, char *buf)
+static ssize_t version_show(struct device *dev,
+			struct device_attribute *attr, char *buf)
 {
 	struct ucode_cpu_info *uci = ucode_cpu_info + dev->id;
 
 	return sprintf(buf, "0x%x\n", uci->cpu_sig.rev);
 }
 
-static ssize_t pf_show(struct sys_device *dev,
-			struct sysdev_attribute *attr, char *buf)
+static ssize_t pf_show(struct device *dev,
+			struct device_attribute *attr, char *buf)
 {
 	struct ucode_cpu_info *uci = ucode_cpu_info + dev->id;
 
 	return sprintf(buf, "0x%x\n", uci->cpu_sig.pf);
 }
 
-static SYSDEV_ATTR(reload, 0200, NULL, reload_store);
-static SYSDEV_ATTR(version, 0400, version_show, NULL);
-static SYSDEV_ATTR(processor_flags, 0400, pf_show, NULL);
+static DEVICE_ATTR(reload, 0200, NULL, reload_store);
+static DEVICE_ATTR(version, 0400, version_show, NULL);
+static DEVICE_ATTR(processor_flags, 0400, pf_show, NULL);
 
 static struct attribute *mc_default_attrs[] = {
-	&attr_reload.attr,
-	&attr_version.attr,
-	&attr_processor_flags.attr,
+	&dev_attr_reload.attr,
+	&dev_attr_version.attr,
+	&dev_attr_processor_flags.attr,
 	NULL
 };
 
@@ -405,43 +405,45 @@ static enum ucode_state microcode_update_cpu(int cpu)
 	return ustate;
 }
 
-static int mc_sysdev_add(struct sys_device *sys_dev)
+static int mc_device_add(struct device *dev, struct subsys_interface *sif)
 {
-	int err, cpu = sys_dev->id;
+	int err, cpu = dev->id;
 
 	if (!cpu_online(cpu))
 		return 0;
 
 	pr_debug("CPU%d added\n", cpu);
 
-	err = sysfs_create_group(&sys_dev->kobj, &mc_attr_group);
+	err = sysfs_create_group(&dev->kobj, &mc_attr_group);
 	if (err)
 		return err;
 
 	if (microcode_init_cpu(cpu) == UCODE_ERROR) {
-		sysfs_remove_group(&sys_dev->kobj, &mc_attr_group);
+		sysfs_remove_group(&dev->kobj, &mc_attr_group);
 		return -EINVAL;
 	}
 
 	return err;
 }
 
-static int mc_sysdev_remove(struct sys_device *sys_dev)
+static int mc_device_remove(struct device *dev, struct subsys_interface *sif)
 {
-	int cpu = sys_dev->id;
+	int cpu = dev->id;
 
 	if (!cpu_online(cpu))
 		return 0;
 
 	pr_debug("CPU%d removed\n", cpu);
 	microcode_fini_cpu(cpu);
-	sysfs_remove_group(&sys_dev->kobj, &mc_attr_group);
+	sysfs_remove_group(&dev->kobj, &mc_attr_group);
 	return 0;
 }
 
-static struct sysdev_driver mc_sysdev_driver = {
-	.add			= mc_sysdev_add,
-	.remove			= mc_sysdev_remove,
+static struct subsys_interface mc_cpu_interface = {
+	.name			= "microcode",
+	.subsys			= &cpu_subsys,
+	.add_dev		= mc_device_add,
+	.remove_dev		= mc_device_remove,
 };
 
 /**
@@ -464,9 +466,9 @@ static __cpuinit int
 mc_cpu_callback(struct notifier_block *nb, unsigned long action, void *hcpu)
 {
 	unsigned int cpu = (unsigned long)hcpu;
-	struct sys_device *sys_dev;
+	struct device *dev;
 
-	sys_dev = get_cpu_sysdev(cpu);
+	dev = get_cpu_device(cpu);
 	switch (action) {
 	case CPU_ONLINE:
 	case CPU_ONLINE_FROZEN:
@@ -474,13 +476,13 @@ mc_cpu_callback(struct notifier_block *nb, unsigned long action, void *hcpu)
 	case CPU_DOWN_FAILED:
 	case CPU_DOWN_FAILED_FROZEN:
 		pr_debug("CPU%d added\n", cpu);
-		if (sysfs_create_group(&sys_dev->kobj, &mc_attr_group))
+		if (sysfs_create_group(&dev->kobj, &mc_attr_group))
 			pr_err("Failed to create group for CPU%d\n", cpu);
 		break;
 	case CPU_DOWN_PREPARE:
 	case CPU_DOWN_PREPARE_FROZEN:
 		/* Suspend is in progress, only remove the interface */
-		sysfs_remove_group(&sys_dev->kobj, &mc_attr_group);
+		sysfs_remove_group(&dev->kobj, &mc_attr_group);
 		pr_debug("CPU%d removed\n", cpu);
 		break;
 
@@ -527,7 +529,7 @@ static int __init microcode_init(void)
 	get_online_cpus();
 	mutex_lock(&microcode_mutex);
 
-	error = sysdev_driver_register(&cpu_sysdev_class, &mc_sysdev_driver);
+	error = subsys_interface_register(&mc_cpu_interface);
 
 	mutex_unlock(&microcode_mutex);
 	put_online_cpus();
@@ -561,7 +563,7 @@ static void __exit microcode_exit(void)
 	get_online_cpus();
 	mutex_lock(&microcode_mutex);
 
-	sysdev_driver_unregister(&cpu_sysdev_class, &mc_sysdev_driver);
+	subsys_interface_unregister(&mc_cpu_interface);
 
 	mutex_unlock(&microcode_mutex);
 	put_online_cpus();
diff --git a/drivers/acpi/processor_driver.c b/drivers/acpi/processor_driver.c
index 9d7bc9f6b6cc..20a68ca386de 100644
--- a/drivers/acpi/processor_driver.c
+++ b/drivers/acpi/processor_driver.c
@@ -446,7 +446,7 @@ static int __cpuinit acpi_processor_add(struct acpi_device *device)
 {
 	struct acpi_processor *pr = NULL;
 	int result = 0;
-	struct sys_device *sysdev;
+	struct device *dev;
 
 	pr = kzalloc(sizeof(struct acpi_processor), GFP_KERNEL);
 	if (!pr)
@@ -491,8 +491,8 @@ static int __cpuinit acpi_processor_add(struct acpi_device *device)
 
 	per_cpu(processors, pr->id) = pr;
 
-	sysdev = get_cpu_sysdev(pr->id);
-	if (sysfs_create_link(&device->dev.kobj, &sysdev->kobj, "sysdev")) {
+	dev = get_cpu_device(pr->id);
+	if (sysfs_create_link(&device->dev.kobj, &dev->kobj, "sysdev")) {
 		result = -EFAULT;
 		goto err_free_cpumask;
 	}
diff --git a/drivers/acpi/processor_thermal.c b/drivers/acpi/processor_thermal.c
index 870550d6a4bf..3b599abf2b40 100644
--- a/drivers/acpi/processor_thermal.c
+++ b/drivers/acpi/processor_thermal.c
@@ -30,7 +30,6 @@
 #include <linux/module.h>
 #include <linux/init.h>
 #include <linux/cpufreq.h>
-#include <linux/sysdev.h>
 
 #include <asm/uaccess.h>
 
diff --git a/drivers/base/cpu.c b/drivers/base/cpu.c
index 251acea3d359..5bb0298fbcc0 100644
--- a/drivers/base/cpu.c
+++ b/drivers/base/cpu.c
@@ -1,8 +1,7 @@
 /*
- * drivers/base/cpu.c - basic CPU class support
+ * CPU subsystem support
  */
 
-#include <linux/sysdev.h>
 #include <linux/module.h>
 #include <linux/init.h>
 #include <linux/sched.h>
@@ -14,40 +13,40 @@
 
 #include "base.h"
 
-static struct sysdev_class_attribute *cpu_sysdev_class_attrs[];
-
-struct sysdev_class cpu_sysdev_class = {
+struct bus_type cpu_subsys = {
 	.name = "cpu",
-	.attrs = cpu_sysdev_class_attrs,
+	.dev_name = "cpu",
 };
-EXPORT_SYMBOL(cpu_sysdev_class);
+EXPORT_SYMBOL_GPL(cpu_subsys);
 
-static DEFINE_PER_CPU(struct sys_device *, cpu_sys_devices);
+static DEFINE_PER_CPU(struct device *, cpu_sys_devices);
 
 #ifdef CONFIG_HOTPLUG_CPU
-static ssize_t show_online(struct sys_device *dev, struct sysdev_attribute *attr,
+static ssize_t show_online(struct device *dev,
+			   struct device_attribute *attr,
 			   char *buf)
 {
-	struct cpu *cpu = container_of(dev, struct cpu, sysdev);
+	struct cpu *cpu = container_of(dev, struct cpu, dev);
 
-	return sprintf(buf, "%u\n", !!cpu_online(cpu->sysdev.id));
+	return sprintf(buf, "%u\n", !!cpu_online(cpu->dev.id));
 }
 
-static ssize_t __ref store_online(struct sys_device *dev, struct sysdev_attribute *attr,
-				 const char *buf, size_t count)
+static ssize_t __ref store_online(struct device *dev,
+				  struct device_attribute *attr,
+				  const char *buf, size_t count)
 {
-	struct cpu *cpu = container_of(dev, struct cpu, sysdev);
+	struct cpu *cpu = container_of(dev, struct cpu, dev);
 	ssize_t ret;
 
 	cpu_hotplug_driver_lock();
 	switch (buf[0]) {
 	case '0':
-		ret = cpu_down(cpu->sysdev.id);
+		ret = cpu_down(cpu->dev.id);
 		if (!ret)
 			kobject_uevent(&dev->kobj, KOBJ_OFFLINE);
 		break;
 	case '1':
-		ret = cpu_up(cpu->sysdev.id);
+		ret = cpu_up(cpu->dev.id);
 		if (!ret)
 			kobject_uevent(&dev->kobj, KOBJ_ONLINE);
 		break;
@@ -60,44 +59,44 @@ static ssize_t __ref store_online(struct sys_device *dev, struct sysdev_attribut
 		ret = count;
 	return ret;
 }
-static SYSDEV_ATTR(online, 0644, show_online, store_online);
+static DEVICE_ATTR(online, 0644, show_online, store_online);
 
 static void __cpuinit register_cpu_control(struct cpu *cpu)
 {
-	sysdev_create_file(&cpu->sysdev, &attr_online);
+	device_create_file(&cpu->dev, &dev_attr_online);
 }
 void unregister_cpu(struct cpu *cpu)
 {
-	int logical_cpu = cpu->sysdev.id;
+	int logical_cpu = cpu->dev.id;
 
 	unregister_cpu_under_node(logical_cpu, cpu_to_node(logical_cpu));
 
-	sysdev_remove_file(&cpu->sysdev, &attr_online);
+	device_remove_file(&cpu->dev, &dev_attr_online);
 
-	sysdev_unregister(&cpu->sysdev);
+	device_unregister(&cpu->dev);
 	per_cpu(cpu_sys_devices, logical_cpu) = NULL;
 	return;
 }
 
 #ifdef CONFIG_ARCH_CPU_PROBE_RELEASE
-static ssize_t cpu_probe_store(struct sysdev_class *class,
-			       struct sysdev_class_attribute *attr,
+static ssize_t cpu_probe_store(struct device *dev,
+			       struct device_attribute *attr,
 			       const char *buf,
 			       size_t count)
 {
 	return arch_cpu_probe(buf, count);
 }
 
-static ssize_t cpu_release_store(struct sysdev_class *class,
-				 struct sysdev_class_attribute *attr,
+static ssize_t cpu_release_store(struct device *dev,
+				 struct device_attribute *attr,
 				 const char *buf,
 				 size_t count)
 {
 	return arch_cpu_release(buf, count);
 }
 
-static SYSDEV_CLASS_ATTR(probe, S_IWUSR, NULL, cpu_probe_store);
-static SYSDEV_CLASS_ATTR(release, S_IWUSR, NULL, cpu_release_store);
+static DEVICE_ATTR(probe, S_IWUSR, NULL, cpu_probe_store);
+static DEVICE_ATTR(release, S_IWUSR, NULL, cpu_release_store);
 #endif /* CONFIG_ARCH_CPU_PROBE_RELEASE */
 
 #else /* ... !CONFIG_HOTPLUG_CPU */
@@ -109,15 +108,15 @@ static inline void register_cpu_control(struct cpu *cpu)
 #ifdef CONFIG_KEXEC
 #include <linux/kexec.h>
 
-static ssize_t show_crash_notes(struct sys_device *dev, struct sysdev_attribute *attr,
+static ssize_t show_crash_notes(struct device *dev, struct device_attribute *attr,
 				char *buf)
 {
-	struct cpu *cpu = container_of(dev, struct cpu, sysdev);
+	struct cpu *cpu = container_of(dev, struct cpu, dev);
 	ssize_t rc;
 	unsigned long long addr;
 	int cpunum;
 
-	cpunum = cpu->sysdev.id;
+	cpunum = cpu->dev.id;
 
 	/*
 	 * Might be reading other cpu's data based on which cpu read thread
@@ -129,7 +128,7 @@ static ssize_t show_crash_notes(struct sys_device *dev, struct sysdev_attribute
 	rc = sprintf(buf, "%Lx\n", addr);
 	return rc;
 }
-static SYSDEV_ATTR(crash_notes, 0400, show_crash_notes, NULL);
+static DEVICE_ATTR(crash_notes, 0400, show_crash_notes, NULL);
 #endif
 
 /*
@@ -137,12 +136,12 @@ static SYSDEV_ATTR(crash_notes, 0400, show_crash_notes, NULL);
  */
 
 struct cpu_attr {
-	struct sysdev_class_attribute attr;
+	struct device_attribute attr;
 	const struct cpumask *const * const map;
 };
 
-static ssize_t show_cpus_attr(struct sysdev_class *class,
-			      struct sysdev_class_attribute *attr,
+static ssize_t show_cpus_attr(struct device *dev,
+			      struct device_attribute *attr,
 			      char *buf)
 {
 	struct cpu_attr *ca = container_of(attr, struct cpu_attr, attr);
@@ -153,10 +152,10 @@ static ssize_t show_cpus_attr(struct sysdev_class *class,
 	return n;
 }
 
-#define _CPU_ATTR(name, map)						\
-	{ _SYSDEV_CLASS_ATTR(name, 0444, show_cpus_attr, NULL), map }
+#define _CPU_ATTR(name, map) \
+	{ __ATTR(name, 0444, show_cpus_attr, NULL), map }
 
-/* Keep in sync with cpu_sysdev_class_attrs */
+/* Keep in sync with cpu_subsys_attrs */
 static struct cpu_attr cpu_attrs[] = {
 	_CPU_ATTR(online, &cpu_online_mask),
 	_CPU_ATTR(possible, &cpu_possible_mask),
@@ -166,19 +165,19 @@ static struct cpu_attr cpu_attrs[] = {
 /*
  * Print values for NR_CPUS and offlined cpus
  */
-static ssize_t print_cpus_kernel_max(struct sysdev_class *class,
-				     struct sysdev_class_attribute *attr, char *buf)
+static ssize_t print_cpus_kernel_max(struct device *dev,
+				     struct device_attribute *attr, char *buf)
 {
 	int n = snprintf(buf, PAGE_SIZE-2, "%d\n", NR_CPUS - 1);
 	return n;
 }
-static SYSDEV_CLASS_ATTR(kernel_max, 0444, print_cpus_kernel_max, NULL);
+static DEVICE_ATTR(kernel_max, 0444, print_cpus_kernel_max, NULL);
 
 /* arch-optional setting to enable display of offline cpus >= nr_cpu_ids */
 unsigned int total_cpus;
 
-static ssize_t print_cpus_offline(struct sysdev_class *class,
-				  struct sysdev_class_attribute *attr, char *buf)
+static ssize_t print_cpus_offline(struct device *dev,
+				  struct device_attribute *attr, char *buf)
 {
 	int n = 0, len = PAGE_SIZE-2;
 	cpumask_var_t offline;
@@ -205,7 +204,7 @@ static ssize_t print_cpus_offline(struct sysdev_class *class,
 	n += snprintf(&buf[n], len - n, "\n");
 	return n;
 }
-static SYSDEV_CLASS_ATTR(offline, 0444, print_cpus_offline, NULL);
+static DEVICE_ATTR(offline, 0444, print_cpus_offline, NULL);
 
 /*
  * register_cpu - Setup a sysfs device for a CPU.
@@ -218,57 +217,66 @@ static SYSDEV_CLASS_ATTR(offline, 0444, print_cpus_offline, NULL);
 int __cpuinit register_cpu(struct cpu *cpu, int num)
 {
 	int error;
-	cpu->node_id = cpu_to_node(num);
-	cpu->sysdev.id = num;
-	cpu->sysdev.cls = &cpu_sysdev_class;
-
-	error = sysdev_register(&cpu->sysdev);
 
+	cpu->node_id = cpu_to_node(num);
+	cpu->dev.id = num;
+	cpu->dev.bus = &cpu_subsys;
+	error = device_register(&cpu->dev);
 	if (!error && cpu->hotpluggable)
 		register_cpu_control(cpu);
 	if (!error)
-		per_cpu(cpu_sys_devices, num) = &cpu->sysdev;
+		per_cpu(cpu_sys_devices, num) = &cpu->dev;
 	if (!error)
 		register_cpu_under_node(num, cpu_to_node(num));
 
 #ifdef CONFIG_KEXEC
 	if (!error)
-		error = sysdev_create_file(&cpu->sysdev, &attr_crash_notes);
+		error = device_create_file(&cpu->dev, &dev_attr_crash_notes);
 #endif
 	return error;
 }
 
-struct sys_device *get_cpu_sysdev(unsigned cpu)
+struct device *get_cpu_device(unsigned cpu)
 {
 	if (cpu < nr_cpu_ids && cpu_possible(cpu))
 		return per_cpu(cpu_sys_devices, cpu);
 	else
 		return NULL;
 }
-EXPORT_SYMBOL_GPL(get_cpu_sysdev);
+EXPORT_SYMBOL_GPL(get_cpu_device);
+
+static struct attribute *cpu_root_attrs[] = {
+#ifdef CONFIG_ARCH_CPU_PROBE_RELEASE
+	&dev_attr_probe.attr,
+	&dev_attr_release.attr,
+#endif
+	&cpu_attrs[0].attr.attr,
+	&cpu_attrs[1].attr.attr,
+	&cpu_attrs[2].attr.attr,
+	&dev_attr_kernel_max.attr,
+	&dev_attr_offline.attr,
+	NULL
+};
+
+static struct attribute_group cpu_root_attr_group = {
+	.attrs = cpu_root_attrs,
+};
+
+static const struct attribute_group *cpu_root_attr_groups[] = {
+	&cpu_root_attr_group,
+	NULL,
+};
 
 int __init cpu_dev_init(void)
 {
 	int err;
 
-	err = sysdev_class_register(&cpu_sysdev_class);
+	err = subsys_system_register(&cpu_subsys, cpu_root_attr_groups);
+	if (err)
+		return err;
+
 #if defined(CONFIG_SCHED_MC) || defined(CONFIG_SCHED_SMT)
-	if (!err)
-		err = sched_create_sysfs_power_savings_entries(&cpu_sysdev_class);
+	err = sched_create_sysfs_power_savings_entries(cpu_subsys.dev_root);
 #endif
-
 	return err;
 }
-
-static struct sysdev_class_attribute *cpu_sysdev_class_attrs[] = {
-#ifdef CONFIG_ARCH_CPU_PROBE_RELEASE
-	&attr_probe,
-	&attr_release,
-#endif
-	&cpu_attrs[0].attr,
-	&cpu_attrs[1].attr,
-	&cpu_attrs[2].attr,
-	&attr_kernel_max,
-	&attr_offline,
-	NULL
-};
diff --git a/drivers/base/node.c b/drivers/base/node.c
index 793f796c4da3..6ce1501c7de5 100644
--- a/drivers/base/node.c
+++ b/drivers/base/node.c
@@ -315,12 +315,12 @@ struct node node_devices[MAX_NUMNODES];
 int register_cpu_under_node(unsigned int cpu, unsigned int nid)
 {
 	int ret;
-	struct sys_device *obj;
+	struct device *obj;
 
 	if (!node_online(nid))
 		return 0;
 
-	obj = get_cpu_sysdev(cpu);
+	obj = get_cpu_device(cpu);
 	if (!obj)
 		return 0;
 
@@ -337,12 +337,12 @@ int register_cpu_under_node(unsigned int cpu, unsigned int nid)
 
 int unregister_cpu_under_node(unsigned int cpu, unsigned int nid)
 {
-	struct sys_device *obj;
+	struct device *obj;
 
 	if (!node_online(nid))
 		return 0;
 
-	obj = get_cpu_sysdev(cpu);
+	obj = get_cpu_device(cpu);
 	if (!obj)
 		return 0;
 
diff --git a/drivers/base/topology.c b/drivers/base/topology.c
index f6f37a05a0c3..ae989c57cd5e 100644
--- a/drivers/base/topology.c
+++ b/drivers/base/topology.c
@@ -23,7 +23,6 @@
  * Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
  *
  */
-#include <linux/sysdev.h>
 #include <linux/init.h>
 #include <linux/mm.h>
 #include <linux/cpu.h>
@@ -32,14 +31,14 @@
 #include <linux/topology.h>
 
 #define define_one_ro_named(_name, _func)				\
-static SYSDEV_ATTR(_name, 0444, _func, NULL)
+	static DEVICE_ATTR(_name, 0444, _func, NULL)
 
 #define define_one_ro(_name)				\
-static SYSDEV_ATTR(_name, 0444, show_##_name, NULL)
+	static DEVICE_ATTR(_name, 0444, show_##_name, NULL)
 
 #define define_id_show_func(name)				\
-static ssize_t show_##name(struct sys_device *dev,		\
-		struct sysdev_attribute *attr, char *buf)	\
+static ssize_t show_##name(struct device *dev,			\
+		struct device_attribute *attr, char *buf)	\
 {								\
 	unsigned int cpu = dev->id;				\
 	return sprintf(buf, "%d\n", topology_##name(cpu));	\
@@ -65,16 +64,16 @@ static ssize_t show_cpumap(int type, const struct cpumask *mask, char *buf)
 
 #ifdef arch_provides_topology_pointers
 #define define_siblings_show_map(name)					\
-static ssize_t show_##name(struct sys_device *dev,			\
-			   struct sysdev_attribute *attr, char *buf)	\
+static ssize_t show_##name(struct device *dev,				\
+			   struct device_attribute *attr, char *buf)	\
 {									\
 	unsigned int cpu = dev->id;					\
 	return show_cpumap(0, topology_##name(cpu), buf);		\
 }
 
 #define define_siblings_show_list(name)					\
-static ssize_t show_##name##_list(struct sys_device *dev,		\
-				  struct sysdev_attribute *attr,	\
+static ssize_t show_##name##_list(struct device *dev,			\
+				  struct device_attribute *attr,	\
 				  char *buf)				\
 {									\
 	unsigned int cpu = dev->id;					\
@@ -83,15 +82,15 @@ static ssize_t show_##name##_list(struct sys_device *dev,		\
 
 #else
 #define define_siblings_show_map(name)					\
-static ssize_t show_##name(struct sys_device *dev,			\
-			   struct sysdev_attribute *attr, char *buf)	\
+static ssize_t show_##name(struct device *dev,				\
+			   struct device_attribute *attr, char *buf)	\
 {									\
 	return show_cpumap(0, topology_##name(dev->id), buf);		\
 }
 
 #define define_siblings_show_list(name)					\
-static ssize_t show_##name##_list(struct sys_device *dev,		\
-				  struct sysdev_attribute *attr,	\
+static ssize_t show_##name##_list(struct device *dev,			\
+				  struct device_attribute *attr,	\
 				  char *buf)				\
 {									\
 	return show_cpumap(1, topology_##name(dev->id), buf);		\
@@ -124,16 +123,16 @@ define_one_ro_named(book_siblings_list, show_book_cpumask_list);
 #endif
 
 static struct attribute *default_attrs[] = {
-	&attr_physical_package_id.attr,
-	&attr_core_id.attr,
-	&attr_thread_siblings.attr,
-	&attr_thread_siblings_list.attr,
-	&attr_core_siblings.attr,
-	&attr_core_siblings_list.attr,
+	&dev_attr_physical_package_id.attr,
+	&dev_attr_core_id.attr,
+	&dev_attr_thread_siblings.attr,
+	&dev_attr_thread_siblings_list.attr,
+	&dev_attr_core_siblings.attr,
+	&dev_attr_core_siblings_list.attr,
 #ifdef CONFIG_SCHED_BOOK
-	&attr_book_id.attr,
-	&attr_book_siblings.attr,
-	&attr_book_siblings_list.attr,
+	&dev_attr_book_id.attr,
+	&dev_attr_book_siblings.attr,
+	&dev_attr_book_siblings_list.attr,
 #endif
 	NULL
 };
@@ -146,16 +145,16 @@ static struct attribute_group topology_attr_group = {
 /* Add/Remove cpu_topology interface for CPU device */
 static int __cpuinit topology_add_dev(unsigned int cpu)
 {
-	struct sys_device *sys_dev = get_cpu_sysdev(cpu);
+	struct device *dev = get_cpu_device(cpu);
 
-	return sysfs_create_group(&sys_dev->kobj, &topology_attr_group);
+	return sysfs_create_group(&dev->kobj, &topology_attr_group);
 }
 
 static void __cpuinit topology_remove_dev(unsigned int cpu)
 {
-	struct sys_device *sys_dev = get_cpu_sysdev(cpu);
+	struct device *dev = get_cpu_device(cpu);
 
-	sysfs_remove_group(&sys_dev->kobj, &topology_attr_group);
+	sysfs_remove_group(&dev->kobj, &topology_attr_group);
 }
 
 static int __cpuinit topology_cpu_callback(struct notifier_block *nfb,
diff --git a/drivers/cpufreq/cpufreq.c b/drivers/cpufreq/cpufreq.c
index 987a165ede26..8c2df3499da7 100644
--- a/drivers/cpufreq/cpufreq.c
+++ b/drivers/cpufreq/cpufreq.c
@@ -679,7 +679,7 @@ static struct kobj_type ktype_cpufreq = {
  */
 static int cpufreq_add_dev_policy(unsigned int cpu,
 				  struct cpufreq_policy *policy,
-				  struct sys_device *sys_dev)
+				  struct device *dev)
 {
 	int ret = 0;
 #ifdef CONFIG_SMP
@@ -728,7 +728,7 @@ static int cpufreq_add_dev_policy(unsigned int cpu,
 			spin_unlock_irqrestore(&cpufreq_driver_lock, flags);
 
 			pr_debug("CPU already managed, adding link\n");
-			ret = sysfs_create_link(&sys_dev->kobj,
+			ret = sysfs_create_link(&dev->kobj,
 						&managed_policy->kobj,
 						"cpufreq");
 			if (ret)
@@ -761,7 +761,7 @@ static int cpufreq_add_dev_symlink(unsigned int cpu,
 
 	for_each_cpu(j, policy->cpus) {
 		struct cpufreq_policy *managed_policy;
-		struct sys_device *cpu_sys_dev;
+		struct device *cpu_dev;
 
 		if (j == cpu)
 			continue;
@@ -770,8 +770,8 @@ static int cpufreq_add_dev_symlink(unsigned int cpu,
 
 		pr_debug("CPU %u already managed, adding link\n", j);
 		managed_policy = cpufreq_cpu_get(cpu);
-		cpu_sys_dev = get_cpu_sysdev(j);
-		ret = sysfs_create_link(&cpu_sys_dev->kobj, &policy->kobj,
+		cpu_dev = get_cpu_device(j);
+		ret = sysfs_create_link(&cpu_dev->kobj, &policy->kobj,
 					"cpufreq");
 		if (ret) {
 			cpufreq_cpu_put(managed_policy);
@@ -783,7 +783,7 @@ static int cpufreq_add_dev_symlink(unsigned int cpu,
 
 static int cpufreq_add_dev_interface(unsigned int cpu,
 				     struct cpufreq_policy *policy,
-				     struct sys_device *sys_dev)
+				     struct device *dev)
 {
 	struct cpufreq_policy new_policy;
 	struct freq_attr **drv_attr;
@@ -793,7 +793,7 @@ static int cpufreq_add_dev_interface(unsigned int cpu,
 
 	/* prepare interface data */
 	ret = kobject_init_and_add(&policy->kobj, &ktype_cpufreq,
-				   &sys_dev->kobj, "cpufreq");
+				   &dev->kobj, "cpufreq");
 	if (ret)
 		return ret;
 
@@ -866,9 +866,9 @@ err_out_kobj_put:
  * with with cpu hotplugging and all hell will break loose. Tried to clean this
  * mess up, but more thorough testing is needed. - Mathieu
  */
-static int cpufreq_add_dev(struct sys_device *sys_dev)
+static int cpufreq_add_dev(struct device *dev, struct subsys_interface *sif)
 {
-	unsigned int cpu = sys_dev->id;
+	unsigned int cpu = dev->id;
 	int ret = 0, found = 0;
 	struct cpufreq_policy *policy;
 	unsigned long flags;
@@ -947,7 +947,7 @@ static int cpufreq_add_dev(struct sys_device *sys_dev)
 	blocking_notifier_call_chain(&cpufreq_policy_notifier_list,
 				     CPUFREQ_START, policy);
 
-	ret = cpufreq_add_dev_policy(cpu, policy, sys_dev);
+	ret = cpufreq_add_dev_policy(cpu, policy, dev);
 	if (ret) {
 		if (ret > 0)
 			/* This is a managed cpu, symlink created,
@@ -956,7 +956,7 @@ static int cpufreq_add_dev(struct sys_device *sys_dev)
 		goto err_unlock_policy;
 	}
 
-	ret = cpufreq_add_dev_interface(cpu, policy, sys_dev);
+	ret = cpufreq_add_dev_interface(cpu, policy, dev);
 	if (ret)
 		goto err_out_unregister;
 
@@ -999,15 +999,15 @@ module_out:
  * Caller should already have policy_rwsem in write mode for this CPU.
  * This routine frees the rwsem before returning.
  */
-static int __cpufreq_remove_dev(struct sys_device *sys_dev)
+static int __cpufreq_remove_dev(struct device *dev, struct subsys_interface *sif)
 {
-	unsigned int cpu = sys_dev->id;
+	unsigned int cpu = dev->id;
 	unsigned long flags;
 	struct cpufreq_policy *data;
 	struct kobject *kobj;
 	struct completion *cmp;
 #ifdef CONFIG_SMP
-	struct sys_device *cpu_sys_dev;
+	struct device *cpu_dev;
 	unsigned int j;
 #endif
 
@@ -1032,7 +1032,7 @@ static int __cpufreq_remove_dev(struct sys_device *sys_dev)
 		pr_debug("removing link\n");
 		cpumask_clear_cpu(cpu, data->cpus);
 		spin_unlock_irqrestore(&cpufreq_driver_lock, flags);
-		kobj = &sys_dev->kobj;
+		kobj = &dev->kobj;
 		cpufreq_cpu_put(data);
 		unlock_policy_rwsem_write(cpu);
 		sysfs_remove_link(kobj, "cpufreq");
@@ -1071,8 +1071,8 @@ static int __cpufreq_remove_dev(struct sys_device *sys_dev)
 			strncpy(per_cpu(cpufreq_cpu_governor, j),
 				data->governor->name, CPUFREQ_NAME_LEN);
 #endif
-			cpu_sys_dev = get_cpu_sysdev(j);
-			kobj = &cpu_sys_dev->kobj;
+			cpu_dev = get_cpu_device(j);
+			kobj = &cpu_dev->kobj;
 			unlock_policy_rwsem_write(cpu);
 			sysfs_remove_link(kobj, "cpufreq");
 			lock_policy_rwsem_write(cpu);
@@ -1112,11 +1112,11 @@ static int __cpufreq_remove_dev(struct sys_device *sys_dev)
 	if (unlikely(cpumask_weight(data->cpus) > 1)) {
 		/* first sibling now owns the new sysfs dir */
 		cpumask_clear_cpu(cpu, data->cpus);
-		cpufreq_add_dev(get_cpu_sysdev(cpumask_first(data->cpus)));
+		cpufreq_add_dev(get_cpu_device(cpumask_first(data->cpus)), NULL);
 
 		/* finally remove our own symlink */
 		lock_policy_rwsem_write(cpu);
-		__cpufreq_remove_dev(sys_dev);
+		__cpufreq_remove_dev(dev, sif);
 	}
 #endif
 
@@ -1128,9 +1128,9 @@ static int __cpufreq_remove_dev(struct sys_device *sys_dev)
 }
 
 
-static int cpufreq_remove_dev(struct sys_device *sys_dev)
+static int cpufreq_remove_dev(struct device *dev, struct subsys_interface *sif)
 {
-	unsigned int cpu = sys_dev->id;
+	unsigned int cpu = dev->id;
 	int retval;
 
 	if (cpu_is_offline(cpu))
@@ -1139,7 +1139,7 @@ static int cpufreq_remove_dev(struct sys_device *sys_dev)
 	if (unlikely(lock_policy_rwsem_write(cpu)))
 		BUG();
 
-	retval = __cpufreq_remove_dev(sys_dev);
+	retval = __cpufreq_remove_dev(dev, sif);
 	return retval;
 }
 
@@ -1271,9 +1271,11 @@ out:
 }
 EXPORT_SYMBOL(cpufreq_get);
 
-static struct sysdev_driver cpufreq_sysdev_driver = {
-	.add		= cpufreq_add_dev,
-	.remove		= cpufreq_remove_dev,
+static struct subsys_interface cpufreq_interface = {
+	.name		= "cpufreq",
+	.subsys		= &cpu_subsys,
+	.add_dev	= cpufreq_add_dev,
+	.remove_dev	= cpufreq_remove_dev,
 };
 
 
@@ -1765,25 +1767,25 @@ static int __cpuinit cpufreq_cpu_callback(struct notifier_block *nfb,
 					unsigned long action, void *hcpu)
 {
 	unsigned int cpu = (unsigned long)hcpu;
-	struct sys_device *sys_dev;
+	struct device *dev;
 
-	sys_dev = get_cpu_sysdev(cpu);
-	if (sys_dev) {
+	dev = get_cpu_device(cpu);
+	if (dev) {
 		switch (action) {
 		case CPU_ONLINE:
 		case CPU_ONLINE_FROZEN:
-			cpufreq_add_dev(sys_dev);
+			cpufreq_add_dev(dev, NULL);
 			break;
 		case CPU_DOWN_PREPARE:
 		case CPU_DOWN_PREPARE_FROZEN:
 			if (unlikely(lock_policy_rwsem_write(cpu)))
 				BUG();
 
-			__cpufreq_remove_dev(sys_dev);
+			__cpufreq_remove_dev(dev, NULL);
 			break;
 		case CPU_DOWN_FAILED:
 		case CPU_DOWN_FAILED_FROZEN:
-			cpufreq_add_dev(sys_dev);
+			cpufreq_add_dev(dev, NULL);
 			break;
 		}
 	}
@@ -1830,8 +1832,7 @@ int cpufreq_register_driver(struct cpufreq_driver *driver_data)
 	cpufreq_driver = driver_data;
 	spin_unlock_irqrestore(&cpufreq_driver_lock, flags);
 
-	ret = sysdev_driver_register(&cpu_sysdev_class,
-					&cpufreq_sysdev_driver);
+	ret = subsys_interface_register(&cpufreq_interface);
 	if (ret)
 		goto err_null_driver;
 
@@ -1850,7 +1851,7 @@ int cpufreq_register_driver(struct cpufreq_driver *driver_data)
 		if (ret) {
 			pr_debug("no CPU initialized for driver %s\n",
 							driver_data->name);
-			goto err_sysdev_unreg;
+			goto err_if_unreg;
 		}
 	}
 
@@ -1858,9 +1859,8 @@ int cpufreq_register_driver(struct cpufreq_driver *driver_data)
 	pr_debug("driver %s up and running\n", driver_data->name);
 
 	return 0;
-err_sysdev_unreg:
-	sysdev_driver_unregister(&cpu_sysdev_class,
-			&cpufreq_sysdev_driver);
+err_if_unreg:
+	subsys_interface_unregister(&cpufreq_interface);
 err_null_driver:
 	spin_lock_irqsave(&cpufreq_driver_lock, flags);
 	cpufreq_driver = NULL;
@@ -1887,7 +1887,7 @@ int cpufreq_unregister_driver(struct cpufreq_driver *driver)
 
 	pr_debug("unregistering driver %s\n", driver->name);
 
-	sysdev_driver_unregister(&cpu_sysdev_class, &cpufreq_sysdev_driver);
+	subsys_interface_unregister(&cpufreq_interface);
 	unregister_hotcpu_notifier(&cpufreq_cpu_notifier);
 
 	spin_lock_irqsave(&cpufreq_driver_lock, flags);
@@ -1907,8 +1907,7 @@ static int __init cpufreq_core_init(void)
 		init_rwsem(&per_cpu(cpu_policy_rwsem, cpu));
 	}
 
-	cpufreq_global_kobject = kobject_create_and_add("cpufreq",
-						&cpu_sysdev_class.kset.kobj);
+	cpufreq_global_kobject = kobject_create_and_add("cpufreq", &cpu_subsys.dev_root->kobj);
 	BUG_ON(!cpufreq_global_kobject);
 	register_syscore_ops(&cpufreq_syscore_ops);
 
diff --git a/drivers/cpufreq/cpufreq_stats.c b/drivers/cpufreq/cpufreq_stats.c
index c5072a91e848..390380a8cfc9 100644
--- a/drivers/cpufreq/cpufreq_stats.c
+++ b/drivers/cpufreq/cpufreq_stats.c
@@ -11,7 +11,6 @@
 
 #include <linux/kernel.h>
 #include <linux/slab.h>
-#include <linux/sysdev.h>
 #include <linux/cpu.h>
 #include <linux/sysfs.h>
 #include <linux/cpufreq.h>
diff --git a/drivers/cpuidle/cpuidle.c b/drivers/cpuidle/cpuidle.c
index 06ce2680d00d..59f4261c753a 100644
--- a/drivers/cpuidle/cpuidle.c
+++ b/drivers/cpuidle/cpuidle.c
@@ -291,10 +291,10 @@ EXPORT_SYMBOL_GPL(cpuidle_disable_device);
 static int __cpuidle_register_device(struct cpuidle_device *dev)
 {
 	int ret;
-	struct sys_device *sys_dev = get_cpu_sysdev((unsigned long)dev->cpu);
+	struct device *cpu_dev = get_cpu_device((unsigned long)dev->cpu);
 	struct cpuidle_driver *cpuidle_driver = cpuidle_get_driver();
 
-	if (!sys_dev)
+	if (!dev)
 		return -EINVAL;
 	if (!try_module_get(cpuidle_driver->owner))
 		return -EINVAL;
@@ -303,7 +303,7 @@ static int __cpuidle_register_device(struct cpuidle_device *dev)
 
 	per_cpu(cpuidle_devices, dev->cpu) = dev;
 	list_add(&dev->device_list, &cpuidle_detected_devices);
-	if ((ret = cpuidle_add_sysfs(sys_dev))) {
+	if ((ret = cpuidle_add_sysfs(cpu_dev))) {
 		module_put(cpuidle_driver->owner);
 		return ret;
 	}
@@ -344,7 +344,7 @@ EXPORT_SYMBOL_GPL(cpuidle_register_device);
  */
 void cpuidle_unregister_device(struct cpuidle_device *dev)
 {
-	struct sys_device *sys_dev = get_cpu_sysdev((unsigned long)dev->cpu);
+	struct device *cpu_dev = get_cpu_device((unsigned long)dev->cpu);
 	struct cpuidle_driver *cpuidle_driver = cpuidle_get_driver();
 
 	if (dev->registered == 0)
@@ -354,7 +354,7 @@ void cpuidle_unregister_device(struct cpuidle_device *dev)
 
 	cpuidle_disable_device(dev);
 
-	cpuidle_remove_sysfs(sys_dev);
+	cpuidle_remove_sysfs(cpu_dev);
 	list_del(&dev->device_list);
 	wait_for_completion(&dev->kobj_unregister);
 	per_cpu(cpuidle_devices, dev->cpu) = NULL;
@@ -411,7 +411,7 @@ static int __init cpuidle_init(void)
 	if (cpuidle_disabled())
 		return -ENODEV;
 
-	ret = cpuidle_add_class_sysfs(&cpu_sysdev_class);
+	ret = cpuidle_add_interface(cpu_subsys.dev_root);
 	if (ret)
 		return ret;
 
diff --git a/drivers/cpuidle/cpuidle.h b/drivers/cpuidle/cpuidle.h
index 38c3fd8b9d76..7db186685c27 100644
--- a/drivers/cpuidle/cpuidle.h
+++ b/drivers/cpuidle/cpuidle.h
@@ -5,7 +5,7 @@
 #ifndef __DRIVER_CPUIDLE_H
 #define __DRIVER_CPUIDLE_H
 
-#include <linux/sysdev.h>
+#include <linux/device.h>
 
 /* For internal use only */
 extern struct cpuidle_governor *cpuidle_curr_governor;
@@ -23,11 +23,11 @@ extern void cpuidle_uninstall_idle_handler(void);
 extern int cpuidle_switch_governor(struct cpuidle_governor *gov);
 
 /* sysfs */
-extern int cpuidle_add_class_sysfs(struct sysdev_class *cls);
-extern void cpuidle_remove_class_sysfs(struct sysdev_class *cls);
+extern int cpuidle_add_interface(struct device *dev);
+extern void cpuidle_remove_interface(struct device *dev);
 extern int cpuidle_add_state_sysfs(struct cpuidle_device *device);
 extern void cpuidle_remove_state_sysfs(struct cpuidle_device *device);
-extern int cpuidle_add_sysfs(struct sys_device *sysdev);
-extern void cpuidle_remove_sysfs(struct sys_device *sysdev);
+extern int cpuidle_add_sysfs(struct device *dev);
+extern void cpuidle_remove_sysfs(struct device *dev);
 
 #endif /* __DRIVER_CPUIDLE_H */
diff --git a/drivers/cpuidle/sysfs.c b/drivers/cpuidle/sysfs.c
index 1e756e160dca..3fe41fe4851a 100644
--- a/drivers/cpuidle/sysfs.c
+++ b/drivers/cpuidle/sysfs.c
@@ -22,8 +22,8 @@ static int __init cpuidle_sysfs_setup(char *unused)
 }
 __setup("cpuidle_sysfs_switch", cpuidle_sysfs_setup);
 
-static ssize_t show_available_governors(struct sysdev_class *class,
-					struct sysdev_class_attribute *attr,
+static ssize_t show_available_governors(struct device *dev,
+					struct device_attribute *attr,
 					char *buf)
 {
 	ssize_t i = 0;
@@ -42,8 +42,8 @@ out:
 	return i;
 }
 
-static ssize_t show_current_driver(struct sysdev_class *class,
-				   struct sysdev_class_attribute *attr,
+static ssize_t show_current_driver(struct device *dev,
+				   struct device_attribute *attr,
 				   char *buf)
 {
 	ssize_t ret;
@@ -59,8 +59,8 @@ static ssize_t show_current_driver(struct sysdev_class *class,
 	return ret;
 }
 
-static ssize_t show_current_governor(struct sysdev_class *class,
-				     struct sysdev_class_attribute *attr,
+static ssize_t show_current_governor(struct device *dev,
+				     struct device_attribute *attr,
 				     char *buf)
 {
 	ssize_t ret;
@@ -75,8 +75,8 @@ static ssize_t show_current_governor(struct sysdev_class *class,
 	return ret;
 }
 
-static ssize_t store_current_governor(struct sysdev_class *class,
-				      struct sysdev_class_attribute *attr,
+static ssize_t store_current_governor(struct device *dev,
+				      struct device_attribute *attr,
 				      const char *buf, size_t count)
 {
 	char gov_name[CPUIDLE_NAME_LEN];
@@ -109,50 +109,48 @@ static ssize_t store_current_governor(struct sysdev_class *class,
 		return count;
 }
 
-static SYSDEV_CLASS_ATTR(current_driver, 0444, show_current_driver, NULL);
-static SYSDEV_CLASS_ATTR(current_governor_ro, 0444, show_current_governor,
-			 NULL);
+static DEVICE_ATTR(current_driver, 0444, show_current_driver, NULL);
+static DEVICE_ATTR(current_governor_ro, 0444, show_current_governor, NULL);
 
-static struct attribute *cpuclass_default_attrs[] = {
-	&attr_current_driver.attr,
-	&attr_current_governor_ro.attr,
+static struct attribute *cpuidle_default_attrs[] = {
+	&dev_attr_current_driver.attr,
+	&dev_attr_current_governor_ro.attr,
 	NULL
 };
 
-static SYSDEV_CLASS_ATTR(available_governors, 0444, show_available_governors,
-			 NULL);
-static SYSDEV_CLASS_ATTR(current_governor, 0644, show_current_governor,
-			 store_current_governor);
+static DEVICE_ATTR(available_governors, 0444, show_available_governors, NULL);
+static DEVICE_ATTR(current_governor, 0644, show_current_governor,
+		   store_current_governor);
 
-static struct attribute *cpuclass_switch_attrs[] = {
-	&attr_available_governors.attr,
-	&attr_current_driver.attr,
-	&attr_current_governor.attr,
+static struct attribute *cpuidle_switch_attrs[] = {
+	&dev_attr_available_governors.attr,
+	&dev_attr_current_driver.attr,
+	&dev_attr_current_governor.attr,
 	NULL
 };
 
-static struct attribute_group cpuclass_attr_group = {
-	.attrs = cpuclass_default_attrs,
+static struct attribute_group cpuidle_attr_group = {
+	.attrs = cpuidle_default_attrs,
 	.name = "cpuidle",
 };
 
 /**
- * cpuidle_add_class_sysfs - add CPU global sysfs attributes
+ * cpuidle_add_interface - add CPU global sysfs attributes
  */
-int cpuidle_add_class_sysfs(struct sysdev_class *cls)
+int cpuidle_add_interface(struct device *dev)
 {
 	if (sysfs_switch)
-		cpuclass_attr_group.attrs = cpuclass_switch_attrs;
+		cpuidle_attr_group.attrs = cpuidle_switch_attrs;
 
-	return sysfs_create_group(&cls->kset.kobj, &cpuclass_attr_group);
+	return sysfs_create_group(&dev->kobj, &cpuidle_attr_group);
 }
 
 /**
- * cpuidle_remove_class_sysfs - remove CPU global sysfs attributes
+ * cpuidle_remove_interface - remove CPU global sysfs attributes
  */
-void cpuidle_remove_class_sysfs(struct sysdev_class *cls)
+void cpuidle_remove_interface(struct device *dev)
 {
-	sysfs_remove_group(&cls->kset.kobj, &cpuclass_attr_group);
+	sysfs_remove_group(&dev->kobj, &cpuidle_attr_group);
 }
 
 struct cpuidle_attr {
@@ -365,16 +363,16 @@ void cpuidle_remove_state_sysfs(struct cpuidle_device *device)
 
 /**
  * cpuidle_add_sysfs - creates a sysfs instance for the target device
- * @sysdev: the target device
+ * @dev: the target device
  */
-int cpuidle_add_sysfs(struct sys_device *sysdev)
+int cpuidle_add_sysfs(struct device *cpu_dev)
 {
-	int cpu = sysdev->id;
+	int cpu = cpu_dev->id;
 	struct cpuidle_device *dev;
 	int error;
 
 	dev = per_cpu(cpuidle_devices, cpu);
-	error = kobject_init_and_add(&dev->kobj, &ktype_cpuidle, &sysdev->kobj,
+	error = kobject_init_and_add(&dev->kobj, &ktype_cpuidle, &cpu_dev->kobj,
 				     "cpuidle");
 	if (!error)
 		kobject_uevent(&dev->kobj, KOBJ_ADD);
@@ -383,11 +381,11 @@ int cpuidle_add_sysfs(struct sys_device *sysdev)
 
 /**
  * cpuidle_remove_sysfs - deletes a sysfs instance on the target device
- * @sysdev: the target device
+ * @dev: the target device
  */
-void cpuidle_remove_sysfs(struct sys_device *sysdev)
+void cpuidle_remove_sysfs(struct device *cpu_dev)
 {
-	int cpu = sysdev->id;
+	int cpu = cpu_dev->id;
 	struct cpuidle_device *dev;
 
 	dev = per_cpu(cpuidle_devices, cpu);
diff --git a/drivers/s390/char/sclp_config.c b/drivers/s390/char/sclp_config.c
index 95b909ac2b73..3c03c1060be6 100644
--- a/drivers/s390/char/sclp_config.c
+++ b/drivers/s390/char/sclp_config.c
@@ -11,7 +11,7 @@
 #include <linux/init.h>
 #include <linux/errno.h>
 #include <linux/cpu.h>
-#include <linux/sysdev.h>
+#include <linux/device.h>
 #include <linux/workqueue.h>
 #include <asm/smp.h>
 
@@ -31,14 +31,14 @@ static struct work_struct sclp_cpu_change_work;
 static void sclp_cpu_capability_notify(struct work_struct *work)
 {
 	int cpu;
-	struct sys_device *sysdev;
+	struct device *dev;
 
 	s390_adjust_jiffies();
 	pr_warning("cpu capability changed.\n");
 	get_online_cpus();
 	for_each_online_cpu(cpu) {
-		sysdev = get_cpu_sysdev(cpu);
-		kobject_uevent(&sysdev->kobj, KOBJ_CHANGE);
+		dev = get_cpu_device(cpu);
+		kobject_uevent(&dev->kobj, KOBJ_CHANGE);
 	}
 	put_online_cpus();
 }
diff --git a/include/linux/cpu.h b/include/linux/cpu.h
index 6cb60fd2ea84..fc3da0d70d68 100644
--- a/include/linux/cpu.h
+++ b/include/linux/cpu.h
@@ -14,7 +14,7 @@
 #ifndef _LINUX_CPU_H_
 #define _LINUX_CPU_H_
 
-#include <linux/sysdev.h>
+#include <linux/device.h>
 #include <linux/node.h>
 #include <linux/compiler.h>
 #include <linux/cpumask.h>
@@ -22,19 +22,19 @@
 struct cpu {
 	int node_id;		/* The node which contains the CPU */
 	int hotpluggable;	/* creates sysfs control file if hotpluggable */
-	struct sys_device sysdev;
+	struct device dev;
 };
 
 extern int register_cpu(struct cpu *cpu, int num);
-extern struct sys_device *get_cpu_sysdev(unsigned cpu);
+extern struct device *get_cpu_device(unsigned cpu);
 
-extern int cpu_add_sysdev_attr(struct sysdev_attribute *attr);
-extern void cpu_remove_sysdev_attr(struct sysdev_attribute *attr);
+extern int cpu_add_dev_attr(struct device_attribute *attr);
+extern void cpu_remove_dev_attr(struct device_attribute *attr);
 
-extern int cpu_add_sysdev_attr_group(struct attribute_group *attrs);
-extern void cpu_remove_sysdev_attr_group(struct attribute_group *attrs);
+extern int cpu_add_dev_attr_group(struct attribute_group *attrs);
+extern void cpu_remove_dev_attr_group(struct attribute_group *attrs);
 
-extern int sched_create_sysfs_power_savings_entries(struct sysdev_class *cls);
+extern int sched_create_sysfs_power_savings_entries(struct device *dev);
 
 #ifdef CONFIG_HOTPLUG_CPU
 extern void unregister_cpu(struct cpu *cpu);
@@ -160,7 +160,7 @@ static inline void cpu_maps_update_done(void)
 }
 
 #endif /* CONFIG_SMP */
-extern struct sysdev_class cpu_sysdev_class;
+extern struct bus_type cpu_subsys;
 
 #ifdef CONFIG_HOTPLUG_CPU
 /* Stop CPUs going up and down. */
diff --git a/kernel/sched.c b/kernel/sched.c
index 0e9344a71be3..530772646443 100644
--- a/kernel/sched.c
+++ b/kernel/sched.c
@@ -7923,54 +7923,52 @@ static ssize_t sched_power_savings_store(const char *buf, size_t count, int smt)
 }
 
 #ifdef CONFIG_SCHED_MC
-static ssize_t sched_mc_power_savings_show(struct sysdev_class *class,
-					   struct sysdev_class_attribute *attr,
-					   char *page)
+static ssize_t sched_mc_power_savings_show(struct device *dev,
+					   struct device_attribute *attr,
+					   char *buf)
 {
-	return sprintf(page, "%u\n", sched_mc_power_savings);
+	return sprintf(buf, "%u\n", sched_mc_power_savings);
 }
-static ssize_t sched_mc_power_savings_store(struct sysdev_class *class,
-					    struct sysdev_class_attribute *attr,
+static ssize_t sched_mc_power_savings_store(struct device *dev,
+					    struct device_attribute *attr,
 					    const char *buf, size_t count)
 {
 	return sched_power_savings_store(buf, count, 0);
 }
-static SYSDEV_CLASS_ATTR(sched_mc_power_savings, 0644,
-			 sched_mc_power_savings_show,
-			 sched_mc_power_savings_store);
+static DEVICE_ATTR(sched_mc_power_savings, 0644,
+		   sched_mc_power_savings_show,
+		   sched_mc_power_savings_store);
 #endif
 
 #ifdef CONFIG_SCHED_SMT
-static ssize_t sched_smt_power_savings_show(struct sysdev_class *dev,
-					    struct sysdev_class_attribute *attr,
-					    char *page)
+static ssize_t sched_smt_power_savings_show(struct device *dev,
+					    struct device_attribute *attr,
+					    char *buf)
 {
-	return sprintf(page, "%u\n", sched_smt_power_savings);
+	return sprintf(buf, "%u\n", sched_smt_power_savings);
 }
-static ssize_t sched_smt_power_savings_store(struct sysdev_class *dev,
-					     struct sysdev_class_attribute *attr,
+static ssize_t sched_smt_power_savings_store(struct device *dev,
+					    struct device_attribute *attr,
 					     const char *buf, size_t count)
 {
 	return sched_power_savings_store(buf, count, 1);
 }
-static SYSDEV_CLASS_ATTR(sched_smt_power_savings, 0644,
+static DEVICE_ATTR(sched_smt_power_savings, 0644,
 		   sched_smt_power_savings_show,
 		   sched_smt_power_savings_store);
 #endif
 
-int __init sched_create_sysfs_power_savings_entries(struct sysdev_class *cls)
+int __init sched_create_sysfs_power_savings_entries(struct device *dev)
 {
 	int err = 0;
 
 #ifdef CONFIG_SCHED_SMT
 	if (smt_capable())
-		err = sysfs_create_file(&cls->kset.kobj,
-					&attr_sched_smt_power_savings.attr);
+		err = device_create_file(dev, &dev_attr_sched_smt_power_savings);
 #endif
 #ifdef CONFIG_SCHED_MC
 	if (!err && mc_capable())
-		err = sysfs_create_file(&cls->kset.kobj,
-					&attr_sched_mc_power_savings.attr);
+		err = device_create_file(dev, &dev_attr_sched_mc_power_savings);
 #endif
 	return err;
 }
-- 
cgit v1.2.3-71-gd317


From 7239f65cf364180cdb100a4ed211b2a9f9a72119 Mon Sep 17 00:00:00 2001
From: Kay Sievers <kay.sievers@vrfy.org>
Date: Wed, 21 Dec 2011 16:12:37 -0800
Subject: clockevents: remove sysdev.h

This isn't needed in the clockevents.c file, and the header file is
going away soon, so just remove the #include

Cc: Thomas Gleixner <tglx@linutronix.de>
Signed-off-by: Kay Sievers <kay.sievers@vrfy.org>
Signed-off-by: Greg Kroah-Hartman <gregkh@suse.de>
---
 kernel/time/clockevents.c | 1 -
 1 file changed, 1 deletion(-)

(limited to 'kernel')

diff --git a/kernel/time/clockevents.c b/kernel/time/clockevents.c
index 1ecd6ba36d6c..9cd928f7a7c6 100644
--- a/kernel/time/clockevents.c
+++ b/kernel/time/clockevents.c
@@ -17,7 +17,6 @@
 #include <linux/module.h>
 #include <linux/notifier.h>
 #include <linux/smp.h>
-#include <linux/sysdev.h>
 
 #include "tick-internal.h"
 
-- 
cgit v1.2.3-71-gd317


From 026085ef5ae07c3197f2baacc091ce067b86ed11 Mon Sep 17 00:00:00 2001
From: Mandeep Singh Baines <msb@chromium.org>
Date: Wed, 21 Dec 2011 20:18:35 -0800
Subject: cgroup: remove redundant get/put of old css_set from migrate

We can now assume that the css_set reference held by the task
will not go away for an exiting task. PF_EXITING state can be
trusted throughout migration by checking it after locking
threadgroup.

Changes in V4:
* https://lkml.org/lkml/2011/12/20/368 (Tejun Heo)
  * Fix typo in commit message
  * Undid the rename of css_set_check_fetched
* https://lkml.org/lkml/2011/12/20/427 (Li Zefan)
  * Fix comment in cgroup_task_migrate()
Changes in V3:
* https://lkml.org/lkml/2011/12/20/255 (Frederic Weisbecker)
  * Fixed to put error in retval
Changes in V2:
* https://lkml.org/lkml/2011/12/19/289 (Tejun Heo)
  * Updated commit message

-tj: removed stale patch description about dropped function rename.

Signed-off-by: Mandeep Singh Baines <msb@chromium.org>
Acked-by: Li Zefan <lizf@cn.fujitsu.com>
Signed-off-by: Tejun Heo <tj@kernel.org>
Cc: containers@lists.linux-foundation.org
Cc: cgroups@vger.kernel.org
Cc: KAMEZAWA Hiroyuki <kamezawa.hiroyu@jp.fujitsu.com>
Cc: Frederic Weisbecker <fweisbec@gmail.com>
Cc: Oleg Nesterov <oleg@redhat.com>
Cc: Andrew Morton <akpm@linux-foundation.org>
Cc: Paul Menage <paul@paulmenage.org>
---
 kernel/cgroup.c | 28 ++++++++--------------------
 1 file changed, 8 insertions(+), 20 deletions(-)

(limited to 'kernel')

diff --git a/kernel/cgroup.c b/kernel/cgroup.c
index 4936d8886b4f..82288088f6a5 100644
--- a/kernel/cgroup.c
+++ b/kernel/cgroup.c
@@ -1850,14 +1850,12 @@ static int cgroup_task_migrate(struct cgroup *cgrp, struct cgroup *oldcgrp,
 	struct css_set *newcg;
 
 	/*
-	 * get old css_set. We are synchronized through threadgroup_lock()
-	 * against PF_EXITING setting such that we can't race against
-	 * cgroup_exit() changing the css_set to init_css_set and dropping the
-	 * old one.
+	 * We are synchronized through threadgroup_lock() against PF_EXITING
+	 * setting such that we can't race against cgroup_exit() changing the
+	 * css_set to init_css_set and dropping the old one.
 	 */
 	WARN_ON_ONCE(tsk->flags & PF_EXITING);
 	oldcg = tsk->cgroups;
-	get_css_set(oldcg);
 
 	/* locate or allocate a new css_set for this task. */
 	if (guarantee) {
@@ -1872,12 +1870,9 @@ static int cgroup_task_migrate(struct cgroup *cgrp, struct cgroup *oldcgrp,
 		might_sleep();
 		/* find_css_set will give us newcg already referenced. */
 		newcg = find_css_set(oldcg, cgrp);
-		if (!newcg) {
-			put_css_set(oldcg);
+		if (!newcg)
 			return -ENOMEM;
-		}
 	}
-	put_css_set(oldcg);
 
 	task_lock(tsk);
 	rcu_assign_pointer(tsk->cgroups, newcg);
@@ -2186,18 +2181,11 @@ int cgroup_attach_proc(struct cgroup *cgrp, struct task_struct *leader)
 		 * init_css_set.
 		 */
 		oldcg = tc->task->cgroups;
-		get_css_set(oldcg);
-		/* see if the new one for us is already in the list? */
-		if (css_set_check_fetched(cgrp, tc->task, oldcg, &newcg_list)) {
-			/* was already there, nothing to do. */
-			put_css_set(oldcg);
-		} else {
-			/* we don't already have it. get new one. */
-			retval = css_set_prefetch(cgrp, oldcg, &newcg_list);
-			put_css_set(oldcg);
-			if (retval)
+
+		/* if we don't already have it in the list get a new one */
+		if (!css_set_check_fetched(cgrp, tc->task, oldcg, &newcg_list))
+			if (retval = css_set_prefetch(cgrp, oldcg, &newcg_list))
 				goto out_list_teardown;
-		}
 	}
 
 	/*
-- 
cgit v1.2.3-71-gd317


From b07ef7741122a83575499c11417e514877941e76 Mon Sep 17 00:00:00 2001
From: Mandeep Singh Baines <msb@chromium.org>
Date: Wed, 21 Dec 2011 20:18:36 -0800
Subject: cgroup: remove redundant get/put of task struct

threadgroup_lock() guarantees that the target threadgroup will
remain stable - no new task will be added, no new PF_EXITING
will be set and exec won't happen.

Changes in V2:
* https://lkml.org/lkml/2011/12/20/369 (Tejun Heo)
  * Undo incorrect removal of get/put from attach_task_by_pid()
* Author
  * Remove a comment which is made stale by this change

Signed-off-by: Mandeep Singh Baines <msb@chromium.org>
Acked-by: Li Zefan <lizf@cn.fujitsu.com>
Signed-off-by: Tejun Heo <tj@kernel.org>
Cc: containers@lists.linux-foundation.org
Cc: cgroups@vger.kernel.org
Cc: KAMEZAWA Hiroyuki <kamezawa.hiroyu@jp.fujitsu.com>
Cc: Frederic Weisbecker <fweisbec@gmail.com>
Cc: Oleg Nesterov <oleg@redhat.com>
Cc: Andrew Morton <akpm@linux-foundation.org>
Cc: Paul Menage <paul@paulmenage.org>
---
 kernel/cgroup.c | 11 ++---------
 1 file changed, 2 insertions(+), 9 deletions(-)

(limited to 'kernel')

diff --git a/kernel/cgroup.c b/kernel/cgroup.c
index 82288088f6a5..a85a7002ca33 100644
--- a/kernel/cgroup.c
+++ b/kernel/cgroup.c
@@ -2116,7 +2116,7 @@ int cgroup_attach_proc(struct cgroup *cgrp, struct task_struct *leader)
 		retval = -EAGAIN;
 		goto out_free_group_list;
 	}
-	/* take a reference on each task in the group to go in the array. */
+
 	tsk = leader;
 	i = nr_migrating_tasks = 0;
 	do {
@@ -2128,7 +2128,6 @@ int cgroup_attach_proc(struct cgroup *cgrp, struct task_struct *leader)
 
 		/* as per above, nr_threads may decrease, but not increase. */
 		BUG_ON(i >= group_size);
-		get_task_struct(tsk);
 		/*
 		 * saying GFP_ATOMIC has no effect here because we did prealloc
 		 * earlier, but it's good form to communicate our expectations.
@@ -2150,7 +2149,7 @@ int cgroup_attach_proc(struct cgroup *cgrp, struct task_struct *leader)
 	/* methods shouldn't be called if no task is actually migrating */
 	retval = 0;
 	if (!nr_migrating_tasks)
-		goto out_put_tasks;
+		goto out_free_group_list;
 
 	/*
 	 * step 1: check that we can legitimately attach to the cgroup.
@@ -2234,12 +2233,6 @@ out_cancel_attach:
 				ss->cancel_attach(ss, cgrp, &tset);
 		}
 	}
-out_put_tasks:
-	/* clean up the array of referenced threads in the group. */
-	for (i = 0; i < group_size; i++) {
-		tc = flex_array_get(group, i);
-		put_task_struct(tc->task);
-	}
 out_free_group_list:
 	flex_array_free(group);
 	return retval;
-- 
cgit v1.2.3-71-gd317


From 892a2b90ba15cb7dbee40979f23fdb492913abf8 Mon Sep 17 00:00:00 2001
From: Mandeep Singh Baines <msb@chromium.org>
Date: Wed, 21 Dec 2011 20:18:37 -0800
Subject: cgroup: only need to check oldcgrp==newgrp once

In cgroup_attach_proc it is now sufficient to only check that
oldcgrp==newcgrp once. Now that we are using threadgroup_lock()
during the migrations, oldcgrp will not change.

Signed-off-by: Mandeep Singh Baines <msb@chromium.org>
Acked-by: Li Zefan <lizf@cn.fujitsu.com>
Signed-off-by: Tejun Heo <tj@kernel.org>
Cc: containers@lists.linux-foundation.org
Cc: cgroups@vger.kernel.org
Cc: KAMEZAWA Hiroyuki <kamezawa.hiroyu@jp.fujitsu.com>
Cc: Frederic Weisbecker <fweisbec@gmail.com>
Cc: Oleg Nesterov <oleg@redhat.com>
Cc: Andrew Morton <akpm@linux-foundation.org>
Cc: Paul Menage <paul@paulmenage.org>
---
 kernel/cgroup.c | 22 ++++++----------------
 1 file changed, 6 insertions(+), 16 deletions(-)

(limited to 'kernel')

diff --git a/kernel/cgroup.c b/kernel/cgroup.c
index a85a7002ca33..1042b3c41314 100644
--- a/kernel/cgroup.c
+++ b/kernel/cgroup.c
@@ -2067,7 +2067,7 @@ static int css_set_prefetch(struct cgroup *cgrp, struct css_set *cg,
  */
 int cgroup_attach_proc(struct cgroup *cgrp, struct task_struct *leader)
 {
-	int retval, i, group_size, nr_migrating_tasks;
+	int retval, i, group_size;
 	struct cgroup_subsys *ss, *failed_ss = NULL;
 	/* guaranteed to be initialized later, but the compiler needs this */
 	struct css_set *oldcg;
@@ -2118,7 +2118,7 @@ int cgroup_attach_proc(struct cgroup *cgrp, struct task_struct *leader)
 	}
 
 	tsk = leader;
-	i = nr_migrating_tasks = 0;
+	i = 0;
 	do {
 		struct task_and_cgroup ent;
 
@@ -2134,11 +2134,12 @@ int cgroup_attach_proc(struct cgroup *cgrp, struct task_struct *leader)
 		 */
 		ent.task = tsk;
 		ent.cgrp = task_cgroup_from_root(tsk, root);
+		/* nothing to do if this task is already in the cgroup */
+		if (ent.cgrp == cgrp)
+			continue;
 		retval = flex_array_put(group, i, &ent, GFP_ATOMIC);
 		BUG_ON(retval != 0);
 		i++;
-		if (ent.cgrp != cgrp)
-			nr_migrating_tasks++;
 	} while_each_thread(leader, tsk);
 	/* remember the number of threads in the array for later. */
 	group_size = i;
@@ -2148,7 +2149,7 @@ int cgroup_attach_proc(struct cgroup *cgrp, struct task_struct *leader)
 
 	/* methods shouldn't be called if no task is actually migrating */
 	retval = 0;
-	if (!nr_migrating_tasks)
+	if (!group_size)
 		goto out_free_group_list;
 
 	/*
@@ -2171,14 +2172,6 @@ int cgroup_attach_proc(struct cgroup *cgrp, struct task_struct *leader)
 	INIT_LIST_HEAD(&newcg_list);
 	for (i = 0; i < group_size; i++) {
 		tc = flex_array_get(group, i);
-		/* nothing to do if this task is already in the cgroup */
-		if (tc->cgrp == cgrp)
-			continue;
-		/*
-		 * get old css_set pointer. threadgroup is locked so this is
-		 * safe against concurrent cgroup_exit() changing this to
-		 * init_css_set.
-		 */
 		oldcg = tc->task->cgroups;
 
 		/* if we don't already have it in the list get a new one */
@@ -2194,9 +2187,6 @@ int cgroup_attach_proc(struct cgroup *cgrp, struct task_struct *leader)
 	 */
 	for (i = 0; i < group_size; i++) {
 		tc = flex_array_get(group, i);
-		/* leave current thread as it is if it's already there */
-		if (tc->cgrp == cgrp)
-			continue;
 		retval = cgroup_task_migrate(cgrp, tc->cgrp, tc->task, true);
 		BUG_ON(retval);
 	}
-- 
cgit v1.2.3-71-gd317


From c87fb57346fc7653ace98769f148e0dcd88ac1ee Mon Sep 17 00:00:00 2001
From: Jamie Iles <jamie@jamieiles.com>
Date: Wed, 14 Dec 2011 23:43:16 +0100
Subject: ARM: 7235/1: irqdomain: export irq_domain_simple_ops for !CONFIG_OF

irqdomain support is used in interrupt controller drivers that may not
have device tree support but only need the basic HW->Linux irq
translation.  Rather than having each of these implement their own IRQ
domain, allow them to use the simple ops.

Acked-by: Thomas Gleixner <tglx@linutronix.de>
Acked-by: Rob Herring <robherring2@gmail.com>
Cc: Grant Likely <grant.likely@secretlab.ca>
Signed-off-by: Jamie Iles <jamie@jamieiles.com>
Signed-off-by: Russell King <rmk+kernel@arm.linux.org.uk>
---
 include/linux/irqdomain.h |  3 ++-
 kernel/irq/irqdomain.c    | 12 +++++++-----
 2 files changed, 9 insertions(+), 6 deletions(-)

(limited to 'kernel')

diff --git a/include/linux/irqdomain.h b/include/linux/irqdomain.h
index 99834e581b9e..bd4272b61a14 100644
--- a/include/linux/irqdomain.h
+++ b/include/linux/irqdomain.h
@@ -91,10 +91,11 @@ static inline unsigned int irq_domain_to_irq(struct irq_domain *d,
 
 extern void irq_domain_add(struct irq_domain *domain);
 extern void irq_domain_del(struct irq_domain *domain);
+
+extern struct irq_domain_ops irq_domain_simple_ops;
 #endif /* CONFIG_IRQ_DOMAIN */
 
 #if defined(CONFIG_IRQ_DOMAIN) && defined(CONFIG_OF_IRQ)
-extern struct irq_domain_ops irq_domain_simple_ops;
 extern void irq_domain_add_simple(struct device_node *controller, int irq_base);
 extern void irq_domain_generate_simple(const struct of_device_id *match,
 					u64 phys_base, unsigned int irq_start);
diff --git a/kernel/irq/irqdomain.c b/kernel/irq/irqdomain.c
index 200ce832c585..7ca523b249ef 100644
--- a/kernel/irq/irqdomain.c
+++ b/kernel/irq/irqdomain.c
@@ -143,11 +143,6 @@ int irq_domain_simple_dt_translate(struct irq_domain *d,
 	return 0;
 }
 
-struct irq_domain_ops irq_domain_simple_ops = {
-	.dt_translate = irq_domain_simple_dt_translate,
-};
-EXPORT_SYMBOL_GPL(irq_domain_simple_ops);
-
 /**
  * irq_domain_create_simple() - Set up a 'simple' translation range
  */
@@ -182,3 +177,10 @@ void irq_domain_generate_simple(const struct of_device_id *match,
 }
 EXPORT_SYMBOL_GPL(irq_domain_generate_simple);
 #endif /* CONFIG_OF_IRQ */
+
+struct irq_domain_ops irq_domain_simple_ops = {
+#ifdef CONFIG_OF_IRQ
+	.dt_translate = irq_domain_simple_dt_translate,
+#endif /* CONFIG_OF_IRQ */
+};
+EXPORT_SYMBOL_GPL(irq_domain_simple_ops);
-- 
cgit v1.2.3-71-gd317


From a65cf5181aa608addcb2873c8ed90413a9f539cb Mon Sep 17 00:00:00 2001
From: Xiao Guangrong <xiaoguangrong@linux.vnet.ibm.com>
Date: Mon, 28 Nov 2011 20:39:59 +0800
Subject: jump-label: export jump_label_inc/jump_label_dec

Export these two symbols, they will be used by KVM mmu audit

Signed-off-by: Xiao Guangrong <xiaoguangrong@linux.vnet.ibm.com>
Signed-off-by: Avi Kivity <avi@redhat.com>
---
 kernel/jump_label.c | 2 ++
 1 file changed, 2 insertions(+)

(limited to 'kernel')

diff --git a/kernel/jump_label.c b/kernel/jump_label.c
index 66ff7109f697..2af9027106a8 100644
--- a/kernel/jump_label.c
+++ b/kernel/jump_label.c
@@ -71,6 +71,7 @@ void jump_label_inc(struct jump_label_key *key)
 	atomic_inc(&key->enabled);
 	jump_label_unlock();
 }
+EXPORT_SYMBOL_GPL(jump_label_inc);
 
 void jump_label_dec(struct jump_label_key *key)
 {
@@ -80,6 +81,7 @@ void jump_label_dec(struct jump_label_key *key)
 	jump_label_update(key, JUMP_LABEL_DISABLE);
 	jump_label_unlock();
 }
+EXPORT_SYMBOL_GPL(jump_label_dec);
 
 static int addr_conflict(struct jump_entry *entry, void *start, void *end)
 {
-- 
cgit v1.2.3-71-gd317


From 1c6c3fad81787e8cb4c85ddfd573b0d8442fe630 Mon Sep 17 00:00:00 2001
From: "Kirill A. Shutemov" <kirill@shutemov.name>
Date: Tue, 27 Dec 2011 07:46:25 +0200
Subject: cgroup: mark cgroup_rmdir_waitq and cgroup_attach_proc() as static

Signed-off-by: Kirill A. Shutemov <kirill@shutemov.name>
Acked-by: Li Zefan <lizf@cn.fujitsu.com>
Signed-off-by: Tejun Heo <tj@kernel.org>
---
 kernel/cgroup.c | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

(limited to 'kernel')

diff --git a/kernel/cgroup.c b/kernel/cgroup.c
index 1042b3c41314..421557fcbfe4 100644
--- a/kernel/cgroup.c
+++ b/kernel/cgroup.c
@@ -938,7 +938,7 @@ static void cgroup_d_remove_dir(struct dentry *dentry)
  *
  * CGRP_WAIT_ON_RMDIR flag is set under cgroup's inode->i_mutex;
  */
-DECLARE_WAIT_QUEUE_HEAD(cgroup_rmdir_waitq);
+static DECLARE_WAIT_QUEUE_HEAD(cgroup_rmdir_waitq);
 
 static void cgroup_wakeup_rmdir_waiter(struct cgroup *cgrp)
 {
@@ -2065,7 +2065,7 @@ static int css_set_prefetch(struct cgroup *cgrp, struct css_set *cg,
  * Call holding cgroup_mutex and the group_rwsem of the leader. Will take
  * task_lock of each thread in leader's threadgroup individually in turn.
  */
-int cgroup_attach_proc(struct cgroup *cgrp, struct task_struct *leader)
+static int cgroup_attach_proc(struct cgroup *cgrp, struct task_struct *leader)
 {
 	int retval, i, group_size;
 	struct cgroup_subsys *ss, *failed_ss = NULL;
-- 
cgit v1.2.3-71-gd317


From c6ca57500c23d57a4ccec9874b6a3c99c297e1b5 Mon Sep 17 00:00:00 2001
From: "Kirill A. Shutemov" <kirill@shutemov.name>
Date: Tue, 27 Dec 2011 07:46:26 +0200
Subject: cgroup: add sparse annotation to cgroup_iter_start() and
 cgroup_iter_end()

Signed-off-by: Kirill A. Shutemov <kirill@shutemov.name>
Acked-by: Li Zefan <lizf@cn.fujitsu.com>
Signed-off-by: Tejun Heo <tj@kernel.org>
---
 kernel/cgroup.c | 2 ++
 1 file changed, 2 insertions(+)

(limited to 'kernel')

diff --git a/kernel/cgroup.c b/kernel/cgroup.c
index 421557fcbfe4..c6bd67b3fcf6 100644
--- a/kernel/cgroup.c
+++ b/kernel/cgroup.c
@@ -2825,6 +2825,7 @@ static void cgroup_enable_task_cg_lists(void)
 }
 
 void cgroup_iter_start(struct cgroup *cgrp, struct cgroup_iter *it)
+	__acquires(css_set_lock)
 {
 	/*
 	 * The first time anyone tries to iterate across a cgroup,
@@ -2864,6 +2865,7 @@ struct task_struct *cgroup_iter_next(struct cgroup *cgrp,
 }
 
 void cgroup_iter_end(struct cgroup *cgrp, struct cgroup_iter *it)
+	__releases(css_set_lock)
 {
 	read_unlock(&css_set_lock);
 }
-- 
cgit v1.2.3-71-gd317


From 7e3aa30ac8c904a706518b725c451bb486daaae9 Mon Sep 17 00:00:00 2001
From: Frederic Weisbecker <fweisbec@gmail.com>
Date: Fri, 23 Dec 2011 04:25:23 +0100
Subject: cgroup: Remove task_lock() from cgroup_post_fork()

cgroup_post_fork() is protected between threadgroup_change_begin()
and threadgroup_change_end() against concurrent changes of the
child's css_set in cgroup_task_migrate(). Also the child can't
exit and call cgroup_exit() at this stage, this means it's css_set
can't be changed with init_css_set concurrently.

For these reasons, we don't need to hold task_lock() on the child
because it's css_set can only remain stable in this place.

Let's remove the lock there.

v2: Update comment to explain that we are safe against
cgroup_exit()

Signed-off-by: Frederic Weisbecker <fweisbec@gmail.com>
Acked-by: Li Zefan <lizf@cn.fujitsu.com>
Signed-off-by: Tejun Heo <tj@kernel.org>
Cc: Containers <containers@lists.linux-foundation.org>
Cc: Cgroups <cgroups@vger.kernel.org>
Cc: KAMEZAWA Hiroyuki <kamezawa.hiroyu@jp.fujitsu.com>
Cc: Oleg Nesterov <oleg@redhat.com>
Cc: Andrew Morton <akpm@linux-foundation.org>
Cc: Paul Menage <paul@paulmenage.org>
Cc: Mandeep Singh Baines <msb@chromium.org>
---
 kernel/cgroup.c | 15 ++++++++++++---
 1 file changed, 12 insertions(+), 3 deletions(-)

(limited to 'kernel')

diff --git a/kernel/cgroup.c b/kernel/cgroup.c
index c6bd67b3fcf6..548d8d4e86d0 100644
--- a/kernel/cgroup.c
+++ b/kernel/cgroup.c
@@ -4595,10 +4595,19 @@ void cgroup_post_fork(struct task_struct *child)
 {
 	if (use_task_css_set_links) {
 		write_lock(&css_set_lock);
-		task_lock(child);
-		if (list_empty(&child->cg_list))
+		if (list_empty(&child->cg_list)) {
+			/*
+			 * It's safe to use child->cgroups without task_lock()
+			 * here because we are protected through
+			 * threadgroup_change_begin() against concurrent
+			 * css_set change in cgroup_task_migrate(). Also
+			 * the task can't exit at that point until
+			 * wake_up_new_task() is called, so we are protected
+			 * against cgroup_exit() setting child->cgroup to
+			 * init_css_set.
+			 */
 			list_add(&child->cg_list, &child->cgroups->tasks);
-		task_unlock(child);
+		}
 		write_unlock(&css_set_lock);
 	}
 }
-- 
cgit v1.2.3-71-gd317


From 93797d87d63d36404907640e4e20bb976bff4744 Mon Sep 17 00:00:00 2001
From: Rob Herring <rob.herring@calxeda.com>
Date: Mon, 12 Dec 2011 09:59:14 -0600
Subject: irq: check domain hwirq range for DT translate

A DT node may have more than 1 domain associated with it, so make sure
the hwirq number is within range when doing DT translation.

Signed-off-by: Rob Herring <rob.herring@calxeda.com>
Acked-by: Grant Likely <grant.likely@secretlab.ca>
Acked-by: Shawn Guo <shawn.guo@linaro.org>
Cc: Thomas Gleixner <tglx@linutronix.de>
---
 kernel/irq/irqdomain.c | 3 +++
 1 file changed, 3 insertions(+)

(limited to 'kernel')

diff --git a/kernel/irq/irqdomain.c b/kernel/irq/irqdomain.c
index 200ce832c585..8bd7479b8390 100644
--- a/kernel/irq/irqdomain.c
+++ b/kernel/irq/irqdomain.c
@@ -135,6 +135,9 @@ int irq_domain_simple_dt_translate(struct irq_domain *d,
 		return -EINVAL;
 	if (intsize < 1)
 		return -EINVAL;
+	if (d->nr_irq && ((intspec[0] < d->hwirq_base) ||
+	    (intspec[0] >= d->hwirq_base + d->nr_irq)))
+		return -EINVAL;
 
 	*out_hwirq = intspec[0];
 	*out_type = IRQ_TYPE_NONE;
-- 
cgit v1.2.3-71-gd317


From d36b691077dc59c74efec0d54ed21b86f7a2a21a Mon Sep 17 00:00:00 2001
From: Al Viro <viro@zeniv.linux.org.uk>
Date: Thu, 29 Dec 2011 17:09:01 -0500
Subject: misc latin1 to utf8 conversions

Signed-off-by: Al Viro <viro@zeniv.linux.org.uk>
Signed-off-by: Jiri Kosina <jkosina@suse.cz>
---
 Documentation/ABI/testing/sysfs-driver-hid-logitech-lg4ff | 2 +-
 Documentation/vgaarbiter.txt                              | 2 +-
 drivers/gpu/drm/i915/intel_sdvo_regs.h                    | 2 +-
 drivers/hid/hid-pl.c                                      | 4 ++--
 drivers/hid/hid-twinhan.c                                 | 2 +-
 drivers/i2c/busses/i2c-xiic.c                             | 2 +-
 drivers/isdn/hisax/enternow_pci.c                         | 2 +-
 drivers/media/rc/winbond-cir.c                            | 2 +-
 drivers/misc/ibmasm/command.c                             | 2 +-
 drivers/misc/ibmasm/dot_command.c                         | 2 +-
 drivers/misc/ibmasm/dot_command.h                         | 2 +-
 drivers/misc/ibmasm/event.c                               | 2 +-
 drivers/misc/ibmasm/heartbeat.c                           | 2 +-
 drivers/misc/ibmasm/i2o.h                                 | 2 +-
 drivers/misc/ibmasm/ibmasm.h                              | 2 +-
 drivers/misc/ibmasm/ibmasmfs.c                            | 2 +-
 drivers/misc/ibmasm/lowlevel.c                            | 2 +-
 drivers/misc/ibmasm/lowlevel.h                            | 2 +-
 drivers/misc/ibmasm/module.c                              | 2 +-
 drivers/misc/ibmasm/r_heartbeat.c                         | 2 +-
 drivers/misc/ibmasm/remote.h                              | 2 +-
 drivers/misc/ibmasm/uart.c                                | 2 +-
 drivers/mmc/host/omap.c                                   | 4 ++--
 drivers/scsi/jazz_esp.c                                   | 2 +-
 drivers/scsi/sni_53c710.c                                 | 2 +-
 drivers/staging/bcm/target_params.h                       | 4 ++--
 drivers/staging/keucr/smilmain.c                          | 4 ++--
 drivers/staging/wlags49_h2/debug.h                        | 2 +-
 drivers/staging/wlags49_h2/dhfcfg.h                       | 2 +-
 drivers/staging/wlags49_h2/hcf.c                          | 6 +++---
 drivers/staging/wlags49_h2/hcf.h                          | 6 +++---
 drivers/staging/wlags49_h2/hcfcfg.h                       | 6 +++---
 drivers/staging/wlags49_h2/hcfdef.h                       | 6 +++---
 drivers/staging/wlags49_h2/mdd.h                          | 6 +++---
 drivers/staging/wlags49_h2/mmd.c                          | 2 +-
 drivers/staging/wlags49_h2/mmd.h                          | 2 +-
 drivers/staging/wlags49_h2/wl_cs.h                        | 4 ++--
 drivers/staging/wlags49_h2/wl_enc.c                       | 4 ++--
 drivers/staging/wlags49_h2/wl_enc.h                       | 4 ++--
 drivers/staging/wlags49_h2/wl_if.h                        | 4 ++--
 drivers/staging/wlags49_h2/wl_internal.h                  | 4 ++--
 drivers/staging/wlags49_h2/wl_main.c                      | 4 ++--
 drivers/staging/wlags49_h2/wl_main.h                      | 4 ++--
 drivers/staging/wlags49_h2/wl_netdev.c                    | 4 ++--
 drivers/staging/wlags49_h2/wl_netdev.h                    | 4 ++--
 drivers/staging/wlags49_h2/wl_pci.c                       | 4 ++--
 drivers/staging/wlags49_h2/wl_pci.h                       | 4 ++--
 drivers/staging/wlags49_h2/wl_priv.c                      | 4 ++--
 drivers/staging/wlags49_h2/wl_priv.h                      | 4 ++--
 drivers/staging/wlags49_h2/wl_profile.c                   | 4 ++--
 drivers/staging/wlags49_h2/wl_profile.h                   | 4 ++--
 drivers/staging/wlags49_h2/wl_util.c                      | 4 ++--
 drivers/staging/wlags49_h2/wl_util.h                      | 4 ++--
 drivers/staging/wlags49_h2/wl_version.h                   | 4 ++--
 drivers/staging/wlags49_h2/wl_wext.c                      | 4 ++--
 drivers/staging/wlags49_h2/wl_wext.h                      | 4 ++--
 drivers/usb/gadget/s3c2410_udc.c                          | 4 ++--
 drivers/usb/gadget/s3c2410_udc.h                          | 2 +-
 drivers/usb/serial/ftdi_sio_ids.h                         | 6 +++---
 drivers/usb/storage/isd200.c                              | 2 +-
 drivers/usb/storage/unusual_devs.h                        | 2 +-
 drivers/video/omap/rfbi.c                                 | 2 +-
 drivers/video/omap/sossi.c                                | 2 +-
 drivers/watchdog/w83627hf_wdt.c                           | 4 ++--
 drivers/zorro/zorro.ids                                   | 2 +-
 kernel/events/core.c                                      | 2 +-
 kernel/events/ring_buffer.c                               | 2 +-
 tools/testing/ktest/sample.conf                           | 2 +-
 68 files changed, 106 insertions(+), 106 deletions(-)

(limited to 'kernel')

diff --git a/Documentation/ABI/testing/sysfs-driver-hid-logitech-lg4ff b/Documentation/ABI/testing/sysfs-driver-hid-logitech-lg4ff
index 9aec8ef228b0..167d9032b970 100644
--- a/Documentation/ABI/testing/sysfs-driver-hid-logitech-lg4ff
+++ b/Documentation/ABI/testing/sysfs-driver-hid-logitech-lg4ff
@@ -1,7 +1,7 @@
 What:		/sys/module/hid_logitech/drivers/hid:logitech/<dev>/range.
 Date:		July 2011
 KernelVersion:	3.2
-Contact:	Michal Mal� <madcatxster@gmail.com>
+Contact:	Michal Malý <madcatxster@gmail.com>
 Description:	Display minimum, maximum and current range of the steering
 		wheel. Writing a value within min and max boundaries sets the
 		range of the wheel.
diff --git a/Documentation/vgaarbiter.txt b/Documentation/vgaarbiter.txt
index b7d401e0eae9..014423e2824c 100644
--- a/Documentation/vgaarbiter.txt
+++ b/Documentation/vgaarbiter.txt
@@ -177,7 +177,7 @@ II. Credits
 
 Benjamin Herrenschmidt (IBM?) started this work when he discussed such design
 with the Xorg community in 2005 [1, 2]. In the end of 2007, Paulo Zanoni and
-Tiago Vignatti (both of C3SL/Federal University of Paran�) proceeded his work
+Tiago Vignatti (both of C3SL/Federal University of Paraná) proceeded his work
 enhancing the kernel code to adapt as a kernel module and also did the
 implementation of the user space side [3]. Now (2009) Tiago Vignatti and Dave
 Airlie finally put this work in shape and queued to Jesse Barnes' PCI tree.
diff --git a/drivers/gpu/drm/i915/intel_sdvo_regs.h b/drivers/gpu/drm/i915/intel_sdvo_regs.h
index 4aa6f343e49a..6b7b22f4d63e 100644
--- a/drivers/gpu/drm/i915/intel_sdvo_regs.h
+++ b/drivers/gpu/drm/i915/intel_sdvo_regs.h
@@ -1,5 +1,5 @@
 /*
- * Copyright � 2006-2007 Intel Corporation
+ * Copyright © 2006-2007 Intel Corporation
  *
  * Permission is hereby granted, free of charge, to any person obtaining a
  * copy of this software and associated documentation files (the "Software"),
diff --git a/drivers/hid/hid-pl.c b/drivers/hid/hid-pl.c
index 070f93a5c11b..47ed74c46b6b 100644
--- a/drivers/hid/hid-pl.c
+++ b/drivers/hid/hid-pl.c
@@ -9,10 +9,10 @@
  *   - contains two reports, one for each port (HID_QUIRK_MULTI_INPUT)
  *
  *  0e8f:0003 "GreenAsia Inc.    USB Joystick     "
- *   - tested with K�nig Gaming gamepad
+ *   - tested with König Gaming gamepad
  *
  *  0e8f:0003 "GASIA USB Gamepad"
- *   - another version of the K�nig gamepad
+ *   - another version of the König gamepad
  *
  *  Copyright (c) 2007, 2009 Anssi Hannula <anssi.hannula@gmail.com>
  */
diff --git a/drivers/hid/hid-twinhan.c b/drivers/hid/hid-twinhan.c
index c40afc57fc8f..f23456b1fd4b 100644
--- a/drivers/hid/hid-twinhan.c
+++ b/drivers/hid/hid-twinhan.c
@@ -3,7 +3,7 @@
  *
  * Based on hid-gyration.c
  *
- * Copyright (c) 2009 Bruno Pr�mont <bonbons@linux-vserver.org>
+ * Copyright (c) 2009 Bruno Prémont <bonbons@linux-vserver.org>
  */
 
 /*
diff --git a/drivers/i2c/busses/i2c-xiic.c b/drivers/i2c/busses/i2c-xiic.c
index 4bb68f35caf2..ac083a28ae08 100644
--- a/drivers/i2c/busses/i2c-xiic.c
+++ b/drivers/i2c/busses/i2c-xiic.c
@@ -426,7 +426,7 @@ static void xiic_process(struct xiic_i2c *i2c)
 			xiic_wakeup(i2c, STATE_ERROR);
 
 	} else if (pend & (XIIC_INTR_TX_EMPTY_MASK | XIIC_INTR_TX_HALF_MASK)) {
-		/* Transmit register/FIFO is empty or � empty */
+		/* Transmit register/FIFO is empty or ½ empty */
 
 		clr = pend &
 			(XIIC_INTR_TX_EMPTY_MASK | XIIC_INTR_TX_HALF_MASK);
diff --git a/drivers/isdn/hisax/enternow_pci.c b/drivers/isdn/hisax/enternow_pci.c
index 26264abf1f58..f55d29d60826 100644
--- a/drivers/isdn/hisax/enternow_pci.c
+++ b/drivers/isdn/hisax/enternow_pci.c
@@ -333,7 +333,7 @@ static void __devinit en_cs_init(struct IsdnCard *card,
 	cs->hw.njet.isac = cs->hw.njet.base + 0xC0; // Fenster zum AMD
 
 	/* Reset an */
-	cs->hw.njet.ctrl_reg = 0x07;  // ge�ndert von 0xff
+	cs->hw.njet.ctrl_reg = 0x07;  // geändert von 0xff
 	outb(cs->hw.njet.ctrl_reg, cs->hw.njet.base + NETJET_CTRL);
 	/* 20 ms Pause */
 	mdelay(20);
diff --git a/drivers/media/rc/winbond-cir.c b/drivers/media/rc/winbond-cir.c
index 13f54b51194a..e7f7a57bf684 100644
--- a/drivers/media/rc/winbond-cir.c
+++ b/drivers/media/rc/winbond-cir.c
@@ -1176,6 +1176,6 @@ wbcir_exit(void)
 module_init(wbcir_init);
 module_exit(wbcir_exit);
 
-MODULE_AUTHOR("David H�rdeman <david@hardeman.nu>");
+MODULE_AUTHOR("David Härdeman <david@hardeman.nu>");
 MODULE_DESCRIPTION("Winbond SuperI/O Consumer IR Driver");
 MODULE_LICENSE("GPL");
diff --git a/drivers/misc/ibmasm/command.c b/drivers/misc/ibmasm/command.c
index 5c766b4fb238..7d56f45dee19 100644
--- a/drivers/misc/ibmasm/command.c
+++ b/drivers/misc/ibmasm/command.c
@@ -18,7 +18,7 @@
  *
  * Copyright (C) IBM Corporation, 2004
  *
- * Author: Max Asb�ck <amax@us.ibm.com>
+ * Author: Max Asböck <amax@us.ibm.com>
  *
  */
 
diff --git a/drivers/misc/ibmasm/dot_command.c b/drivers/misc/ibmasm/dot_command.c
index 3dd2dfb8da17..d7b2ca358b23 100644
--- a/drivers/misc/ibmasm/dot_command.c
+++ b/drivers/misc/ibmasm/dot_command.c
@@ -17,7 +17,7 @@
  *
  * Copyright (C) IBM Corporation, 2004
  *
- * Author: Max Asb�ck <amax@us.ibm.com>
+ * Author: Max Asböck <amax@us.ibm.com>
  *
  */
 
diff --git a/drivers/misc/ibmasm/dot_command.h b/drivers/misc/ibmasm/dot_command.h
index 6cbba1afef35..fc9fc9d4e087 100644
--- a/drivers/misc/ibmasm/dot_command.h
+++ b/drivers/misc/ibmasm/dot_command.h
@@ -17,7 +17,7 @@
  *
  * Copyright (C) IBM Corporation, 2004
  *
- * Author: Max Asb�ck <amax@us.ibm.com>
+ * Author: Max Asböck <amax@us.ibm.com>
  *
  */
 
diff --git a/drivers/misc/ibmasm/event.c b/drivers/misc/ibmasm/event.c
index 76bfda1ffaa9..8e540f4e9d52 100644
--- a/drivers/misc/ibmasm/event.c
+++ b/drivers/misc/ibmasm/event.c
@@ -18,7 +18,7 @@
  *
  * Copyright (C) IBM Corporation, 2004
  *
- * Author: Max Asb�ck <amax@us.ibm.com>
+ * Author: Max Asböck <amax@us.ibm.com>
  *
  */
 
diff --git a/drivers/misc/ibmasm/heartbeat.c b/drivers/misc/ibmasm/heartbeat.c
index 1bc4306572a4..90746378f9b7 100644
--- a/drivers/misc/ibmasm/heartbeat.c
+++ b/drivers/misc/ibmasm/heartbeat.c
@@ -18,7 +18,7 @@
  *
  * Copyright (C) IBM Corporation, 2004
  *
- * Author: Max Asb�ck <amax@us.ibm.com>
+ * Author: Max Asböck <amax@us.ibm.com>
  *
  */
 
diff --git a/drivers/misc/ibmasm/i2o.h b/drivers/misc/ibmasm/i2o.h
index bf2c738d2b72..2e9566dab2b1 100644
--- a/drivers/misc/ibmasm/i2o.h
+++ b/drivers/misc/ibmasm/i2o.h
@@ -17,7 +17,7 @@
  *
  * Copyright (C) IBM Corporation, 2004
  *
- * Author: Max Asb�ck <amax@us.ibm.com>
+ * Author: Max Asböck <amax@us.ibm.com>
  *
  */
 
diff --git a/drivers/misc/ibmasm/ibmasm.h b/drivers/misc/ibmasm/ibmasm.h
index 4d8a4e248b34..9b083448814d 100644
--- a/drivers/misc/ibmasm/ibmasm.h
+++ b/drivers/misc/ibmasm/ibmasm.h
@@ -18,7 +18,7 @@
  *
  * Copyright (C) IBM Corporation, 2004
  *
- * Author: Max Asb�ck <amax@us.ibm.com>
+ * Author: Max Asböck <amax@us.ibm.com>
  *
  */
 
diff --git a/drivers/misc/ibmasm/ibmasmfs.c b/drivers/misc/ibmasm/ibmasmfs.c
index 89947723a27d..35361753b487 100644
--- a/drivers/misc/ibmasm/ibmasmfs.c
+++ b/drivers/misc/ibmasm/ibmasmfs.c
@@ -17,7 +17,7 @@
  *
  * Copyright (C) IBM Corporation, 2004
  *
- * Author: Max Asb�ck <amax@us.ibm.com>
+ * Author: Max Asböck <amax@us.ibm.com>
  *
  */
 
diff --git a/drivers/misc/ibmasm/lowlevel.c b/drivers/misc/ibmasm/lowlevel.c
index 4b2398e27fd5..5319ea261c05 100644
--- a/drivers/misc/ibmasm/lowlevel.c
+++ b/drivers/misc/ibmasm/lowlevel.c
@@ -17,7 +17,7 @@
  *
  * Copyright (C) IBM Corporation, 2004
  *
- * Author: Max Asb�ck <amax@us.ibm.com>
+ * Author: Max Asböck <amax@us.ibm.com>
  *
  */
 
diff --git a/drivers/misc/ibmasm/lowlevel.h b/drivers/misc/ibmasm/lowlevel.h
index 766766523a60..e97848f51b3f 100644
--- a/drivers/misc/ibmasm/lowlevel.h
+++ b/drivers/misc/ibmasm/lowlevel.h
@@ -17,7 +17,7 @@
  *
  * Copyright (C) IBM Corporation, 2004
  *
- * Author: Max Asb�ck <amax@us.ibm.com>
+ * Author: Max Asböck <amax@us.ibm.com>
  *
  */
 
diff --git a/drivers/misc/ibmasm/module.c b/drivers/misc/ibmasm/module.c
index a234d965243b..1ccedb71e728 100644
--- a/drivers/misc/ibmasm/module.c
+++ b/drivers/misc/ibmasm/module.c
@@ -18,7 +18,7 @@
  *
  * Copyright (C) IBM Corporation, 2004
  *
- * Author: Max Asb�ck <amax@us.ibm.com>
+ * Author: Max Asböck <amax@us.ibm.com>
  *
  * This driver is based on code originally written by Pete Reynolds
  * and others.
diff --git a/drivers/misc/ibmasm/r_heartbeat.c b/drivers/misc/ibmasm/r_heartbeat.c
index 2de487ac788c..232034f5da48 100644
--- a/drivers/misc/ibmasm/r_heartbeat.c
+++ b/drivers/misc/ibmasm/r_heartbeat.c
@@ -16,7 +16,7 @@
  *
  * Copyright (C) IBM Corporation, 2004
  *
- * Author: Max Asb�ck <amax@us.ibm.com>
+ * Author: Max Asböck <amax@us.ibm.com>
  *
  */
 
diff --git a/drivers/misc/ibmasm/remote.h b/drivers/misc/ibmasm/remote.h
index 00dbf1d4373a..a7729ef76acb 100644
--- a/drivers/misc/ibmasm/remote.h
+++ b/drivers/misc/ibmasm/remote.h
@@ -18,7 +18,7 @@
  *
  * Copyright (C) IBM Corporation, 2004
  *
- * Author: Max Asb�ck <amax@us.ibm.com>
+ * Author: Max Asböck <amax@us.ibm.com>
  *
  * Originally written by Pete Reynolds
  */
diff --git a/drivers/misc/ibmasm/uart.c b/drivers/misc/ibmasm/uart.c
index 93baa350d698..1dcb9ae1905a 100644
--- a/drivers/misc/ibmasm/uart.c
+++ b/drivers/misc/ibmasm/uart.c
@@ -18,7 +18,7 @@
  *
  * Copyright (C) IBM Corporation, 2004
  *
- * Author: Max Asb�ck <amax@us.ibm.com>
+ * Author: Max Asböck <amax@us.ibm.com>
  *
  */
 
diff --git a/drivers/mmc/host/omap.c b/drivers/mmc/host/omap.c
index 2dba999caf2c..887c0e598cf3 100644
--- a/drivers/mmc/host/omap.c
+++ b/drivers/mmc/host/omap.c
@@ -2,7 +2,7 @@
  *  linux/drivers/mmc/host/omap.c
  *
  *  Copyright (C) 2004 Nokia Corporation
- *  Written by Tuukka Tikkanen and Juha Yrj�l�<juha.yrjola@nokia.com>
+ *  Written by Tuukka Tikkanen and Juha Yrjölä<juha.yrjola@nokia.com>
  *  Misc hacks here and there by Tony Lindgren <tony@atomide.com>
  *  Other hacks (DMA, SD, etc) by David Brownell
  *
@@ -1634,4 +1634,4 @@ module_exit(mmc_omap_exit);
 MODULE_DESCRIPTION("OMAP Multimedia Card driver");
 MODULE_LICENSE("GPL");
 MODULE_ALIAS("platform:" DRIVER_NAME);
-MODULE_AUTHOR("Juha Yrj�l�");
+MODULE_AUTHOR("Juha Yrjölä");
diff --git a/drivers/scsi/jazz_esp.c b/drivers/scsi/jazz_esp.c
index 08e26d4e3731..27cfb0cb186c 100644
--- a/drivers/scsi/jazz_esp.c
+++ b/drivers/scsi/jazz_esp.c
@@ -1,6 +1,6 @@
 /* jazz_esp.c: ESP front-end for MIPS JAZZ systems.
  *
- * Copyright (C) 2007 Thomas Bogend�rfer (tsbogend@alpha.frankende)
+ * Copyright (C) 2007 Thomas Bogendörfer (tsbogend@alpha.frankende)
  */
 
 #include <linux/kernel.h>
diff --git a/drivers/scsi/sni_53c710.c b/drivers/scsi/sni_53c710.c
index 9acc2b2a3601..cf51432f8e72 100644
--- a/drivers/scsi/sni_53c710.c
+++ b/drivers/scsi/sni_53c710.c
@@ -51,7 +51,7 @@
 
 #include "53c700.h"
 
-MODULE_AUTHOR("Thomas Bogend�rfer");
+MODULE_AUTHOR("Thomas Bogendörfer");
 MODULE_DESCRIPTION("SNI RM 53c710 SCSI Driver");
 MODULE_LICENSE("GPL");
 MODULE_ALIAS("platform:snirm_53c710");
diff --git a/drivers/staging/bcm/target_params.h b/drivers/staging/bcm/target_params.h
index 2d8b8a367b38..14876388b879 100644
--- a/drivers/staging/bcm/target_params.h
+++ b/drivers/staging/bcm/target_params.h
@@ -72,8 +72,8 @@ typedef struct _TARGET_PARAMS
 	// removed SHUT down related 'unused' params from here to sync 4.x and 5.x CFG files..
 
     //BAMC Related Parameters
-    //Bit 0-15 Band AMC signaling configuration: Bit 1 = 1 � Enable Band AMC signaling.
-    //bit 16-31 Band AMC Data configuration: Bit 16 = 1 � Band AMC 2x3 support.
+    //Bit 0-15 Band AMC signaling configuration: Bit 1 = 1  Enable Band AMC signaling.
+    //bit 16-31 Band AMC Data configuration: Bit 16 = 1  Band AMC 2x3 support.
 	B_UINT32 m_u32BandAMCEnable;
 
 } stTargetParams,TARGET_PARAMS,*PTARGET_PARAMS, STARGETPARAMS, *PSTARGETPARAMS;
diff --git a/drivers/staging/keucr/smilmain.c b/drivers/staging/keucr/smilmain.c
index 31f7813cab0d..cc49038e55d6 100644
--- a/drivers/staging/keucr/smilmain.c
+++ b/drivers/staging/keucr/smilmain.c
@@ -148,7 +148,7 @@ int Media_D_ReadSector(struct us_data *us, DWORD start,WORD count,BYTE *buf)
 {
 	WORD len, bn;
 
-	//if (Check_D_MediaPower())        ; �b 6250 don't care
+	//if (Check_D_MediaPower())        ; ¦b 6250 don't care
 	//    return(ErrCode);
 	//if (Check_D_MediaFmt(fdoExt))    ;
 	//    return(ErrCode);
@@ -594,7 +594,7 @@ int Media_D_OneSectWriteFlush(PFDO_DEVICE_EXTENSION fdoExt)
 //    if (Check_D_CardStsChg())
 //        MediaChange = ERROR;
 //    //usleep(56*1024);
-//    if ((!Check_D_CntPower())&&(!MediaChange))  // �� power & Media �S�Q change, �h return success
+//    if ((!Check_D_CntPower())&&(!MediaChange))  // ¦³ power & Media ¨S³Q change, «h return success
 //        return(SMSUCCESS);
 //    //usleep(56*1024);
 //
diff --git a/drivers/staging/wlags49_h2/debug.h b/drivers/staging/wlags49_h2/debug.h
index 8d5dddf08055..811698f1070c 100644
--- a/drivers/staging/wlags49_h2/debug.h
+++ b/drivers/staging/wlags49_h2/debug.h
@@ -43,7 +43,7 @@
  *
  * Disclaimer
  *
- * THIS SOFTWARE IS PROVIDED �AS IS� AND ANY EXPRESS OR IMPLIED WARRANTIES,
+ * THIS SOFTWARE IS PROVIDED AS IS AND ANY EXPRESS OR IMPLIED WARRANTIES,
  * INCLUDING, BUT NOT LIMITED TO, INFRINGEMENT AND THE IMPLIED WARRANTIES OF
  * MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED.  ANY
  * USE, MODIFICATION OR DISTRIBUTION OF THIS SOFTWARE IS SOLELY AT THE USERS OWN
diff --git a/drivers/staging/wlags49_h2/dhfcfg.h b/drivers/staging/wlags49_h2/dhfcfg.h
index 75c279f268ae..147f4c83c00c 100644
--- a/drivers/staging/wlags49_h2/dhfcfg.h
+++ b/drivers/staging/wlags49_h2/dhfcfg.h
@@ -43,7 +43,7 @@
  *
  * Disclaimer
  *
- * THIS SOFTWARE IS PROVIDED �AS IS� AND ANY EXPRESS OR IMPLIED WARRANTIES,
+ * THIS SOFTWARE IS PROVIDED AS IS AND ANY EXPRESS OR IMPLIED WARRANTIES,
  * INCLUDING, BUT NOT LIMITED TO, INFRINGEMENT AND THE IMPLIED WARRANTIES OF
  * MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED.  ANY
  * USE, MODIFICATION OR DISTRIBUTION OF THIS SOFTWARE IS SOLELY AT THE USERS OWN
diff --git a/drivers/staging/wlags49_h2/hcf.c b/drivers/staging/wlags49_h2/hcf.c
index 7dc176a95aab..b008773323b3 100644
--- a/drivers/staging/wlags49_h2/hcf.c
+++ b/drivers/staging/wlags49_h2/hcf.c
@@ -32,9 +32,9 @@
  * software indicates your acceptance of these terms and conditions.  If you do
  * not agree with these terms and conditions, do not use the software.
  *
- * COPYRIGHT � 1994 - 1995   by AT&T.                All Rights Reserved
- * COPYRIGHT � 1996 - 2000 by Lucent Technologies.   All Rights Reserved
- * COPYRIGHT � 2001 - 2004   by Agere Systems Inc.   All Rights Reserved
+ * COPYRIGHT © 1994 - 1995   by AT&T.                All Rights Reserved
+ * COPYRIGHT © 1996 - 2000 by Lucent Technologies.   All Rights Reserved
+ * COPYRIGHT © 2001 - 2004   by Agere Systems Inc.   All Rights Reserved
  * All rights reserved.
  *
  * Redistribution and use in source or binary forms, with or without
diff --git a/drivers/staging/wlags49_h2/hcf.h b/drivers/staging/wlags49_h2/hcf.h
index 000994731166..95527b5cf863 100644
--- a/drivers/staging/wlags49_h2/hcf.h
+++ b/drivers/staging/wlags49_h2/hcf.h
@@ -40,9 +40,9 @@
 * software indicates your acceptance of these terms and conditions.  If you do
 * not agree with these terms and conditions, do not use the software.
 *
-* COPYRIGHT � 1994 - 1995	by AT&T.				All Rights Reserved
-* COPYRIGHT � 1996 - 2000 by Lucent Technologies.	All Rights Reserved
-* COPYRIGHT � 2001 - 2004	by Agere Systems Inc.	All Rights Reserved
+* COPYRIGHT © 1994 - 1995	by AT&T.				All Rights Reserved
+* COPYRIGHT © 1996 - 2000 by Lucent Technologies.	All Rights Reserved
+* COPYRIGHT © 2001 - 2004	by Agere Systems Inc.	All Rights Reserved
 * All rights reserved.
 *
 * Redistribution and use in source or binary forms, with or without
diff --git a/drivers/staging/wlags49_h2/hcfcfg.h b/drivers/staging/wlags49_h2/hcfcfg.h
index 7545bc554112..ef60da8c3ebc 100644
--- a/drivers/staging/wlags49_h2/hcfcfg.h
+++ b/drivers/staging/wlags49_h2/hcfcfg.h
@@ -64,9 +64,9 @@
 * software indicates your acceptance of these terms and conditions.  If you do
 * not agree with these terms and conditions, do not use the software.
 *
-* COPYRIGHT � 1994 - 1995	by AT&T.				All Rights Reserved
-* COPYRIGHT � 1996 - 2000 by Lucent Technologies.	All Rights Reserved
-* COPYRIGHT � 2001 - 2004	by Agere Systems Inc.	All Rights Reserved
+* COPYRIGHT © 1994 - 1995	by AT&T.				All Rights Reserved
+* COPYRIGHT © 1996 - 2000 by Lucent Technologies.	All Rights Reserved
+* COPYRIGHT © 2001 - 2004	by Agere Systems Inc.	All Rights Reserved
 * All rights reserved.
 *
 * Redistribution and use in source or binary forms, with or without
diff --git a/drivers/staging/wlags49_h2/hcfdef.h b/drivers/staging/wlags49_h2/hcfdef.h
index a62b53a22891..30744e194a23 100644
--- a/drivers/staging/wlags49_h2/hcfdef.h
+++ b/drivers/staging/wlags49_h2/hcfdef.h
@@ -33,9 +33,9 @@
  * software indicates your acceptance of these terms and conditions.  If you do
  * not agree with these terms and conditions, do not use the software.
  *
- * COPYRIGHT � 1994 - 1995   by AT&T.                All Rights Reserved
- * COPYRIGHT � 1996 - 2000 by Lucent Technologies.   All Rights Reserved
- * COPYRIGHT � 2001 - 2004   by Agere Systems Inc.   All Rights Reserved
+ * COPYRIGHT © 1994 - 1995   by AT&T.                All Rights Reserved
+ * COPYRIGHT © 1996 - 2000 by Lucent Technologies.   All Rights Reserved
+ * COPYRIGHT © 2001 - 2004   by Agere Systems Inc.   All Rights Reserved
  * All rights reserved.
  *
  * Redistribution and use in source or binary forms, with or without
diff --git a/drivers/staging/wlags49_h2/mdd.h b/drivers/staging/wlags49_h2/mdd.h
index b02e3ea9e479..5f951efb9c07 100644
--- a/drivers/staging/wlags49_h2/mdd.h
+++ b/drivers/staging/wlags49_h2/mdd.h
@@ -33,9 +33,9 @@
 * software indicates your acceptance of these terms and conditions.  If you do
 * not agree with these terms and conditions, do not use the software.
 *
-* COPYRIGHT � 1994 - 1995	by AT&T.				All Rights Reserved
-* COPYRIGHT � 1996 - 2000 by Lucent Technologies.	All Rights Reserved
-* COPYRIGHT � 2001 - 2004	by Agere Systems Inc.	All Rights Reserved
+* COPYRIGHT © 1994 - 1995	by AT&T.				All Rights Reserved
+* COPYRIGHT © 1996 - 2000 by Lucent Technologies.	All Rights Reserved
+* COPYRIGHT © 2001 - 2004	by Agere Systems Inc.	All Rights Reserved
 * All rights reserved.
 *
 * Redistribution and use in source or binary forms, with or without
diff --git a/drivers/staging/wlags49_h2/mmd.c b/drivers/staging/wlags49_h2/mmd.c
index de138c481a9e..c8f52107e6ca 100644
--- a/drivers/staging/wlags49_h2/mmd.c
+++ b/drivers/staging/wlags49_h2/mmd.c
@@ -35,7 +35,7 @@
 * software indicates your acceptance of these terms and conditions.  If you do
 * not agree with these terms and conditions, do not use the software.
 *
-* COPYRIGHT � 2001 - 2004	by Agere Systems Inc.	All Rights Reserved
+* COPYRIGHT © 2001 - 2004	by Agere Systems Inc.	All Rights Reserved
 * All rights reserved.
 *
 * Redistribution and use in source or binary forms, with or without
diff --git a/drivers/staging/wlags49_h2/mmd.h b/drivers/staging/wlags49_h2/mmd.h
index 06890c1b30a4..914952513005 100644
--- a/drivers/staging/wlags49_h2/mmd.h
+++ b/drivers/staging/wlags49_h2/mmd.h
@@ -33,7 +33,7 @@
 * software indicates your acceptance of these terms and conditions.  If you do
 * not agree with these terms and conditions, do not use the software.
 *
-* COPYRIGHT � 2001 - 2004	by Agere Systems Inc.	All Rights Reserved
+* COPYRIGHT © 2001 - 2004	by Agere Systems Inc.	All Rights Reserved
 * All rights reserved.
 *
 * Redistribution and use in source or binary forms, with or without
diff --git a/drivers/staging/wlags49_h2/wl_cs.h b/drivers/staging/wlags49_h2/wl_cs.h
index 21f17be4f02a..a7ab579759de 100644
--- a/drivers/staging/wlags49_h2/wl_cs.h
+++ b/drivers/staging/wlags49_h2/wl_cs.h
@@ -22,7 +22,7 @@
  * software indicates your acceptance of these terms and conditions.  If you do
  * not agree with these terms and conditions, do not use the software.
  *
- * Copyright � 2003 Agere Systems Inc.
+ * Copyright © 2003 Agere Systems Inc.
  * All rights reserved.
  *
  * Redistribution and use in source or binary forms, with or without
@@ -43,7 +43,7 @@
  *
  * Disclaimer
  *
- * THIS SOFTWARE IS PROVIDED �AS IS� AND ANY EXPRESS OR IMPLIED WARRANTIES,
+ * THIS SOFTWARE IS PROVIDED AS IS AND ANY EXPRESS OR IMPLIED WARRANTIES,
  * INCLUDING, BUT NOT LIMITED TO, INFRINGEMENT AND THE IMPLIED WARRANTIES OF
  * MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED.  ANY
  * USE, MODIFICATION OR DISTRIBUTION OF THIS SOFTWARE IS SOLELY AT THE USERS OWN
diff --git a/drivers/staging/wlags49_h2/wl_enc.c b/drivers/staging/wlags49_h2/wl_enc.c
index 26cf5486edd6..4c6f776cc4da 100644
--- a/drivers/staging/wlags49_h2/wl_enc.c
+++ b/drivers/staging/wlags49_h2/wl_enc.c
@@ -23,7 +23,7 @@
  * software indicates your acceptance of these terms and conditions.  If you do
  * not agree with these terms and conditions, do not use the software.
  *
- * Copyright � 2003 Agere Systems Inc.
+ * Copyright © 2003 Agere Systems Inc.
  * All rights reserved.
  *
  * Redistribution and use in source or binary forms, with or without
@@ -44,7 +44,7 @@
  *
  * Disclaimer
  *
- * THIS SOFTWARE IS PROVIDED �AS IS� AND ANY EXPRESS OR IMPLIED WARRANTIES,
+ * THIS SOFTWARE IS PROVIDED AS IS AND ANY EXPRESS OR IMPLIED WARRANTIES,
  * INCLUDING, BUT NOT LIMITED TO, INFRINGEMENT AND THE IMPLIED WARRANTIES OF
  * MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED.  ANY
  * USE, MODIFICATION OR DISTRIBUTION OF THIS SOFTWARE IS SOLELY AT THE USERS OWN
diff --git a/drivers/staging/wlags49_h2/wl_enc.h b/drivers/staging/wlags49_h2/wl_enc.h
index b4f54d81f311..46629f3b112b 100644
--- a/drivers/staging/wlags49_h2/wl_enc.h
+++ b/drivers/staging/wlags49_h2/wl_enc.h
@@ -22,7 +22,7 @@
  * software indicates your acceptance of these terms and conditions.  If you do
  * not agree with these terms and conditions, do not use the software.
  *
- * Copyright � 2003 Agere Systems Inc.
+ * Copyright © 2003 Agere Systems Inc.
  * All rights reserved.
  *
  * Redistribution and use in source or binary forms, with or without
@@ -43,7 +43,7 @@
  *
  * Disclaimer
  *
- * THIS SOFTWARE IS PROVIDED �AS IS� AND ANY EXPRESS OR IMPLIED WARRANTIES,
+ * THIS SOFTWARE IS PROVIDED AS IS AND ANY EXPRESS OR IMPLIED WARRANTIES,
  * INCLUDING, BUT NOT LIMITED TO, INFRINGEMENT AND THE IMPLIED WARRANTIES OF
  * MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED.  ANY
  * USE, MODIFICATION OR DISTRIBUTION OF THIS SOFTWARE IS SOLELY AT THE USERS OWN
diff --git a/drivers/staging/wlags49_h2/wl_if.h b/drivers/staging/wlags49_h2/wl_if.h
index ed2b4135a10e..6a26130f5a3a 100644
--- a/drivers/staging/wlags49_h2/wl_if.h
+++ b/drivers/staging/wlags49_h2/wl_if.h
@@ -23,7 +23,7 @@
  * software indicates your acceptance of these terms and conditions.  If you do
  * not agree with these terms and conditions, do not use the software.
  *
- * Copyright � 2003 Agere Systems Inc.
+ * Copyright © 2003 Agere Systems Inc.
  * All rights reserved.
  *
  * Redistribution and use in source or binary forms, with or without
@@ -44,7 +44,7 @@
  *
  * Disclaimer
  *
- * THIS SOFTWARE IS PROVIDED �AS IS� AND ANY EXPRESS OR IMPLIED WARRANTIES,
+ * THIS SOFTWARE IS PROVIDED AS IS AND ANY EXPRESS OR IMPLIED WARRANTIES,
  * INCLUDING, BUT NOT LIMITED TO, INFRINGEMENT AND THE IMPLIED WARRANTIES OF
  * MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED.  ANY
  * USE, MODIFICATION OR DISTRIBUTION OF THIS SOFTWARE IS SOLELY AT THE USERS OWN
diff --git a/drivers/staging/wlags49_h2/wl_internal.h b/drivers/staging/wlags49_h2/wl_internal.h
index 575340834051..553601f48873 100644
--- a/drivers/staging/wlags49_h2/wl_internal.h
+++ b/drivers/staging/wlags49_h2/wl_internal.h
@@ -22,7 +22,7 @@
  * software indicates your acceptance of these terms and conditions.  If you do
  * not agree with these terms and conditions, do not use the software.
  *
- * Copyright � 2003 Agere Systems Inc.
+ * Copyright © 2003 Agere Systems Inc.
  * All rights reserved.
  *
  * Redistribution and use in source or binary forms, with or without
@@ -43,7 +43,7 @@
  *
  * Disclaimer
  *
- * THIS SOFTWARE IS PROVIDED �AS IS� AND ANY EXPRESS OR IMPLIED WARRANTIES,
+ * THIS SOFTWARE IS PROVIDED AS IS AND ANY EXPRESS OR IMPLIED WARRANTIES,
  * INCLUDING, BUT NOT LIMITED TO, INFRINGEMENT AND THE IMPLIED WARRANTIES OF
  * MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED.  ANY
  * USE, MODIFICATION OR DISTRIBUTION OF THIS SOFTWARE IS SOLELY AT THE USERS OWN
diff --git a/drivers/staging/wlags49_h2/wl_main.c b/drivers/staging/wlags49_h2/wl_main.c
index 483eee1bf63a..dab603e0f452 100644
--- a/drivers/staging/wlags49_h2/wl_main.c
+++ b/drivers/staging/wlags49_h2/wl_main.c
@@ -23,7 +23,7 @@
  * software indicates your acceptance of these terms and conditions.  If you do
  * not agree with these terms and conditions, do not use the software.
  *
- * Copyright � 2003 Agere Systems Inc.
+ * Copyright © 2003 Agere Systems Inc.
  * All rights reserved.
  *
  * Redistribution and use in source or binary forms, with or without
@@ -44,7 +44,7 @@
  *
  * Disclaimer
  *
- * THIS SOFTWARE IS PROVIDED �AS IS� AND ANY EXPRESS OR IMPLIED WARRANTIES,
+ * THIS SOFTWARE IS PROVIDED AS IS AND ANY EXPRESS OR IMPLIED WARRANTIES,
  * INCLUDING, BUT NOT LIMITED TO, INFRINGEMENT AND THE IMPLIED WARRANTIES OF
  * MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED.  ANY
  * USE, MODIFICATION OR DISTRIBUTION OF THIS SOFTWARE IS SOLELY AT THE USERS OWN
diff --git a/drivers/staging/wlags49_h2/wl_main.h b/drivers/staging/wlags49_h2/wl_main.h
index d593ae535fb8..3b5acdf4e329 100644
--- a/drivers/staging/wlags49_h2/wl_main.h
+++ b/drivers/staging/wlags49_h2/wl_main.h
@@ -22,7 +22,7 @@
  * software indicates your acceptance of these terms and conditions.  If you do
  * not agree with these terms and conditions, do not use the software.
  *
- * Copyright � 2003 Agere Systems Inc.
+ * Copyright © 2003 Agere Systems Inc.
  * All rights reserved.
  *
  * Redistribution and use in source or binary forms, with or without
@@ -43,7 +43,7 @@
  *
  * Disclaimer
  *
- * THIS SOFTWARE IS PROVIDED �AS IS� AND ANY EXPRESS OR IMPLIED WARRANTIES,
+ * THIS SOFTWARE IS PROVIDED AS IS AND ANY EXPRESS OR IMPLIED WARRANTIES,
  * INCLUDING, BUT NOT LIMITED TO, INFRINGEMENT AND THE IMPLIED WARRANTIES OF
  * MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED.  ANY
  * USE, MODIFICATION OR DISTRIBUTION OF THIS SOFTWARE IS SOLELY AT THE USERS OWN
diff --git a/drivers/staging/wlags49_h2/wl_netdev.c b/drivers/staging/wlags49_h2/wl_netdev.c
index 5a2b334f206e..9c16f5478a75 100644
--- a/drivers/staging/wlags49_h2/wl_netdev.c
+++ b/drivers/staging/wlags49_h2/wl_netdev.c
@@ -23,7 +23,7 @@
  * software indicates your acceptance of these terms and conditions.  If you do
  * not agree with these terms and conditions, do not use the software.
  *
- * Copyright � 2003 Agere Systems Inc.
+ * Copyright © 2003 Agere Systems Inc.
  * All rights reserved.
  *
  * Redistribution and use in source or binary forms, with or without
@@ -44,7 +44,7 @@
  *
  * Disclaimer
  *
- * THIS SOFTWARE IS PROVIDED �AS IS� AND ANY EXPRESS OR IMPLIED WARRANTIES,
+ * THIS SOFTWARE IS PROVIDED AS IS AND ANY EXPRESS OR IMPLIED WARRANTIES,
  * INCLUDING, BUT NOT LIMITED TO, INFRINGEMENT AND THE IMPLIED WARRANTIES OF
  * MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED.  ANY
  * USE, MODIFICATION OR DISTRIBUTION OF THIS SOFTWARE IS SOLELY AT THE USERS OWN
diff --git a/drivers/staging/wlags49_h2/wl_netdev.h b/drivers/staging/wlags49_h2/wl_netdev.h
index 632ab2e6302c..61f040f26d97 100644
--- a/drivers/staging/wlags49_h2/wl_netdev.h
+++ b/drivers/staging/wlags49_h2/wl_netdev.h
@@ -23,7 +23,7 @@
  * software indicates your acceptance of these terms and conditions.  If you do
  * not agree with these terms and conditions, do not use the software.
  *
- * Copyright � 2003 Agere Systems Inc.
+ * Copyright © 2003 Agere Systems Inc.
  * All rights reserved.
  *
  * Redistribution and use in source or binary forms, with or without
@@ -44,7 +44,7 @@
  *
  * Disclaimer
  *
- * THIS SOFTWARE IS PROVIDED �AS IS� AND ANY EXPRESS OR IMPLIED WARRANTIES,
+ * THIS SOFTWARE IS PROVIDED AS IS AND ANY EXPRESS OR IMPLIED WARRANTIES,
  * INCLUDING, BUT NOT LIMITED TO, INFRINGEMENT AND THE IMPLIED WARRANTIES OF
  * MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED.  ANY
  * USE, MODIFICATION OR DISTRIBUTION OF THIS SOFTWARE IS SOLELY AT THE USERS OWN
diff --git a/drivers/staging/wlags49_h2/wl_pci.c b/drivers/staging/wlags49_h2/wl_pci.c
index 28ae9dd1b44e..1f1d98679171 100644
--- a/drivers/staging/wlags49_h2/wl_pci.c
+++ b/drivers/staging/wlags49_h2/wl_pci.c
@@ -23,7 +23,7 @@
  * software indicates your acceptance of these terms and conditions.  If you do
  * not agree with these terms and conditions, do not use the software.
  *
- * Copyright � 2003 Agere Systems Inc.
+ * Copyright © 2003 Agere Systems Inc.
  * All rights reserved.
  *
  * Redistribution and use in source or binary forms, with or without
@@ -44,7 +44,7 @@
  *
  * Disclaimer
  *
- * THIS SOFTWARE IS PROVIDED �AS IS� AND ANY EXPRESS OR IMPLIED WARRANTIES,
+ * THIS SOFTWARE IS PROVIDED AS IS AND ANY EXPRESS OR IMPLIED WARRANTIES,
  * INCLUDING, BUT NOT LIMITED TO, INFRINGEMENT AND THE IMPLIED WARRANTIES OF
  * MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED.  ANY
  * USE, MODIFICATION OR DISTRIBUTION OF THIS SOFTWARE IS SOLELY AT THE USERS OWN
diff --git a/drivers/staging/wlags49_h2/wl_pci.h b/drivers/staging/wlags49_h2/wl_pci.h
index cea04c44ec47..86831f1b4de2 100644
--- a/drivers/staging/wlags49_h2/wl_pci.h
+++ b/drivers/staging/wlags49_h2/wl_pci.h
@@ -22,7 +22,7 @@
  * software indicates your acceptance of these terms and conditions.  If you do
  * not agree with these terms and conditions, do not use the software.
  *
- * Copyright � 2003 Agere Systems Inc.
+ * Copyright © 2003 Agere Systems Inc.
  * All rights reserved.
  *
  * Redistribution and use in source or binary forms, with or without
@@ -43,7 +43,7 @@
  *
  * Disclaimer
  *
- * THIS SOFTWARE IS PROVIDED �AS IS� AND ANY EXPRESS OR IMPLIED WARRANTIES,
+ * THIS SOFTWARE IS PROVIDED AS IS AND ANY EXPRESS OR IMPLIED WARRANTIES,
  * INCLUDING, BUT NOT LIMITED TO, INFRINGEMENT AND THE IMPLIED WARRANTIES OF
  * MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED.  ANY
  * USE, MODIFICATION OR DISTRIBUTION OF THIS SOFTWARE IS SOLELY AT THE USERS OWN
diff --git a/drivers/staging/wlags49_h2/wl_priv.c b/drivers/staging/wlags49_h2/wl_priv.c
index 260d4f0d47b4..f30e5ee4bca3 100644
--- a/drivers/staging/wlags49_h2/wl_priv.c
+++ b/drivers/staging/wlags49_h2/wl_priv.c
@@ -22,7 +22,7 @@
  * software indicates your acceptance of these terms and conditions.  If you do
  * not agree with these terms and conditions, do not use the software.
  *
- * Copyright � 2003 Agere Systems Inc.
+ * Copyright © 2003 Agere Systems Inc.
  * All rights reserved.
  *
  * Redistribution and use in source or binary forms, with or without
@@ -43,7 +43,7 @@
  *
  * Disclaimer
  *
- * THIS SOFTWARE IS PROVIDED �AS IS� AND ANY EXPRESS OR IMPLIED WARRANTIES,
+ * THIS SOFTWARE IS PROVIDED AS IS AND ANY EXPRESS OR IMPLIED WARRANTIES,
  * INCLUDING, BUT NOT LIMITED TO, INFRINGEMENT AND THE IMPLIED WARRANTIES OF
  * MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED.  ANY
  * USE, MODIFICATION OR DISTRIBUTION OF THIS SOFTWARE IS SOLELY AT THE USERS OWN
diff --git a/drivers/staging/wlags49_h2/wl_priv.h b/drivers/staging/wlags49_h2/wl_priv.h
index 9b0254497aa7..b647bfd90098 100644
--- a/drivers/staging/wlags49_h2/wl_priv.h
+++ b/drivers/staging/wlags49_h2/wl_priv.h
@@ -22,7 +22,7 @@
  * software indicates your acceptance of these terms and conditions.  If you do
  * not agree with these terms and conditions, do not use the software.
  *
- * Copyright � 2003 Agere Systems Inc.
+ * Copyright © 2003 Agere Systems Inc.
  * All rights reserved.
  *
  * Redistribution and use in source or binary forms, with or without
@@ -43,7 +43,7 @@
  *
  * Disclaimer
  *
- * THIS SOFTWARE IS PROVIDED �AS IS� AND ANY EXPRESS OR IMPLIED WARRANTIES,
+ * THIS SOFTWARE IS PROVIDED AS IS AND ANY EXPRESS OR IMPLIED WARRANTIES,
  * INCLUDING, BUT NOT LIMITED TO, INFRINGEMENT AND THE IMPLIED WARRANTIES OF
  * MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED.  ANY
  * USE, MODIFICATION OR DISTRIBUTION OF THIS SOFTWARE IS SOLELY AT THE USERS OWN
diff --git a/drivers/staging/wlags49_h2/wl_profile.c b/drivers/staging/wlags49_h2/wl_profile.c
index a459e48c7bf0..b8c96cf18de5 100644
--- a/drivers/staging/wlags49_h2/wl_profile.c
+++ b/drivers/staging/wlags49_h2/wl_profile.c
@@ -23,7 +23,7 @@
  * software indicates your acceptance of these terms and conditions.  If you do
  * not agree with these terms and conditions, do not use the software.
  *
- * Copyright � 2003 Agere Systems Inc.
+ * Copyright © 2003 Agere Systems Inc.
  * All rights reserved.
  *
  * Redistribution and use in source or binary forms, with or without
@@ -44,7 +44,7 @@
  *
  * Disclaimer
  *
- * THIS SOFTWARE IS PROVIDED �AS IS� AND ANY EXPRESS OR IMPLIED WARRANTIES,
+ * THIS SOFTWARE IS PROVIDED AS IS AND ANY EXPRESS OR IMPLIED WARRANTIES,
  * INCLUDING, BUT NOT LIMITED TO, INFRINGEMENT AND THE IMPLIED WARRANTIES OF
  * MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED.  ANY
  * USE, MODIFICATION OR DISTRIBUTION OF THIS SOFTWARE IS SOLELY AT THE USERS OWN
diff --git a/drivers/staging/wlags49_h2/wl_profile.h b/drivers/staging/wlags49_h2/wl_profile.h
index 81db8e8c6ba8..f81df51d2216 100644
--- a/drivers/staging/wlags49_h2/wl_profile.h
+++ b/drivers/staging/wlags49_h2/wl_profile.h
@@ -22,7 +22,7 @@
  * software indicates your acceptance of these terms and conditions.  If you do
  * not agree with these terms and conditions, do not use the software.
  *
- * Copyright � 2003 Agere Systems Inc.
+ * Copyright © 2003 Agere Systems Inc.
  * All rights reserved.
  *
  * Redistribution and use in source or binary forms, with or without
@@ -43,7 +43,7 @@
  *
  * Disclaimer
  *
- * THIS SOFTWARE IS PROVIDED �AS IS� AND ANY EXPRESS OR IMPLIED WARRANTIES,
+ * THIS SOFTWARE IS PROVIDED AS IS AND ANY EXPRESS OR IMPLIED WARRANTIES,
  * INCLUDING, BUT NOT LIMITED TO, INFRINGEMENT AND THE IMPLIED WARRANTIES OF
  * MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED.  ANY
  * USE, MODIFICATION OR DISTRIBUTION OF THIS SOFTWARE IS SOLELY AT THE USERS OWN
diff --git a/drivers/staging/wlags49_h2/wl_util.c b/drivers/staging/wlags49_h2/wl_util.c
index 3b6f5a59b2bb..b748a3ff7954 100644
--- a/drivers/staging/wlags49_h2/wl_util.c
+++ b/drivers/staging/wlags49_h2/wl_util.c
@@ -22,7 +22,7 @@
  * software indicates your acceptance of these terms and conditions.  If you do
  * not agree with these terms and conditions, do not use the software.
  *
- * Copyright � 2003 Agere Systems Inc.
+ * Copyright © 2003 Agere Systems Inc.
  * All rights reserved.
  *
  * Redistribution and use in source or binary forms, with or without
@@ -43,7 +43,7 @@
  *
  * Disclaimer
  *
- * THIS SOFTWARE IS PROVIDED �AS IS� AND ANY EXPRESS OR IMPLIED WARRANTIES,
+ * THIS SOFTWARE IS PROVIDED AS IS AND ANY EXPRESS OR IMPLIED WARRANTIES,
  * INCLUDING, BUT NOT LIMITED TO, INFRINGEMENT AND THE IMPLIED WARRANTIES OF
  * MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED.  ANY
  * USE, MODIFICATION OR DISTRIBUTION OF THIS SOFTWARE IS SOLELY AT THE USERS OWN
diff --git a/drivers/staging/wlags49_h2/wl_util.h b/drivers/staging/wlags49_h2/wl_util.h
index 2661bcd6b0ec..946b1b64c46f 100644
--- a/drivers/staging/wlags49_h2/wl_util.h
+++ b/drivers/staging/wlags49_h2/wl_util.h
@@ -23,7 +23,7 @@
  * software indicates your acceptance of these terms and conditions.  If you do
  * not agree with these terms and conditions, do not use the software.
  *
- * Copyright � 2003 Agere Systems Inc.
+ * Copyright © 2003 Agere Systems Inc.
  * All rights reserved.
  *
  * Redistribution and use in source or binary forms, with or without
@@ -44,7 +44,7 @@
  *
  * Disclaimer
  *
- * THIS SOFTWARE IS PROVIDED �AS IS� AND ANY EXPRESS OR IMPLIED WARRANTIES,
+ * THIS SOFTWARE IS PROVIDED AS IS AND ANY EXPRESS OR IMPLIED WARRANTIES,
  * INCLUDING, BUT NOT LIMITED TO, INFRINGEMENT AND THE IMPLIED WARRANTIES OF
  * MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED.  ANY
  * USE, MODIFICATION OR DISTRIBUTION OF THIS SOFTWARE IS SOLELY AT THE USERS OWN
diff --git a/drivers/staging/wlags49_h2/wl_version.h b/drivers/staging/wlags49_h2/wl_version.h
index fd37040afd01..3deacfac9d25 100644
--- a/drivers/staging/wlags49_h2/wl_version.h
+++ b/drivers/staging/wlags49_h2/wl_version.h
@@ -23,7 +23,7 @@
  * software indicates your acceptance of these terms and conditions.  If you do
  * not agree with these terms and conditions, do not use the software.
  *
- * Copyright � 2003 Agere Systems Inc.
+ * Copyright © 2003 Agere Systems Inc.
  * All rights reserved.
  *
  * Redistribution and use in source or binary forms, with or without
@@ -44,7 +44,7 @@
  *
  * Disclaimer
  *
- * THIS SOFTWARE IS PROVIDED �AS IS� AND ANY EXPRESS OR IMPLIED WARRANTIES,
+ * THIS SOFTWARE IS PROVIDED AS IS AND ANY EXPRESS OR IMPLIED WARRANTIES,
  * INCLUDING, BUT NOT LIMITED TO, INFRINGEMENT AND THE IMPLIED WARRANTIES OF
  * MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED.  ANY
  * USE, MODIFICATION OR DISTRIBUTION OF THIS SOFTWARE IS SOLELY AT THE USERS OWN
diff --git a/drivers/staging/wlags49_h2/wl_wext.c b/drivers/staging/wlags49_h2/wl_wext.c
index 8ac5e1081aa0..7ff0a108da13 100644
--- a/drivers/staging/wlags49_h2/wl_wext.c
+++ b/drivers/staging/wlags49_h2/wl_wext.c
@@ -18,7 +18,7 @@
  * software indicates your acceptance of these terms and conditions.  If you do
  * not agree with these terms and conditions, do not use the software.
  *
- * Copyright � 2003 Agere Systems Inc.
+ * Copyright © 2003 Agere Systems Inc.
  * All rights reserved.
  *
  * Redistribution and use in source or binary forms, with or without
@@ -39,7 +39,7 @@
  *
  * Disclaimer
  *
- * THIS SOFTWARE IS PROVIDED �AS IS� AND ANY EXPRESS OR IMPLIED WARRANTIES,
+ * THIS SOFTWARE IS PROVIDED AS IS AND ANY EXPRESS OR IMPLIED WARRANTIES,
  * INCLUDING, BUT NOT LIMITED TO, INFRINGEMENT AND THE IMPLIED WARRANTIES OF
  * MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED.  ANY
  * USE, MODIFICATION OR DISTRIBUTION OF THIS SOFTWARE IS SOLELY AT THE USERS OWN
diff --git a/drivers/staging/wlags49_h2/wl_wext.h b/drivers/staging/wlags49_h2/wl_wext.h
index a713058c8027..029da52c4c49 100644
--- a/drivers/staging/wlags49_h2/wl_wext.h
+++ b/drivers/staging/wlags49_h2/wl_wext.h
@@ -22,7 +22,7 @@
  * software indicates your acceptance of these terms and conditions.  If you do
  * not agree with these terms and conditions, do not use the software.
  *
- * Copyright � 2003 Agere Systems Inc.
+ * Copyright © 2003 Agere Systems Inc.
  * All rights reserved.
  *
  * Redistribution and use in source or binary forms, with or without
@@ -43,7 +43,7 @@
  *
  * Disclaimer
  *
- * THIS SOFTWARE IS PROVIDED �AS IS� AND ANY EXPRESS OR IMPLIED WARRANTIES,
+ * THIS SOFTWARE IS PROVIDED AS IS AND ANY EXPRESS OR IMPLIED WARRANTIES,
  * INCLUDING, BUT NOT LIMITED TO, INFRINGEMENT AND THE IMPLIED WARRANTIES OF
  * MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED.  ANY
  * USE, MODIFICATION OR DISTRIBUTION OF THIS SOFTWARE IS SOLELY AT THE USERS OWN
diff --git a/drivers/usb/gadget/s3c2410_udc.c b/drivers/usb/gadget/s3c2410_udc.c
index b8643771fa80..fac4c650d4bb 100644
--- a/drivers/usb/gadget/s3c2410_udc.c
+++ b/drivers/usb/gadget/s3c2410_udc.c
@@ -3,7 +3,7 @@
  *
  * Samsung S3C24xx series on-chip full speed USB device controllers
  *
- * Copyright (C) 2004-2007 Herbert P�tzl - Arnaud Patard
+ * Copyright (C) 2004-2007 Herbert Pötzl - Arnaud Patard
  *	Additional cleanups by Ben Dooks <ben-linux@fluff.org>
  *
  * This program is free software; you can redistribute it and/or modify
@@ -51,7 +51,7 @@
 
 #define DRIVER_DESC	"S3C2410 USB Device Controller Gadget"
 #define DRIVER_VERSION	"29 Apr 2007"
-#define DRIVER_AUTHOR	"Herbert P�tzl <herbert@13thfloor.at>, " \
+#define DRIVER_AUTHOR	"Herbert Pötzl <herbert@13thfloor.at>, " \
 			"Arnaud Patard <arnaud.patard@rtp-net.org>"
 
 static const char		gadget_name[] = "s3c2410_udc";
diff --git a/drivers/usb/gadget/s3c2410_udc.h b/drivers/usb/gadget/s3c2410_udc.h
index a48f619cb1cc..1653bae08b80 100644
--- a/drivers/usb/gadget/s3c2410_udc.h
+++ b/drivers/usb/gadget/s3c2410_udc.h
@@ -2,7 +2,7 @@
  * linux/drivers/usb/gadget/s3c2410_udc.h
  * Samsung on-chip full speed USB device controllers
  *
- * Copyright (C) 2004-2007 Herbert P�tzl - Arnaud Patard
+ * Copyright (C) 2004-2007 Herbert Pötzl - Arnaud Patard
  *	Additional cleanups by Ben Dooks <ben-linux@fluff.org>
  *
  * This program is free software; you can redistribute it and/or modify
diff --git a/drivers/usb/serial/ftdi_sio_ids.h b/drivers/usb/serial/ftdi_sio_ids.h
index 571fa96b49c7..e38fe48cce1b 100644
--- a/drivers/usb/serial/ftdi_sio_ids.h
+++ b/drivers/usb/serial/ftdi_sio_ids.h
@@ -2,7 +2,7 @@
  * vendor/product IDs (VID/PID) of devices using FTDI USB serial converters.
  * Please keep numerically sorted within individual areas, thanks!
  *
- * Philipp G�hring - pg@futureware.at - added the Device ID of the USB relais
+ * Philipp Gühring - pg@futureware.at - added the Device ID of the USB relais
  * from Rudolf Gugler
  *
  */
@@ -78,7 +78,7 @@
  */
 #define FTDI_ASK_RDR400_PID	0xC991	/* ASK RDR 400 series card reader */
 
-/* www.starting-point-systems.com �Chameleon device */
+/* www.starting-point-systems.com µChameleon device */
 #define FTDI_MICRO_CHAMELEON_PID	0xCAA0	/* Product Id */
 
 /*
@@ -290,7 +290,7 @@
 
 /*
  * Teratronik product ids.
- * Submitted by O. W�lfelschneider.
+ * Submitted by O. Wölfelschneider.
  */
 #define FTDI_TERATRONIK_VCP_PID	 0xEC88	/* Teratronik device (preferring VCP driver on windows) */
 #define FTDI_TERATRONIK_D2XX_PID 0xEC89	/* Teratronik device (preferring D2XX driver on windows) */
diff --git a/drivers/usb/storage/isd200.c b/drivers/usb/storage/isd200.c
index ffc4193e9505..7019273929af 100644
--- a/drivers/usb/storage/isd200.c
+++ b/drivers/usb/storage/isd200.c
@@ -61,7 +61,7 @@
 #include "scsiglue.h"
 
 MODULE_DESCRIPTION("Driver for In-System Design, Inc. ISD200 ASIC");
-MODULE_AUTHOR("Bj�rn Stenberg <bjorn@haxx.se>");
+MODULE_AUTHOR("Björn Stenberg <bjorn@haxx.se>");
 MODULE_LICENSE("GPL");
 
 static int isd200_Initialization(struct us_data *us);
diff --git a/drivers/usb/storage/unusual_devs.h b/drivers/usb/storage/unusual_devs.h
index 3041a974faf3..50302b997bd5 100644
--- a/drivers/usb/storage/unusual_devs.h
+++ b/drivers/usb/storage/unusual_devs.h
@@ -1907,7 +1907,7 @@ UNUSUAL_DEV(  0x1b1c, 0x1ab5, 0x0200, 0x0200,
 		USB_SC_DEVICE, USB_PR_DEVICE, NULL,
 		US_FL_INITIAL_READ10 ),
 
-/* Patch by Richard Sch�tz <r.schtz@t-online.de>
+/* Patch by Richard Schütz <r.schtz@t-online.de>
  * This external hard drive enclosure uses a JMicron chip which
  * needs the US_FL_IGNORE_RESIDUE flag to work properly. */
 UNUSUAL_DEV(  0x1e68, 0x001b, 0x0000, 0x0000,
diff --git a/drivers/video/omap/rfbi.c b/drivers/video/omap/rfbi.c
index 0c6981f1a4a3..2c1a3402bef0 100644
--- a/drivers/video/omap/rfbi.c
+++ b/drivers/video/omap/rfbi.c
@@ -2,7 +2,7 @@
  * OMAP2 Remote Frame Buffer Interface support
  *
  * Copyright (C) 2005 Nokia Corporation
- * Author: Juha Yrj�l� <juha.yrjola@nokia.com>
+ * Author: Juha Yrjölä <juha.yrjola@nokia.com>
  *	   Imre Deak <imre.deak@nokia.com>
  *
  * This program is free software; you can redistribute it and/or modify it
diff --git a/drivers/video/omap/sossi.c b/drivers/video/omap/sossi.c
index 8fb7c708f563..f79c137753d7 100644
--- a/drivers/video/omap/sossi.c
+++ b/drivers/video/omap/sossi.c
@@ -2,7 +2,7 @@
  * OMAP1 Special OptimiSed Screen Interface support
  *
  * Copyright (C) 2004-2005 Nokia Corporation
- * Author: Juha Yrj�l� <juha.yrjola@nokia.com>
+ * Author: Juha Yrjölä <juha.yrjola@nokia.com>
  *
  * This program is free software; you can redistribute it and/or modify it
  * under the terms of the GNU General Public License as published by the
diff --git a/drivers/watchdog/w83627hf_wdt.c b/drivers/watchdog/w83627hf_wdt.c
index dd5d67548758..576a388a1164 100644
--- a/drivers/watchdog/w83627hf_wdt.c
+++ b/drivers/watchdog/w83627hf_wdt.c
@@ -4,7 +4,7 @@
  *	(c) Copyright 2007 Vlad Drukker <vlad@storewiz.com>
  *		added support for W83627THF.
  *
- *	(c) Copyright 2003,2007 P�draig Brady <P@draigBrady.com>
+ *	(c) Copyright 2003,2007 Pádraig Brady <P@draigBrady.com>
  *
  *	Based on advantechwdt.c which is based on wdt.c.
  *	Original copyright messages:
@@ -401,6 +401,6 @@ module_init(wdt_init);
 module_exit(wdt_exit);
 
 MODULE_LICENSE("GPL");
-MODULE_AUTHOR("P�draig Brady <P@draigBrady.com>");
+MODULE_AUTHOR("Pádraig  Brady <P@draigBrady.com>");
 MODULE_DESCRIPTION("w83627hf/thf WDT driver");
 MODULE_ALIAS_MISCDEV(WATCHDOG_MINOR);
diff --git a/drivers/zorro/zorro.ids b/drivers/zorro/zorro.ids
index de24e3decedd..119abea8c6cb 100644
--- a/drivers/zorro/zorro.ids
+++ b/drivers/zorro/zorro.ids
@@ -351,7 +351,7 @@
 	0200  EGS 28/24 Spectrum [Graphics Card]
 0892  Apollo
 	0100  A1200 [FPU and RAM Expansion]
-0893  Ingenieurb�ro Helfrich
+0893  Ingenieurbüro Helfrich
 	0500  Piccolo RAM [Graphics Card]
 	0600  Piccolo [Graphics Card]
 	0700  PeggyPlus MPEG [Video Card]
diff --git a/kernel/events/core.c b/kernel/events/core.c
index 0e8457da6f95..f641547beb76 100644
--- a/kernel/events/core.c
+++ b/kernel/events/core.c
@@ -4,7 +4,7 @@
  *  Copyright (C) 2008 Thomas Gleixner <tglx@linutronix.de>
  *  Copyright (C) 2008-2011 Red Hat, Inc., Ingo Molnar
  *  Copyright (C) 2008-2011 Red Hat, Inc., Peter Zijlstra <pzijlstr@redhat.com>
- *  Copyright  �  2009 Paul Mackerras, IBM Corp. <paulus@au1.ibm.com>
+ *  Copyright  ©  2009 Paul Mackerras, IBM Corp. <paulus@au1.ibm.com>
  *
  * For licensing details see kernel-base/COPYING
  */
diff --git a/kernel/events/ring_buffer.c b/kernel/events/ring_buffer.c
index a2a29205cc0f..809c8ec5d42a 100644
--- a/kernel/events/ring_buffer.c
+++ b/kernel/events/ring_buffer.c
@@ -4,7 +4,7 @@
  *  Copyright (C) 2008 Thomas Gleixner <tglx@linutronix.de>
  *  Copyright (C) 2008-2011 Red Hat, Inc., Ingo Molnar
  *  Copyright (C) 2008-2011 Red Hat, Inc., Peter Zijlstra <pzijlstr@redhat.com>
- *  Copyright  �  2009 Paul Mackerras, IBM Corp. <paulus@au1.ibm.com>
+ *  Copyright  ©  2009 Paul Mackerras, IBM Corp. <paulus@au1.ibm.com>
  *
  * For licensing details see kernel-base/COPYING
  */
diff --git a/tools/testing/ktest/sample.conf b/tools/testing/ktest/sample.conf
index dbedfa196727..553c06b7d6f2 100644
--- a/tools/testing/ktest/sample.conf
+++ b/tools/testing/ktest/sample.conf
@@ -950,7 +950,7 @@
 #   TEST_START
 #   TEST_TYPE = config_bisect
 #   CONFIG_BISECT_TYPE = build
-#   CONFIG_BISECT = /home/test/�onfig-bad
+#   CONFIG_BISECT = /home/test/config-bad
 #   MIN_CONFIG = /home/test/config-min
 #   BISECT_MANUAL = 1
 #
-- 
cgit v1.2.3-71-gd317


From 32dc730860155b235f13e0cd3fe58b263279baf9 Mon Sep 17 00:00:00 2001
From: Al Viro <viro@zeniv.linux.org.uk>
Date: Thu, 8 Dec 2011 20:08:42 -0500
Subject: get rid of timer in kern/acct.c

... and clean it up a bit, while we are at it

Signed-off-by: Al Viro <viro@zeniv.linux.org.uk>
---
 kernel/acct.c | 40 ++++++++++------------------------------
 1 file changed, 10 insertions(+), 30 deletions(-)

(limited to 'kernel')

diff --git a/kernel/acct.c b/kernel/acct.c
index fa7eb3de2ddc..8cba12429d82 100644
--- a/kernel/acct.c
+++ b/kernel/acct.c
@@ -84,26 +84,16 @@ static void do_acct_process(struct bsd_acct_struct *acct,
  * the cache line to have the data after getting the lock.
  */
 struct bsd_acct_struct {
-	volatile int		active;
-	volatile int		needcheck;
+	int			active;
+	unsigned long		needcheck;
 	struct file		*file;
 	struct pid_namespace	*ns;
-	struct timer_list	timer;
 	struct list_head	list;
 };
 
 static DEFINE_SPINLOCK(acct_lock);
 static LIST_HEAD(acct_list);
 
-/*
- * Called whenever the timer says to check the free space.
- */
-static void acct_timeout(unsigned long x)
-{
-	struct bsd_acct_struct *acct = (struct bsd_acct_struct *)x;
-	acct->needcheck = 1;
-}
-
 /*
  * Check the amount of free space and suspend/resume accordingly.
  */
@@ -112,12 +102,12 @@ static int check_free_space(struct bsd_acct_struct *acct, struct file *file)
 	struct kstatfs sbuf;
 	int res;
 	int act;
-	sector_t resume;
-	sector_t suspend;
+	u64 resume;
+	u64 suspend;
 
 	spin_lock(&acct_lock);
 	res = acct->active;
-	if (!file || !acct->needcheck)
+	if (!file || time_is_before_jiffies(acct->needcheck))
 		goto out;
 	spin_unlock(&acct_lock);
 
@@ -127,8 +117,8 @@ static int check_free_space(struct bsd_acct_struct *acct, struct file *file)
 	suspend = sbuf.f_blocks * SUSPEND;
 	resume = sbuf.f_blocks * RESUME;
 
-	sector_div(suspend, 100);
-	sector_div(resume, 100);
+	do_div(suspend, 100);
+	do_div(resume, 100);
 
 	if (sbuf.f_bavail <= suspend)
 		act = -1;
@@ -160,10 +150,7 @@ static int check_free_space(struct bsd_acct_struct *acct, struct file *file)
 		}
 	}
 
-	del_timer(&acct->timer);
-	acct->needcheck = 0;
-	acct->timer.expires = jiffies + ACCT_TIMEOUT*HZ;
-	add_timer(&acct->timer);
+	acct->needcheck = jiffies + ACCT_TIMEOUT*HZ;
 	res = acct->active;
 out:
 	spin_unlock(&acct_lock);
@@ -185,9 +172,7 @@ static void acct_file_reopen(struct bsd_acct_struct *acct, struct file *file,
 	if (acct->file) {
 		old_acct = acct->file;
 		old_ns = acct->ns;
-		del_timer(&acct->timer);
 		acct->active = 0;
-		acct->needcheck = 0;
 		acct->file = NULL;
 		acct->ns = NULL;
 		list_del(&acct->list);
@@ -195,13 +180,9 @@ static void acct_file_reopen(struct bsd_acct_struct *acct, struct file *file,
 	if (file) {
 		acct->file = file;
 		acct->ns = ns;
-		acct->needcheck = 0;
+		acct->needcheck = jiffies + ACCT_TIMEOUT*HZ;
 		acct->active = 1;
 		list_add(&acct->list, &acct_list);
-		/* It's been deleted if it was used before so this is safe */
-		setup_timer(&acct->timer, acct_timeout, (unsigned long)acct);
-		acct->timer.expires = jiffies + ACCT_TIMEOUT*HZ;
-		add_timer(&acct->timer);
 	}
 	if (old_acct) {
 		mnt_unpin(old_acct->f_path.mnt);
@@ -348,7 +329,6 @@ void acct_exit_ns(struct pid_namespace *ns)
 	if (acct == NULL)
 		return;
 
-	del_timer_sync(&acct->timer);
 	spin_lock(&acct_lock);
 	if (acct->file != NULL)
 		acct_file_reopen(acct, NULL, NULL);
@@ -498,7 +478,7 @@ static void do_acct_process(struct bsd_acct_struct *acct,
 	 * Fill the accounting struct with the needed info as recorded
 	 * by the different kernel functions.
 	 */
-	memset((caddr_t)&ac, 0, sizeof(acct_t));
+	memset(&ac, 0, sizeof(acct_t));
 
 	ac.ac_version = ACCT_VERSION | ACCT_BYTEORDER;
 	strlcpy(ac.ac_comm, current->comm, sizeof(ac.ac_comm));
-- 
cgit v1.2.3-71-gd317


From ff01bb4832651c6d25ac509a06a10fcbd75c461c Mon Sep 17 00:00:00 2001
From: Al Viro <viro@zeniv.linux.org.uk>
Date: Fri, 16 Sep 2011 02:31:11 -0400
Subject: fs: move code out of buffer.c

Move invalidate_bdev, block_sync_page into fs/block_dev.c.  Export
kill_bdev as well, so brd doesn't have to open code it.  Reduce
buffer_head.h requirement accordingly.

Removed a rather large comment from invalidate_bdev, as it looked a bit
obsolete to bother moving.  The small comment replacing it says enough.

Signed-off-by: Nick Piggin <npiggin@suse.de>
Cc: Al Viro <viro@ZenIV.linux.org.uk>
Cc: Christoph Hellwig <hch@lst.de>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Al Viro <viro@zeniv.linux.org.uk>
---
 arch/powerpc/sysdev/axonram.c   |  1 -
 block/genhd.c                   |  1 -
 block/ioctl.c                   |  2 +-
 drivers/block/amiflop.c         |  2 +-
 drivers/block/brd.c             |  9 ++++----
 drivers/block/floppy.c          |  1 -
 drivers/block/loop.c            |  1 -
 drivers/cdrom/cdrom.c           |  1 -
 drivers/md/dm.c                 |  1 -
 drivers/md/md.c                 |  3 +--
 drivers/mtd/devices/block2mtd.c |  1 -
 drivers/s390/block/dasd.c       |  1 -
 drivers/scsi/scsicam.c          |  1 -
 drivers/tty/sysrq.c             |  2 +-
 fs/block_dev.c                  | 30 ++++++++++++++++++++++---
 fs/buffer.c                     | 50 -----------------------------------------
 fs/cachefiles/interface.c       |  1 -
 fs/cramfs/inode.c               |  1 -
 fs/fs-writeback.c               |  1 -
 fs/libfs.c                      |  2 +-
 fs/quota/dquot.c                |  1 -
 fs/quota/quota.c                |  1 -
 fs/splice.c                     |  1 -
 fs/sync.c                       |  1 -
 include/linux/fs.h              |  3 +++
 kernel/power/swap.c             |  1 -
 mm/page-writeback.c             |  2 +-
 mm/swap_state.c                 |  1 -
 28 files changed, 40 insertions(+), 83 deletions(-)

(limited to 'kernel')

diff --git a/arch/powerpc/sysdev/axonram.c b/arch/powerpc/sysdev/axonram.c
index ba4271919062..1c16141c031c 100644
--- a/arch/powerpc/sysdev/axonram.c
+++ b/arch/powerpc/sysdev/axonram.c
@@ -25,7 +25,6 @@
 
 #include <linux/bio.h>
 #include <linux/blkdev.h>
-#include <linux/buffer_head.h>
 #include <linux/device.h>
 #include <linux/errno.h>
 #include <linux/fs.h>
diff --git a/block/genhd.c b/block/genhd.c
index bf443a71b93e..b7d1a0e42686 100644
--- a/block/genhd.c
+++ b/block/genhd.c
@@ -15,7 +15,6 @@
 #include <linux/slab.h>
 #include <linux/kmod.h>
 #include <linux/kobj_map.h>
-#include <linux/buffer_head.h>
 #include <linux/mutex.h>
 #include <linux/idr.h>
 #include <linux/log2.h>
diff --git a/block/ioctl.c b/block/ioctl.c
index ca939fc1030f..91e7b19c86f4 100644
--- a/block/ioctl.c
+++ b/block/ioctl.c
@@ -5,7 +5,7 @@
 #include <linux/blkpg.h>
 #include <linux/hdreg.h>
 #include <linux/backing-dev.h>
-#include <linux/buffer_head.h>
+#include <linux/fs.h>
 #include <linux/blktrace_api.h>
 #include <asm/uaccess.h>
 
diff --git a/drivers/block/amiflop.c b/drivers/block/amiflop.c
index 8eba86bba599..386146d792d1 100644
--- a/drivers/block/amiflop.c
+++ b/drivers/block/amiflop.c
@@ -63,7 +63,7 @@
 #include <linux/mutex.h>
 #include <linux/amifdreg.h>
 #include <linux/amifd.h>
-#include <linux/buffer_head.h>
+#include <linux/fs.h>
 #include <linux/blkdev.h>
 #include <linux/elevator.h>
 #include <linux/interrupt.h>
diff --git a/drivers/block/brd.c b/drivers/block/brd.c
index d22119d49e53..ec246437f5a4 100644
--- a/drivers/block/brd.c
+++ b/drivers/block/brd.c
@@ -17,7 +17,7 @@
 #include <linux/highmem.h>
 #include <linux/mutex.h>
 #include <linux/radix-tree.h>
-#include <linux/buffer_head.h> /* invalidate_bh_lrus() */
+#include <linux/fs.h>
 #include <linux/slab.h>
 
 #include <asm/uaccess.h>
@@ -402,14 +402,13 @@ static int brd_ioctl(struct block_device *bdev, fmode_t mode,
 	error = -EBUSY;
 	if (bdev->bd_openers <= 1) {
 		/*
-		 * Invalidate the cache first, so it isn't written
-		 * back to the device.
+		 * Kill the cache first, so it isn't written back to the
+		 * device.
 		 *
 		 * Another thread might instantiate more buffercache here,
 		 * but there is not much we can do to close that race.
 		 */
-		invalidate_bh_lrus();
-		truncate_inode_pages(bdev->bd_inode->i_mapping, 0);
+		kill_bdev(bdev);
 		brd_free_pages(brd);
 		error = 0;
 	}
diff --git a/drivers/block/floppy.c b/drivers/block/floppy.c
index 9955a53733b2..510fb10ec45a 100644
--- a/drivers/block/floppy.c
+++ b/drivers/block/floppy.c
@@ -188,7 +188,6 @@ static int print_unex = 1;
 #include <linux/init.h>
 #include <linux/platform_device.h>
 #include <linux/mod_devicetable.h>
-#include <linux/buffer_head.h>	/* for invalidate_buffers() */
 #include <linux/mutex.h>
 #include <linux/io.h>
 #include <linux/uaccess.h>
diff --git a/drivers/block/loop.c b/drivers/block/loop.c
index 1e888c9e85b3..f00257782fcc 100644
--- a/drivers/block/loop.c
+++ b/drivers/block/loop.c
@@ -69,7 +69,6 @@
 #include <linux/freezer.h>
 #include <linux/mutex.h>
 #include <linux/writeback.h>
-#include <linux/buffer_head.h>		/* for invalidate_bdev() */
 #include <linux/completion.h>
 #include <linux/highmem.h>
 #include <linux/kthread.h>
diff --git a/drivers/cdrom/cdrom.c b/drivers/cdrom/cdrom.c
index f997c27d79e2..2118211aff99 100644
--- a/drivers/cdrom/cdrom.c
+++ b/drivers/cdrom/cdrom.c
@@ -267,7 +267,6 @@
 
 #include <linux/module.h>
 #include <linux/fs.h>
-#include <linux/buffer_head.h>
 #include <linux/major.h>
 #include <linux/types.h>
 #include <linux/errno.h>
diff --git a/drivers/md/dm.c b/drivers/md/dm.c
index 4720f68f817e..b89c548ec3f8 100644
--- a/drivers/md/dm.c
+++ b/drivers/md/dm.c
@@ -14,7 +14,6 @@
 #include <linux/moduleparam.h>
 #include <linux/blkpg.h>
 #include <linux/bio.h>
-#include <linux/buffer_head.h>
 #include <linux/mempool.h>
 #include <linux/slab.h>
 #include <linux/idr.h>
diff --git a/drivers/md/md.c b/drivers/md/md.c
index f47f1f8ac44b..5d1b6762f108 100644
--- a/drivers/md/md.c
+++ b/drivers/md/md.c
@@ -36,8 +36,7 @@
 #include <linux/blkdev.h>
 #include <linux/sysctl.h>
 #include <linux/seq_file.h>
-#include <linux/mutex.h>
-#include <linux/buffer_head.h> /* for invalidate_bdev */
+#include <linux/fs.h>
 #include <linux/poll.h>
 #include <linux/ctype.h>
 #include <linux/string.h>
diff --git a/drivers/mtd/devices/block2mtd.c b/drivers/mtd/devices/block2mtd.c
index b78f23169d4e..ebeabc727f70 100644
--- a/drivers/mtd/devices/block2mtd.c
+++ b/drivers/mtd/devices/block2mtd.c
@@ -14,7 +14,6 @@
 #include <linux/list.h>
 #include <linux/init.h>
 #include <linux/mtd/mtd.h>
-#include <linux/buffer_head.h>
 #include <linux/mutex.h>
 #include <linux/mount.h>
 #include <linux/slab.h>
diff --git a/drivers/s390/block/dasd.c b/drivers/s390/block/dasd.c
index 65894f05a801..2de5b60ee8c8 100644
--- a/drivers/s390/block/dasd.c
+++ b/drivers/s390/block/dasd.c
@@ -17,7 +17,6 @@
 #include <linux/ctype.h>
 #include <linux/major.h>
 #include <linux/slab.h>
-#include <linux/buffer_head.h>
 #include <linux/hdreg.h>
 #include <linux/async.h>
 #include <linux/mutex.h>
diff --git a/drivers/scsi/scsicam.c b/drivers/scsi/scsicam.c
index 6803b1e26ecc..92d24d6dcb39 100644
--- a/drivers/scsi/scsicam.c
+++ b/drivers/scsi/scsicam.c
@@ -16,7 +16,6 @@
 #include <linux/genhd.h>
 #include <linux/kernel.h>
 #include <linux/blkdev.h>
-#include <linux/buffer_head.h>
 #include <asm/unaligned.h>
 
 #include <scsi/scsicam.h>
diff --git a/drivers/tty/sysrq.c b/drivers/tty/sysrq.c
index 43db715f1502..7867b7c4538e 100644
--- a/drivers/tty/sysrq.c
+++ b/drivers/tty/sysrq.c
@@ -32,7 +32,6 @@
 #include <linux/module.h>
 #include <linux/suspend.h>
 #include <linux/writeback.h>
-#include <linux/buffer_head.h>		/* for fsync_bdev() */
 #include <linux/swap.h>
 #include <linux/spinlock.h>
 #include <linux/vt_kern.h>
@@ -41,6 +40,7 @@
 #include <linux/oom.h>
 #include <linux/slab.h>
 #include <linux/input.h>
+#include <linux/uaccess.h>
 
 #include <asm/ptrace.h>
 #include <asm/irq_regs.h>
diff --git a/fs/block_dev.c b/fs/block_dev.c
index 7866cdd9fe70..69a5b6fbee2b 100644
--- a/fs/block_dev.c
+++ b/fs/block_dev.c
@@ -17,6 +17,7 @@
 #include <linux/module.h>
 #include <linux/blkpg.h>
 #include <linux/buffer_head.h>
+#include <linux/swap.h>
 #include <linux/pagevec.h>
 #include <linux/writeback.h>
 #include <linux/mpage.h>
@@ -25,6 +26,7 @@
 #include <linux/namei.h>
 #include <linux/log2.h>
 #include <linux/kmemleak.h>
+#include <linux/cleancache.h>
 #include <asm/uaccess.h>
 #include "internal.h"
 
@@ -82,13 +84,35 @@ static sector_t max_block(struct block_device *bdev)
 }
 
 /* Kill _all_ buffers and pagecache , dirty or not.. */
-static void kill_bdev(struct block_device *bdev)
+void kill_bdev(struct block_device *bdev)
 {
-	if (bdev->bd_inode->i_mapping->nrpages == 0)
+	struct address_space *mapping = bdev->bd_inode->i_mapping;
+
+	if (mapping->nrpages == 0)
 		return;
+
 	invalidate_bh_lrus();
-	truncate_inode_pages(bdev->bd_inode->i_mapping, 0);
+	truncate_inode_pages(mapping, 0);
 }	
+EXPORT_SYMBOL(kill_bdev);
+
+/* Invalidate clean unused buffers and pagecache. */
+void invalidate_bdev(struct block_device *bdev)
+{
+	struct address_space *mapping = bdev->bd_inode->i_mapping;
+
+	if (mapping->nrpages == 0)
+		return;
+
+	invalidate_bh_lrus();
+	lru_add_drain_all();	/* make sure all lru add caches are flushed */
+	invalidate_mapping_pages(mapping, 0, -1);
+	/* 99% of the time, we don't need to flush the cleancache on the bdev.
+	 * But, for the strange corners, lets be cautious
+	 */
+	cleancache_flush_inode(mapping);
+}
+EXPORT_SYMBOL(invalidate_bdev);
 
 int set_blocksize(struct block_device *bdev, int size)
 {
diff --git a/fs/buffer.c b/fs/buffer.c
index 19d8eb7fdc81..1a30db77af32 100644
--- a/fs/buffer.c
+++ b/fs/buffer.c
@@ -41,7 +41,6 @@
 #include <linux/bitops.h>
 #include <linux/mpage.h>
 #include <linux/bit_spinlock.h>
-#include <linux/cleancache.h>
 
 static int fsync_buffers_list(spinlock_t *lock, struct list_head *list);
 
@@ -231,55 +230,6 @@ out:
 	return ret;
 }
 
-/* If invalidate_buffers() will trash dirty buffers, it means some kind
-   of fs corruption is going on. Trashing dirty data always imply losing
-   information that was supposed to be just stored on the physical layer
-   by the user.
-
-   Thus invalidate_buffers in general usage is not allwowed to trash
-   dirty buffers. For example ioctl(FLSBLKBUF) expects dirty data to
-   be preserved.  These buffers are simply skipped.
-  
-   We also skip buffers which are still in use.  For example this can
-   happen if a userspace program is reading the block device.
-
-   NOTE: In the case where the user removed a removable-media-disk even if
-   there's still dirty data not synced on disk (due a bug in the device driver
-   or due an error of the user), by not destroying the dirty buffers we could
-   generate corruption also on the next media inserted, thus a parameter is
-   necessary to handle this case in the most safe way possible (trying
-   to not corrupt also the new disk inserted with the data belonging to
-   the old now corrupted disk). Also for the ramdisk the natural thing
-   to do in order to release the ramdisk memory is to destroy dirty buffers.
-
-   These are two special cases. Normal usage imply the device driver
-   to issue a sync on the device (without waiting I/O completion) and
-   then an invalidate_buffers call that doesn't trash dirty buffers.
-
-   For handling cache coherency with the blkdev pagecache the 'update' case
-   is been introduced. It is needed to re-read from disk any pinned
-   buffer. NOTE: re-reading from disk is destructive so we can do it only
-   when we assume nobody is changing the buffercache under our I/O and when
-   we think the disk contains more recent information than the buffercache.
-   The update == 1 pass marks the buffers we need to update, the update == 2
-   pass does the actual I/O. */
-void invalidate_bdev(struct block_device *bdev)
-{
-	struct address_space *mapping = bdev->bd_inode->i_mapping;
-
-	if (mapping->nrpages == 0)
-		return;
-
-	invalidate_bh_lrus();
-	lru_add_drain_all();	/* make sure all lru add caches are flushed */
-	invalidate_mapping_pages(mapping, 0, -1);
-	/* 99% of the time, we don't need to flush the cleancache on the bdev.
-	 * But, for the strange corners, lets be cautious
-	 */
-	cleancache_flush_inode(mapping);
-}
-EXPORT_SYMBOL(invalidate_bdev);
-
 /*
  * Kick the writeback threads then try to free up some ZONE_NORMAL memory.
  */
diff --git a/fs/cachefiles/interface.c b/fs/cachefiles/interface.c
index 1064805e653b..67bef6d01484 100644
--- a/fs/cachefiles/interface.c
+++ b/fs/cachefiles/interface.c
@@ -11,7 +11,6 @@
 
 #include <linux/slab.h>
 #include <linux/mount.h>
-#include <linux/buffer_head.h>
 #include "internal.h"
 
 #define list_to_page(head) (list_entry((head)->prev, struct page, lru))
diff --git a/fs/cramfs/inode.c b/fs/cramfs/inode.c
index 739fb59bcdc2..c37adb222113 100644
--- a/fs/cramfs/inode.c
+++ b/fs/cramfs/inode.c
@@ -20,7 +20,6 @@
 #include <linux/cramfs_fs.h>
 #include <linux/slab.h>
 #include <linux/cramfs_fs_sb.h>
-#include <linux/buffer_head.h>
 #include <linux/vfs.h>
 #include <linux/mutex.h>
 
diff --git a/fs/fs-writeback.c b/fs/fs-writeback.c
index 517f211a3bd4..80a4574028f1 100644
--- a/fs/fs-writeback.c
+++ b/fs/fs-writeback.c
@@ -25,7 +25,6 @@
 #include <linux/writeback.h>
 #include <linux/blkdev.h>
 #include <linux/backing-dev.h>
-#include <linux/buffer_head.h>
 #include <linux/tracepoint.h>
 #include "internal.h"
 
diff --git a/fs/libfs.c b/fs/libfs.c
index f6d411eef1e7..5b2dbb3ba4fc 100644
--- a/fs/libfs.c
+++ b/fs/libfs.c
@@ -12,7 +12,7 @@
 #include <linux/mutex.h>
 #include <linux/exportfs.h>
 #include <linux/writeback.h>
-#include <linux/buffer_head.h>
+#include <linux/buffer_head.h> /* sync_mapping_buffers */
 
 #include <asm/uaccess.h>
 
diff --git a/fs/quota/dquot.c b/fs/quota/dquot.c
index 5b572c89e6c4..5d81e92daf83 100644
--- a/fs/quota/dquot.c
+++ b/fs/quota/dquot.c
@@ -73,7 +73,6 @@
 #include <linux/security.h>
 #include <linux/kmod.h>
 #include <linux/namei.h>
-#include <linux/buffer_head.h>
 #include <linux/capability.h>
 #include <linux/quotaops.h>
 #include "../internal.h" /* ugh */
diff --git a/fs/quota/quota.c b/fs/quota/quota.c
index 35f4b0ecdeb3..7898cd688a00 100644
--- a/fs/quota/quota.c
+++ b/fs/quota/quota.c
@@ -13,7 +13,6 @@
 #include <linux/kernel.h>
 #include <linux/security.h>
 #include <linux/syscalls.h>
-#include <linux/buffer_head.h>
 #include <linux/capability.h>
 #include <linux/quotaops.h>
 #include <linux/types.h>
diff --git a/fs/splice.c b/fs/splice.c
index fa2defa8afcf..1ec0493266b3 100644
--- a/fs/splice.c
+++ b/fs/splice.c
@@ -25,7 +25,6 @@
 #include <linux/mm_inline.h>
 #include <linux/swap.h>
 #include <linux/writeback.h>
-#include <linux/buffer_head.h>
 #include <linux/module.h>
 #include <linux/syscalls.h>
 #include <linux/uio.h>
diff --git a/fs/sync.c b/fs/sync.c
index 101b8ef901d7..f3501ef39235 100644
--- a/fs/sync.c
+++ b/fs/sync.c
@@ -14,7 +14,6 @@
 #include <linux/linkage.h>
 #include <linux/pagemap.h>
 #include <linux/quotaops.h>
-#include <linux/buffer_head.h>
 #include <linux/backing-dev.h>
 #include "internal.h"
 
diff --git a/include/linux/fs.h b/include/linux/fs.h
index cec429d76ab0..e853ba5eddd4 100644
--- a/include/linux/fs.h
+++ b/include/linux/fs.h
@@ -2092,6 +2092,7 @@ extern void bd_forget(struct inode *inode);
 extern void bdput(struct block_device *);
 extern void invalidate_bdev(struct block_device *);
 extern int sync_blockdev(struct block_device *bdev);
+extern void kill_bdev(struct block_device *);
 extern struct super_block *freeze_bdev(struct block_device *);
 extern void emergency_thaw_all(void);
 extern int thaw_bdev(struct block_device *bdev, struct super_block *sb);
@@ -2099,6 +2100,7 @@ extern int fsync_bdev(struct block_device *);
 #else
 static inline void bd_forget(struct inode *inode) {}
 static inline int sync_blockdev(struct block_device *bdev) { return 0; }
+static inline void kill_bdev(struct block_device *bdev) {}
 static inline void invalidate_bdev(struct block_device *bdev) {}
 
 static inline struct super_block *freeze_bdev(struct block_device *sb)
@@ -2415,6 +2417,7 @@ extern ssize_t blkdev_aio_write(struct kiocb *iocb, const struct iovec *iov,
 				unsigned long nr_segs, loff_t pos);
 extern int blkdev_fsync(struct file *filp, loff_t start, loff_t end,
 			int datasync);
+extern void block_sync_page(struct page *page);
 
 /* fs/splice.c */
 extern ssize_t generic_file_splice_read(struct file *, loff_t *,
diff --git a/kernel/power/swap.c b/kernel/power/swap.c
index 11a594c4ba25..3739ecced085 100644
--- a/kernel/power/swap.c
+++ b/kernel/power/swap.c
@@ -18,7 +18,6 @@
 #include <linux/bitops.h>
 #include <linux/genhd.h>
 #include <linux/device.h>
-#include <linux/buffer_head.h>
 #include <linux/bio.h>
 #include <linux/blkdev.h>
 #include <linux/swap.h>
diff --git a/mm/page-writeback.c b/mm/page-writeback.c
index 50f08241f981..8616ef3025a4 100644
--- a/mm/page-writeback.c
+++ b/mm/page-writeback.c
@@ -32,7 +32,7 @@
 #include <linux/sysctl.h>
 #include <linux/cpu.h>
 #include <linux/syscalls.h>
-#include <linux/buffer_head.h>
+#include <linux/buffer_head.h> /* __set_page_dirty_buffers */
 #include <linux/pagevec.h>
 #include <trace/events/writeback.h>
 
diff --git a/mm/swap_state.c b/mm/swap_state.c
index 78cc4d1f6cce..ea6b32d61873 100644
--- a/mm/swap_state.c
+++ b/mm/swap_state.c
@@ -13,7 +13,6 @@
 #include <linux/swapops.h>
 #include <linux/init.h>
 #include <linux/pagemap.h>
-#include <linux/buffer_head.h>
 #include <linux/backing-dev.h>
 #include <linux/pagevec.h>
 #include <linux/migrate.h>
-- 
cgit v1.2.3-71-gd317


From 18bb1db3e7607e4a997d50991a6f9fa5b0f8722c Mon Sep 17 00:00:00 2001
From: Al Viro <viro@zeniv.linux.org.uk>
Date: Tue, 26 Jul 2011 01:41:39 -0400
Subject: switch vfs_mkdir() and ->mkdir() to umode_t

vfs_mkdir() gets int, but immediately drops everything that might not
fit into umode_t and that's the only caller of ->mkdir()...

Signed-off-by: Al Viro <viro@zeniv.linux.org.uk>
---
 Documentation/filesystems/Locking | 2 +-
 Documentation/filesystems/vfs.txt | 2 +-
 drivers/staging/pohmelfs/dir.c    | 2 +-
 fs/9p/vfs_inode.c                 | 2 +-
 fs/9p/vfs_inode_dotl.c            | 2 +-
 fs/affs/affs.h                    | 2 +-
 fs/affs/namei.c                   | 4 ++--
 fs/afs/dir.c                      | 6 +++---
 fs/autofs4/root.c                 | 4 ++--
 fs/bad_inode.c                    | 2 +-
 fs/btrfs/inode.c                  | 2 +-
 fs/ceph/dir.c                     | 4 ++--
 fs/cifs/cifsfs.h                  | 2 +-
 fs/cifs/inode.c                   | 4 ++--
 fs/coda/dir.c                     | 4 ++--
 fs/configfs/dir.c                 | 2 +-
 fs/ecryptfs/inode.c               | 2 +-
 fs/exofs/namei.c                  | 2 +-
 fs/ext2/namei.c                   | 2 +-
 fs/ext3/namei.c                   | 2 +-
 fs/ext4/namei.c                   | 2 +-
 fs/fat/namei_msdos.c              | 2 +-
 fs/fat/namei_vfat.c               | 2 +-
 fs/fuse/dir.c                     | 2 +-
 fs/gfs2/inode.c                   | 2 +-
 fs/hfs/dir.c                      | 2 +-
 fs/hfsplus/dir.c                  | 2 +-
 fs/hostfs/hostfs_kern.c           | 2 +-
 fs/hpfs/namei.c                   | 2 +-
 fs/hugetlbfs/inode.c              | 2 +-
 fs/jffs2/dir.c                    | 4 ++--
 fs/jfs/namei.c                    | 2 +-
 fs/logfs/dir.c                    | 2 +-
 fs/minix/namei.c                  | 2 +-
 fs/namei.c                        | 2 +-
 fs/ncpfs/dir.c                    | 4 ++--
 fs/nfs/dir.c                      | 4 ++--
 fs/nilfs2/namei.c                 | 2 +-
 fs/ocfs2/dlmfs/dlmfs.c            | 2 +-
 fs/ocfs2/namei.c                  | 2 +-
 fs/omfs/dir.c                     | 2 +-
 fs/ramfs/inode.c                  | 2 +-
 fs/reiserfs/namei.c               | 2 +-
 fs/reiserfs/xattr.c               | 2 +-
 fs/sysv/namei.c                   | 2 +-
 fs/ubifs/dir.c                    | 4 ++--
 fs/udf/namei.c                    | 2 +-
 fs/ufs/namei.c                    | 2 +-
 fs/xfs/xfs_iops.c                 | 2 +-
 include/linux/fs.h                | 4 ++--
 include/linux/security.h          | 4 ++--
 kernel/cgroup.c                   | 4 ++--
 mm/shmem.c                        | 2 +-
 security/capability.c             | 2 +-
 security/security.c               | 2 +-
 security/selinux/hooks.c          | 2 +-
 56 files changed, 70 insertions(+), 70 deletions(-)

(limited to 'kernel')

diff --git a/Documentation/filesystems/Locking b/Documentation/filesystems/Locking
index d819ba16a0c7..6c7676d9c0ea 100644
--- a/Documentation/filesystems/Locking
+++ b/Documentation/filesystems/Locking
@@ -43,7 +43,7 @@ ata *);
 	int (*link) (struct dentry *,struct inode *,struct dentry *);
 	int (*unlink) (struct inode *,struct dentry *);
 	int (*symlink) (struct inode *,struct dentry *,const char *);
-	int (*mkdir) (struct inode *,struct dentry *,int);
+	int (*mkdir) (struct inode *,struct dentry *,umode_t);
 	int (*rmdir) (struct inode *,struct dentry *);
 	int (*mknod) (struct inode *,struct dentry *,int,dev_t);
 	int (*rename) (struct inode *, struct dentry *,
diff --git a/Documentation/filesystems/vfs.txt b/Documentation/filesystems/vfs.txt
index 43cbd0821721..0c147c79cdd8 100644
--- a/Documentation/filesystems/vfs.txt
+++ b/Documentation/filesystems/vfs.txt
@@ -346,7 +346,7 @@ struct inode_operations {
 	int (*link) (struct dentry *,struct inode *,struct dentry *);
 	int (*unlink) (struct inode *,struct dentry *);
 	int (*symlink) (struct inode *,struct dentry *,const char *);
-	int (*mkdir) (struct inode *,struct dentry *,int);
+	int (*mkdir) (struct inode *,struct dentry *,umode_t);
 	int (*rmdir) (struct inode *,struct dentry *);
 	int (*mknod) (struct inode *,struct dentry *,int,dev_t);
 	int (*rename) (struct inode *, struct dentry *,
diff --git a/drivers/staging/pohmelfs/dir.c b/drivers/staging/pohmelfs/dir.c
index 7598e77672a5..d3ad4dde991f 100644
--- a/drivers/staging/pohmelfs/dir.c
+++ b/drivers/staging/pohmelfs/dir.c
@@ -667,7 +667,7 @@ static int pohmelfs_create(struct inode *dir, struct dentry *dentry, int mode,
 	return pohmelfs_create_entry(dir, dentry, 0, mode);
 }
 
-static int pohmelfs_mkdir(struct inode *dir, struct dentry *dentry, int mode)
+static int pohmelfs_mkdir(struct inode *dir, struct dentry *dentry, umode_t mode)
 {
 	int err;
 
diff --git a/fs/9p/vfs_inode.c b/fs/9p/vfs_inode.c
index 2310cc9eb402..3e54900f3b7e 100644
--- a/fs/9p/vfs_inode.c
+++ b/fs/9p/vfs_inode.c
@@ -785,7 +785,7 @@ error:
  *
  */
 
-static int v9fs_vfs_mkdir(struct inode *dir, struct dentry *dentry, int mode)
+static int v9fs_vfs_mkdir(struct inode *dir, struct dentry *dentry, umode_t mode)
 {
 	int err;
 	u32 perm;
diff --git a/fs/9p/vfs_inode_dotl.c b/fs/9p/vfs_inode_dotl.c
index 0b5745e21946..87e46b19b21b 100644
--- a/fs/9p/vfs_inode_dotl.c
+++ b/fs/9p/vfs_inode_dotl.c
@@ -395,7 +395,7 @@ err_clunk_old_fid:
  */
 
 static int v9fs_vfs_mkdir_dotl(struct inode *dir,
-			       struct dentry *dentry, int omode)
+			       struct dentry *dentry, umode_t omode)
 {
 	int err;
 	struct v9fs_session_info *v9ses;
diff --git a/fs/affs/affs.h b/fs/affs/affs.h
index c2b9c79eb64e..8abcad7c935f 100644
--- a/fs/affs/affs.h
+++ b/fs/affs/affs.h
@@ -157,7 +157,7 @@ extern int	affs_hash_name(struct super_block *sb, const u8 *name, unsigned int l
 extern struct dentry *affs_lookup(struct inode *dir, struct dentry *dentry, struct nameidata *);
 extern int	affs_unlink(struct inode *dir, struct dentry *dentry);
 extern int	affs_create(struct inode *dir, struct dentry *dentry, int mode, struct nameidata *);
-extern int	affs_mkdir(struct inode *dir, struct dentry *dentry, int mode);
+extern int	affs_mkdir(struct inode *dir, struct dentry *dentry, umode_t mode);
 extern int	affs_rmdir(struct inode *dir, struct dentry *dentry);
 extern int	affs_link(struct dentry *olddentry, struct inode *dir,
 			  struct dentry *dentry);
diff --git a/fs/affs/namei.c b/fs/affs/namei.c
index 780a11dc6318..7bb7660f805d 100644
--- a/fs/affs/namei.c
+++ b/fs/affs/namei.c
@@ -285,12 +285,12 @@ affs_create(struct inode *dir, struct dentry *dentry, int mode, struct nameidata
 }
 
 int
-affs_mkdir(struct inode *dir, struct dentry *dentry, int mode)
+affs_mkdir(struct inode *dir, struct dentry *dentry, umode_t mode)
 {
 	struct inode		*inode;
 	int			 error;
 
-	pr_debug("AFFS: mkdir(%lu,\"%.*s\",0%o)\n",dir->i_ino,
+	pr_debug("AFFS: mkdir(%lu,\"%.*s\",0%ho)\n",dir->i_ino,
 		 (int)dentry->d_name.len,dentry->d_name.name,mode);
 
 	inode = affs_new_inode(dir);
diff --git a/fs/afs/dir.c b/fs/afs/dir.c
index 1b0b19550015..e6ea58abde3b 100644
--- a/fs/afs/dir.c
+++ b/fs/afs/dir.c
@@ -30,7 +30,7 @@ static int afs_lookup_filldir(void *_cookie, const char *name, int nlen,
 				  loff_t fpos, u64 ino, unsigned dtype);
 static int afs_create(struct inode *dir, struct dentry *dentry, int mode,
 		      struct nameidata *nd);
-static int afs_mkdir(struct inode *dir, struct dentry *dentry, int mode);
+static int afs_mkdir(struct inode *dir, struct dentry *dentry, umode_t mode);
 static int afs_rmdir(struct inode *dir, struct dentry *dentry);
 static int afs_unlink(struct inode *dir, struct dentry *dentry);
 static int afs_link(struct dentry *from, struct inode *dir,
@@ -764,7 +764,7 @@ static void afs_d_release(struct dentry *dentry)
 /*
  * create a directory on an AFS filesystem
  */
-static int afs_mkdir(struct inode *dir, struct dentry *dentry, int mode)
+static int afs_mkdir(struct inode *dir, struct dentry *dentry, umode_t mode)
 {
 	struct afs_file_status status;
 	struct afs_callback cb;
@@ -777,7 +777,7 @@ static int afs_mkdir(struct inode *dir, struct dentry *dentry, int mode)
 
 	dvnode = AFS_FS_I(dir);
 
-	_enter("{%x:%u},{%s},%o",
+	_enter("{%x:%u},{%s},%ho",
 	       dvnode->fid.vid, dvnode->fid.vnode, dentry->d_name.name, mode);
 
 	ret = -ENAMETOOLONG;
diff --git a/fs/autofs4/root.c b/fs/autofs4/root.c
index f55ae23b137e..75e5f1c8e028 100644
--- a/fs/autofs4/root.c
+++ b/fs/autofs4/root.c
@@ -26,7 +26,7 @@
 static int autofs4_dir_symlink(struct inode *,struct dentry *,const char *);
 static int autofs4_dir_unlink(struct inode *,struct dentry *);
 static int autofs4_dir_rmdir(struct inode *,struct dentry *);
-static int autofs4_dir_mkdir(struct inode *,struct dentry *,int);
+static int autofs4_dir_mkdir(struct inode *,struct dentry *,umode_t);
 static long autofs4_root_ioctl(struct file *,unsigned int,unsigned long);
 #ifdef CONFIG_COMPAT
 static long autofs4_root_compat_ioctl(struct file *,unsigned int,unsigned long);
@@ -699,7 +699,7 @@ static int autofs4_dir_rmdir(struct inode *dir, struct dentry *dentry)
 	return 0;
 }
 
-static int autofs4_dir_mkdir(struct inode *dir, struct dentry *dentry, int mode)
+static int autofs4_dir_mkdir(struct inode *dir, struct dentry *dentry, umode_t mode)
 {
 	struct autofs_sb_info *sbi = autofs4_sbi(dir->i_sb);
 	struct autofs_info *ino = autofs4_dentry_ino(dentry);
diff --git a/fs/bad_inode.c b/fs/bad_inode.c
index 9205cf25f1c6..5a2738c1f315 100644
--- a/fs/bad_inode.c
+++ b/fs/bad_inode.c
@@ -202,7 +202,7 @@ static int bad_inode_symlink (struct inode *dir, struct dentry *dentry,
 }
 
 static int bad_inode_mkdir(struct inode *dir, struct dentry *dentry,
-			int mode)
+			umode_t mode)
 {
 	return -EIO;
 }
diff --git a/fs/btrfs/inode.c b/fs/btrfs/inode.c
index f8ff9738558a..e30de56e6b62 100644
--- a/fs/btrfs/inode.c
+++ b/fs/btrfs/inode.c
@@ -4792,7 +4792,7 @@ fail:
 	return err;
 }
 
-static int btrfs_mkdir(struct inode *dir, struct dentry *dentry, int mode)
+static int btrfs_mkdir(struct inode *dir, struct dentry *dentry, umode_t mode)
 {
 	struct inode *inode = NULL;
 	struct btrfs_trans_handle *trans;
diff --git a/fs/ceph/dir.c b/fs/ceph/dir.c
index 98954003a8d3..96141ae3d8be 100644
--- a/fs/ceph/dir.c
+++ b/fs/ceph/dir.c
@@ -753,7 +753,7 @@ static int ceph_symlink(struct inode *dir, struct dentry *dentry,
 	return err;
 }
 
-static int ceph_mkdir(struct inode *dir, struct dentry *dentry, int mode)
+static int ceph_mkdir(struct inode *dir, struct dentry *dentry, umode_t mode)
 {
 	struct ceph_fs_client *fsc = ceph_sb_to_client(dir->i_sb);
 	struct ceph_mds_client *mdsc = fsc->mdsc;
@@ -767,7 +767,7 @@ static int ceph_mkdir(struct inode *dir, struct dentry *dentry, int mode)
 		dout("mksnap dir %p snap '%.*s' dn %p\n", dir,
 		     dentry->d_name.len, dentry->d_name.name, dentry);
 	} else if (ceph_snap(dir) == CEPH_NOSNAP) {
-		dout("mkdir dir %p dn %p mode 0%o\n", dir, dentry, mode);
+		dout("mkdir dir %p dn %p mode 0%ho\n", dir, dentry, mode);
 		op = CEPH_MDS_OP_MKDIR;
 	} else {
 		goto out;
diff --git a/fs/cifs/cifsfs.h b/fs/cifs/cifsfs.h
index 30ff56005d8f..add64454fd51 100644
--- a/fs/cifs/cifsfs.h
+++ b/fs/cifs/cifsfs.h
@@ -51,7 +51,7 @@ extern struct dentry *cifs_lookup(struct inode *, struct dentry *,
 extern int cifs_unlink(struct inode *dir, struct dentry *dentry);
 extern int cifs_hardlink(struct dentry *, struct inode *, struct dentry *);
 extern int cifs_mknod(struct inode *, struct dentry *, int, dev_t);
-extern int cifs_mkdir(struct inode *, struct dentry *, int);
+extern int cifs_mkdir(struct inode *, struct dentry *, umode_t);
 extern int cifs_rmdir(struct inode *, struct dentry *);
 extern int cifs_rename(struct inode *, struct dentry *, struct inode *,
 		       struct dentry *);
diff --git a/fs/cifs/inode.c b/fs/cifs/inode.c
index e851d5b8931e..a5f54b7d9822 100644
--- a/fs/cifs/inode.c
+++ b/fs/cifs/inode.c
@@ -1264,7 +1264,7 @@ unlink_out:
 	return rc;
 }
 
-int cifs_mkdir(struct inode *inode, struct dentry *direntry, int mode)
+int cifs_mkdir(struct inode *inode, struct dentry *direntry, umode_t mode)
 {
 	int rc = 0, tmprc;
 	int xid;
@@ -1275,7 +1275,7 @@ int cifs_mkdir(struct inode *inode, struct dentry *direntry, int mode)
 	struct inode *newinode = NULL;
 	struct cifs_fattr fattr;
 
-	cFYI(1, "In cifs_mkdir, mode = 0x%x inode = 0x%p", mode, inode);
+	cFYI(1, "In cifs_mkdir, mode = 0x%hx inode = 0x%p", mode, inode);
 
 	cifs_sb = CIFS_SB(inode->i_sb);
 	tlink = cifs_sb_tlink(cifs_sb);
diff --git a/fs/coda/dir.c b/fs/coda/dir.c
index 28e7e135cfab..a74ae6fcfb7e 100644
--- a/fs/coda/dir.c
+++ b/fs/coda/dir.c
@@ -37,7 +37,7 @@ static int coda_link(struct dentry *old_dentry, struct inode *dir_inode,
 static int coda_unlink(struct inode *dir_inode, struct dentry *entry);
 static int coda_symlink(struct inode *dir_inode, struct dentry *entry,
 			const char *symname);
-static int coda_mkdir(struct inode *dir_inode, struct dentry *entry, int mode);
+static int coda_mkdir(struct inode *dir_inode, struct dentry *entry, umode_t mode);
 static int coda_rmdir(struct inode *dir_inode, struct dentry *entry);
 static int coda_rename(struct inode *old_inode, struct dentry *old_dentry, 
                        struct inode *new_inode, struct dentry *new_dentry);
@@ -223,7 +223,7 @@ err_out:
 	return error;
 }
 
-static int coda_mkdir(struct inode *dir, struct dentry *de, int mode)
+static int coda_mkdir(struct inode *dir, struct dentry *de, umode_t mode)
 {
 	struct inode *inode;
 	struct coda_vattr attrs;
diff --git a/fs/configfs/dir.c b/fs/configfs/dir.c
index 1c5296911104..5ddd7ebd9dcd 100644
--- a/fs/configfs/dir.c
+++ b/fs/configfs/dir.c
@@ -1170,7 +1170,7 @@ void configfs_undepend_item(struct configfs_subsystem *subsys,
 }
 EXPORT_SYMBOL(configfs_undepend_item);
 
-static int configfs_mkdir(struct inode *dir, struct dentry *dentry, int mode)
+static int configfs_mkdir(struct inode *dir, struct dentry *dentry, umode_t mode)
 {
 	int ret = 0;
 	int module_got = 0;
diff --git a/fs/ecryptfs/inode.c b/fs/ecryptfs/inode.c
index 32f90a3ae63e..ebf8726482b6 100644
--- a/fs/ecryptfs/inode.c
+++ b/fs/ecryptfs/inode.c
@@ -559,7 +559,7 @@ out_lock:
 	return rc;
 }
 
-static int ecryptfs_mkdir(struct inode *dir, struct dentry *dentry, int mode)
+static int ecryptfs_mkdir(struct inode *dir, struct dentry *dentry, umode_t mode)
 {
 	int rc;
 	struct dentry *lower_dentry;
diff --git a/fs/exofs/namei.c b/fs/exofs/namei.c
index b54c43775f17..ff1c8286cd69 100644
--- a/fs/exofs/namei.c
+++ b/fs/exofs/namei.c
@@ -153,7 +153,7 @@ static int exofs_link(struct dentry *old_dentry, struct inode *dir,
 	return exofs_add_nondir(dentry, inode);
 }
 
-static int exofs_mkdir(struct inode *dir, struct dentry *dentry, int mode)
+static int exofs_mkdir(struct inode *dir, struct dentry *dentry, umode_t mode)
 {
 	struct inode *inode;
 	int err = -EMLINK;
diff --git a/fs/ext2/namei.c b/fs/ext2/namei.c
index 761fde807fc9..e3f3672b2020 100644
--- a/fs/ext2/namei.c
+++ b/fs/ext2/namei.c
@@ -214,7 +214,7 @@ static int ext2_link (struct dentry * old_dentry, struct inode * dir,
 	return err;
 }
 
-static int ext2_mkdir(struct inode * dir, struct dentry * dentry, int mode)
+static int ext2_mkdir(struct inode * dir, struct dentry * dentry, umode_t mode)
 {
 	struct inode * inode;
 	int err = -EMLINK;
diff --git a/fs/ext3/namei.c b/fs/ext3/namei.c
index 642dc6d66dfd..08ecb53a33ea 100644
--- a/fs/ext3/namei.c
+++ b/fs/ext3/namei.c
@@ -1768,7 +1768,7 @@ retry:
 	return err;
 }
 
-static int ext3_mkdir(struct inode * dir, struct dentry * dentry, int mode)
+static int ext3_mkdir(struct inode * dir, struct dentry * dentry, umode_t mode)
 {
 	handle_t *handle;
 	struct inode * inode;
diff --git a/fs/ext4/namei.c b/fs/ext4/namei.c
index aa4c782c9dd7..e506746724cf 100644
--- a/fs/ext4/namei.c
+++ b/fs/ext4/namei.c
@@ -1806,7 +1806,7 @@ retry:
 	return err;
 }
 
-static int ext4_mkdir(struct inode *dir, struct dentry *dentry, int mode)
+static int ext4_mkdir(struct inode *dir, struct dentry *dentry, umode_t mode)
 {
 	handle_t *handle;
 	struct inode *inode;
diff --git a/fs/fat/namei_msdos.c b/fs/fat/namei_msdos.c
index 216b419f30e2..d1f53cae897c 100644
--- a/fs/fat/namei_msdos.c
+++ b/fs/fat/namei_msdos.c
@@ -346,7 +346,7 @@ out:
 }
 
 /***** Make a directory */
-static int msdos_mkdir(struct inode *dir, struct dentry *dentry, int mode)
+static int msdos_mkdir(struct inode *dir, struct dentry *dentry, umode_t mode)
 {
 	struct super_block *sb = dir->i_sb;
 	struct fat_slot_info sinfo;
diff --git a/fs/fat/namei_vfat.c b/fs/fat/namei_vfat.c
index a87a65663c25..fde2eda6332e 100644
--- a/fs/fat/namei_vfat.c
+++ b/fs/fat/namei_vfat.c
@@ -870,7 +870,7 @@ out:
 	return err;
 }
 
-static int vfat_mkdir(struct inode *dir, struct dentry *dentry, int mode)
+static int vfat_mkdir(struct inode *dir, struct dentry *dentry, umode_t mode)
 {
 	struct super_block *sb = dir->i_sb;
 	struct inode *inode;
diff --git a/fs/fuse/dir.c b/fs/fuse/dir.c
index 9f63e493a9b6..4848a1acb3bb 100644
--- a/fs/fuse/dir.c
+++ b/fs/fuse/dir.c
@@ -585,7 +585,7 @@ static int fuse_create(struct inode *dir, struct dentry *entry, int mode,
 	return fuse_mknod(dir, entry, mode, 0);
 }
 
-static int fuse_mkdir(struct inode *dir, struct dentry *entry, int mode)
+static int fuse_mkdir(struct inode *dir, struct dentry *entry, umode_t mode)
 {
 	struct fuse_mkdir_in inarg;
 	struct fuse_conn *fc = get_fuse_conn(dir);
diff --git a/fs/gfs2/inode.c b/fs/gfs2/inode.c
index cfd4959b218c..eecfc39c07e6 100644
--- a/fs/gfs2/inode.c
+++ b/fs/gfs2/inode.c
@@ -1129,7 +1129,7 @@ static int gfs2_symlink(struct inode *dir, struct dentry *dentry,
  * Returns: errno
  */
 
-static int gfs2_mkdir(struct inode *dir, struct dentry *dentry, int mode)
+static int gfs2_mkdir(struct inode *dir, struct dentry *dentry, umode_t mode)
 {
 	return gfs2_create_inode(dir, dentry, S_IFDIR | mode, 0, NULL, 0, 0);
 }
diff --git a/fs/hfs/dir.c b/fs/hfs/dir.c
index bce4eef91a06..06dc161e911c 100644
--- a/fs/hfs/dir.c
+++ b/fs/hfs/dir.c
@@ -216,7 +216,7 @@ static int hfs_create(struct inode *dir, struct dentry *dentry, int mode,
  * in a directory, given the inode for the parent directory and the
  * name (and its length) of the new directory.
  */
-static int hfs_mkdir(struct inode *dir, struct dentry *dentry, int mode)
+static int hfs_mkdir(struct inode *dir, struct dentry *dentry, umode_t mode)
 {
 	struct inode *inode;
 	int res;
diff --git a/fs/hfsplus/dir.c b/fs/hfsplus/dir.c
index 4536cd3f15ae..ed321f0384d7 100644
--- a/fs/hfsplus/dir.c
+++ b/fs/hfsplus/dir.c
@@ -459,7 +459,7 @@ static int hfsplus_create(struct inode *dir, struct dentry *dentry, int mode,
 	return hfsplus_mknod(dir, dentry, mode, 0);
 }
 
-static int hfsplus_mkdir(struct inode *dir, struct dentry *dentry, int mode)
+static int hfsplus_mkdir(struct inode *dir, struct dentry *dentry, umode_t mode)
 {
 	return hfsplus_mknod(dir, dentry, mode | S_IFDIR, 0);
 }
diff --git a/fs/hostfs/hostfs_kern.c b/fs/hostfs/hostfs_kern.c
index 343ea632b97c..d35240fbbd73 100644
--- a/fs/hostfs/hostfs_kern.c
+++ b/fs/hostfs/hostfs_kern.c
@@ -676,7 +676,7 @@ int hostfs_symlink(struct inode *ino, struct dentry *dentry, const char *to)
 	return err;
 }
 
-int hostfs_mkdir(struct inode *ino, struct dentry *dentry, int mode)
+int hostfs_mkdir(struct inode *ino, struct dentry *dentry, umode_t mode)
 {
 	char *file;
 	int err;
diff --git a/fs/hpfs/namei.c b/fs/hpfs/namei.c
index ea91fcb0ef9b..a2f89f2b9503 100644
--- a/fs/hpfs/namei.c
+++ b/fs/hpfs/namei.c
@@ -8,7 +8,7 @@
 #include <linux/sched.h>
 #include "hpfs_fn.h"
 
-static int hpfs_mkdir(struct inode *dir, struct dentry *dentry, int mode)
+static int hpfs_mkdir(struct inode *dir, struct dentry *dentry, umode_t mode)
 {
 	const unsigned char *name = dentry->d_name.name;
 	unsigned len = dentry->d_name.len;
diff --git a/fs/hugetlbfs/inode.c b/fs/hugetlbfs/inode.c
index 9c4ec538725b..ba269706e798 100644
--- a/fs/hugetlbfs/inode.c
+++ b/fs/hugetlbfs/inode.c
@@ -523,7 +523,7 @@ static int hugetlbfs_mknod(struct inode *dir,
 	return error;
 }
 
-static int hugetlbfs_mkdir(struct inode *dir, struct dentry *dentry, int mode)
+static int hugetlbfs_mkdir(struct inode *dir, struct dentry *dentry, umode_t mode)
 {
 	int retval = hugetlbfs_mknod(dir, dentry, mode | S_IFDIR, 0);
 	if (!retval)
diff --git a/fs/jffs2/dir.c b/fs/jffs2/dir.c
index be6169bd8acd..5dc458f19bc9 100644
--- a/fs/jffs2/dir.c
+++ b/fs/jffs2/dir.c
@@ -29,7 +29,7 @@ static struct dentry *jffs2_lookup (struct inode *,struct dentry *,
 static int jffs2_link (struct dentry *,struct inode *,struct dentry *);
 static int jffs2_unlink (struct inode *,struct dentry *);
 static int jffs2_symlink (struct inode *,struct dentry *,const char *);
-static int jffs2_mkdir (struct inode *,struct dentry *,int);
+static int jffs2_mkdir (struct inode *,struct dentry *,umode_t);
 static int jffs2_rmdir (struct inode *,struct dentry *);
 static int jffs2_mknod (struct inode *,struct dentry *,int,dev_t);
 static int jffs2_rename (struct inode *, struct dentry *,
@@ -450,7 +450,7 @@ static int jffs2_symlink (struct inode *dir_i, struct dentry *dentry, const char
 }
 
 
-static int jffs2_mkdir (struct inode *dir_i, struct dentry *dentry, int mode)
+static int jffs2_mkdir (struct inode *dir_i, struct dentry *dentry, umode_t mode)
 {
 	struct jffs2_inode_info *f, *dir_f;
 	struct jffs2_sb_info *c;
diff --git a/fs/jfs/namei.c b/fs/jfs/namei.c
index a112ad96e474..17ea85835715 100644
--- a/fs/jfs/namei.c
+++ b/fs/jfs/namei.c
@@ -205,7 +205,7 @@ static int jfs_create(struct inode *dip, struct dentry *dentry, int mode,
  * note:
  * EACCESS: user needs search+write permission on the parent directory
  */
-static int jfs_mkdir(struct inode *dip, struct dentry *dentry, int mode)
+static int jfs_mkdir(struct inode *dip, struct dentry *dentry, umode_t mode)
 {
 	int rc = 0;
 	tid_t tid;		/* transaction id */
diff --git a/fs/logfs/dir.c b/fs/logfs/dir.c
index b7d7f67cee5a..25c5cbf8c123 100644
--- a/fs/logfs/dir.c
+++ b/fs/logfs/dir.c
@@ -482,7 +482,7 @@ out:
 	return ret;
 }
 
-static int logfs_mkdir(struct inode *dir, struct dentry *dentry, int mode)
+static int logfs_mkdir(struct inode *dir, struct dentry *dentry, umode_t mode)
 {
 	struct inode *inode;
 
diff --git a/fs/minix/namei.c b/fs/minix/namei.c
index 6e6777f1b4b2..0e7a1a22e554 100644
--- a/fs/minix/namei.c
+++ b/fs/minix/namei.c
@@ -103,7 +103,7 @@ static int minix_link(struct dentry * old_dentry, struct inode * dir,
 	return add_nondir(dentry, inode);
 }
 
-static int minix_mkdir(struct inode * dir, struct dentry *dentry, int mode)
+static int minix_mkdir(struct inode * dir, struct dentry *dentry, umode_t mode)
 {
 	struct inode * inode;
 	int err = -EMLINK;
diff --git a/fs/namei.c b/fs/namei.c
index f6b3c73e862c..443c703249b3 100644
--- a/fs/namei.c
+++ b/fs/namei.c
@@ -2541,7 +2541,7 @@ SYSCALL_DEFINE3(mknod, const char __user *, filename, umode_t, mode, unsigned, d
 	return sys_mknodat(AT_FDCWD, filename, mode, dev);
 }
 
-int vfs_mkdir(struct inode *dir, struct dentry *dentry, int mode)
+int vfs_mkdir(struct inode *dir, struct dentry *dentry, umode_t mode)
 {
 	int error = may_create(dir, dentry);
 
diff --git a/fs/ncpfs/dir.c b/fs/ncpfs/dir.c
index 9c51f621e901..dfb51f084407 100644
--- a/fs/ncpfs/dir.c
+++ b/fs/ncpfs/dir.c
@@ -33,7 +33,7 @@ static int ncp_readdir(struct file *, void *, filldir_t);
 static int ncp_create(struct inode *, struct dentry *, int, struct nameidata *);
 static struct dentry *ncp_lookup(struct inode *, struct dentry *, struct nameidata *);
 static int ncp_unlink(struct inode *, struct dentry *);
-static int ncp_mkdir(struct inode *, struct dentry *, int);
+static int ncp_mkdir(struct inode *, struct dentry *, umode_t);
 static int ncp_rmdir(struct inode *, struct dentry *);
 static int ncp_rename(struct inode *, struct dentry *,
 	  	      struct inode *, struct dentry *);
@@ -985,7 +985,7 @@ static int ncp_create(struct inode *dir, struct dentry *dentry, int mode,
 	return ncp_create_new(dir, dentry, mode, 0, 0);
 }
 
-static int ncp_mkdir(struct inode *dir, struct dentry *dentry, int mode)
+static int ncp_mkdir(struct inode *dir, struct dentry *dentry, umode_t mode)
 {
 	struct ncp_entry_info finfo;
 	struct ncp_server *server = NCP_SERVER(dir);
diff --git a/fs/nfs/dir.c b/fs/nfs/dir.c
index 23be134b3193..5d67d92a4248 100644
--- a/fs/nfs/dir.c
+++ b/fs/nfs/dir.c
@@ -48,7 +48,7 @@ static int nfs_closedir(struct inode *, struct file *);
 static int nfs_readdir(struct file *, void *, filldir_t);
 static struct dentry *nfs_lookup(struct inode *, struct dentry *, struct nameidata *);
 static int nfs_create(struct inode *, struct dentry *, int, struct nameidata *);
-static int nfs_mkdir(struct inode *, struct dentry *, int);
+static int nfs_mkdir(struct inode *, struct dentry *, umode_t);
 static int nfs_rmdir(struct inode *, struct dentry *);
 static int nfs_unlink(struct inode *, struct dentry *);
 static int nfs_symlink(struct inode *, struct dentry *, const char *);
@@ -1719,7 +1719,7 @@ out_err:
 /*
  * See comments for nfs_proc_create regarding failed operations.
  */
-static int nfs_mkdir(struct inode *dir, struct dentry *dentry, int mode)
+static int nfs_mkdir(struct inode *dir, struct dentry *dentry, umode_t mode)
 {
 	struct iattr attr;
 	int error;
diff --git a/fs/nilfs2/namei.c b/fs/nilfs2/namei.c
index 768982de10e4..e5e7311f1b92 100644
--- a/fs/nilfs2/namei.c
+++ b/fs/nilfs2/namei.c
@@ -213,7 +213,7 @@ static int nilfs_link(struct dentry *old_dentry, struct inode *dir,
 	return err;
 }
 
-static int nilfs_mkdir(struct inode *dir, struct dentry *dentry, int mode)
+static int nilfs_mkdir(struct inode *dir, struct dentry *dentry, umode_t mode)
 {
 	struct inode *inode;
 	struct nilfs_transaction_info ti;
diff --git a/fs/ocfs2/dlmfs/dlmfs.c b/fs/ocfs2/dlmfs/dlmfs.c
index a9f007de1da8..77c8d8069461 100644
--- a/fs/ocfs2/dlmfs/dlmfs.c
+++ b/fs/ocfs2/dlmfs/dlmfs.c
@@ -488,7 +488,7 @@ static struct inode *dlmfs_get_inode(struct inode *parent,
 /* SMP-safe */
 static int dlmfs_mkdir(struct inode * dir,
 		       struct dentry * dentry,
-		       int mode)
+		       umode_t mode)
 {
 	int status;
 	struct inode *inode = NULL;
diff --git a/fs/ocfs2/namei.c b/fs/ocfs2/namei.c
index a8b2bfea574e..c779f8bfc8a6 100644
--- a/fs/ocfs2/namei.c
+++ b/fs/ocfs2/namei.c
@@ -602,7 +602,7 @@ static int ocfs2_mknod_locked(struct ocfs2_super *osb,
 
 static int ocfs2_mkdir(struct inode *dir,
 		       struct dentry *dentry,
-		       int mode)
+		       umode_t mode)
 {
 	int ret;
 
diff --git a/fs/omfs/dir.c b/fs/omfs/dir.c
index 98e544274390..667dc7ff28c0 100644
--- a/fs/omfs/dir.c
+++ b/fs/omfs/dir.c
@@ -279,7 +279,7 @@ out_free_inode:
 	return err;
 }
 
-static int omfs_mkdir(struct inode *dir, struct dentry *dentry, int mode)
+static int omfs_mkdir(struct inode *dir, struct dentry *dentry, umode_t mode)
 {
 	return omfs_add_node(dir, dentry, mode | S_IFDIR);
 }
diff --git a/fs/ramfs/inode.c b/fs/ramfs/inode.c
index 462ceb38fec6..61972bee0561 100644
--- a/fs/ramfs/inode.c
+++ b/fs/ramfs/inode.c
@@ -106,7 +106,7 @@ ramfs_mknod(struct inode *dir, struct dentry *dentry, int mode, dev_t dev)
 	return error;
 }
 
-static int ramfs_mkdir(struct inode * dir, struct dentry * dentry, int mode)
+static int ramfs_mkdir(struct inode * dir, struct dentry * dentry, umode_t mode)
 {
 	int retval = ramfs_mknod(dir, dentry, mode | S_IFDIR, 0);
 	if (!retval)
diff --git a/fs/reiserfs/namei.c b/fs/reiserfs/namei.c
index 80058e8ce361..763239a7e8dd 100644
--- a/fs/reiserfs/namei.c
+++ b/fs/reiserfs/namei.c
@@ -721,7 +721,7 @@ static int reiserfs_mknod(struct inode *dir, struct dentry *dentry, int mode,
 	return retval;
 }
 
-static int reiserfs_mkdir(struct inode *dir, struct dentry *dentry, int mode)
+static int reiserfs_mkdir(struct inode *dir, struct dentry *dentry, umode_t mode)
 {
 	int retval;
 	struct inode *inode;
diff --git a/fs/reiserfs/xattr.c b/fs/reiserfs/xattr.c
index 6bc346c160e7..c24deda8a8bc 100644
--- a/fs/reiserfs/xattr.c
+++ b/fs/reiserfs/xattr.c
@@ -66,7 +66,7 @@ static int xattr_create(struct inode *dir, struct dentry *dentry, int mode)
 }
 #endif
 
-static int xattr_mkdir(struct inode *dir, struct dentry *dentry, int mode)
+static int xattr_mkdir(struct inode *dir, struct dentry *dentry, umode_t mode)
 {
 	BUG_ON(!mutex_is_locked(&dir->i_mutex));
 	return dir->i_op->mkdir(dir, dentry, mode);
diff --git a/fs/sysv/namei.c b/fs/sysv/namei.c
index e474fbcf8bde..3368425a4ce2 100644
--- a/fs/sysv/namei.c
+++ b/fs/sysv/namei.c
@@ -131,7 +131,7 @@ static int sysv_link(struct dentry * old_dentry, struct inode * dir,
 	return add_nondir(dentry, inode);
 }
 
-static int sysv_mkdir(struct inode * dir, struct dentry *dentry, int mode)
+static int sysv_mkdir(struct inode * dir, struct dentry *dentry, umode_t mode)
 {
 	struct inode * inode;
 	int err = -EMLINK;
diff --git a/fs/ubifs/dir.c b/fs/ubifs/dir.c
index 683492043317..f5102f368160 100644
--- a/fs/ubifs/dir.c
+++ b/fs/ubifs/dir.c
@@ -712,7 +712,7 @@ out_cancel:
 	return err;
 }
 
-static int ubifs_mkdir(struct inode *dir, struct dentry *dentry, int mode)
+static int ubifs_mkdir(struct inode *dir, struct dentry *dentry, umode_t mode)
 {
 	struct inode *inode;
 	struct ubifs_inode *dir_ui = ubifs_inode(dir);
@@ -725,7 +725,7 @@ static int ubifs_mkdir(struct inode *dir, struct dentry *dentry, int mode)
 	 * directory inode.
 	 */
 
-	dbg_gen("dent '%.*s', mode %#x in dir ino %lu",
+	dbg_gen("dent '%.*s', mode %#hx in dir ino %lu",
 		dentry->d_name.len, dentry->d_name.name, mode, dir->i_ino);
 
 	err = ubifs_budget_space(c, &req);
diff --git a/fs/udf/namei.c b/fs/udf/namei.c
index 4639e137222f..7f8ee32842be 100644
--- a/fs/udf/namei.c
+++ b/fs/udf/namei.c
@@ -640,7 +640,7 @@ out:
 	return err;
 }
 
-static int udf_mkdir(struct inode *dir, struct dentry *dentry, int mode)
+static int udf_mkdir(struct inode *dir, struct dentry *dentry, umode_t mode)
 {
 	struct inode *inode;
 	struct udf_fileident_bh fibh;
diff --git a/fs/ufs/namei.c b/fs/ufs/namei.c
index 639d49162241..fa743aaa327c 100644
--- a/fs/ufs/namei.c
+++ b/fs/ufs/namei.c
@@ -180,7 +180,7 @@ static int ufs_link (struct dentry * old_dentry, struct inode * dir,
 	return error;
 }
 
-static int ufs_mkdir(struct inode * dir, struct dentry * dentry, int mode)
+static int ufs_mkdir(struct inode * dir, struct dentry * dentry, umode_t mode)
 {
 	struct inode * inode;
 	int err = -EMLINK;
diff --git a/fs/xfs/xfs_iops.c b/fs/xfs/xfs_iops.c
index 23ce927973a4..99b324d43c98 100644
--- a/fs/xfs/xfs_iops.c
+++ b/fs/xfs/xfs_iops.c
@@ -241,7 +241,7 @@ STATIC int
 xfs_vn_mkdir(
 	struct inode	*dir,
 	struct dentry	*dentry,
-	int		mode)
+	umode_t		mode)
 {
 	return xfs_vn_mknod(dir, dentry, mode|S_IFDIR, 0);
 }
diff --git a/include/linux/fs.h b/include/linux/fs.h
index cec429d76ab0..3f7bd8b12e37 100644
--- a/include/linux/fs.h
+++ b/include/linux/fs.h
@@ -1517,7 +1517,7 @@ extern void unlock_super(struct super_block *);
  * VFS helper functions..
  */
 extern int vfs_create(struct inode *, struct dentry *, int, struct nameidata *);
-extern int vfs_mkdir(struct inode *, struct dentry *, int);
+extern int vfs_mkdir(struct inode *, struct dentry *, umode_t);
 extern int vfs_mknod(struct inode *, struct dentry *, int, dev_t);
 extern int vfs_symlink(struct inode *, struct dentry *, const char *);
 extern int vfs_link(struct dentry *, struct inode *, struct dentry *);
@@ -1623,7 +1623,7 @@ struct inode_operations {
 	int (*link) (struct dentry *,struct inode *,struct dentry *);
 	int (*unlink) (struct inode *,struct dentry *);
 	int (*symlink) (struct inode *,struct dentry *,const char *);
-	int (*mkdir) (struct inode *,struct dentry *,int);
+	int (*mkdir) (struct inode *,struct dentry *,umode_t);
 	int (*rmdir) (struct inode *,struct dentry *);
 	int (*mknod) (struct inode *,struct dentry *,int,dev_t);
 	int (*rename) (struct inode *, struct dentry *,
diff --git a/include/linux/security.h b/include/linux/security.h
index e8c619d39291..16cbc58cb13b 100644
--- a/include/linux/security.h
+++ b/include/linux/security.h
@@ -1453,7 +1453,7 @@ struct security_operations {
 	int (*inode_unlink) (struct inode *dir, struct dentry *dentry);
 	int (*inode_symlink) (struct inode *dir,
 			      struct dentry *dentry, const char *old_name);
-	int (*inode_mkdir) (struct inode *dir, struct dentry *dentry, int mode);
+	int (*inode_mkdir) (struct inode *dir, struct dentry *dentry, umode_t mode);
 	int (*inode_rmdir) (struct inode *dir, struct dentry *dentry);
 	int (*inode_mknod) (struct inode *dir, struct dentry *dentry,
 			    int mode, dev_t dev);
@@ -1722,7 +1722,7 @@ int security_inode_link(struct dentry *old_dentry, struct inode *dir,
 int security_inode_unlink(struct inode *dir, struct dentry *dentry);
 int security_inode_symlink(struct inode *dir, struct dentry *dentry,
 			   const char *old_name);
-int security_inode_mkdir(struct inode *dir, struct dentry *dentry, int mode);
+int security_inode_mkdir(struct inode *dir, struct dentry *dentry, umode_t mode);
 int security_inode_rmdir(struct inode *dir, struct dentry *dentry);
 int security_inode_mknod(struct inode *dir, struct dentry *dentry, int mode, dev_t dev);
 int security_inode_rename(struct inode *old_dir, struct dentry *old_dentry,
diff --git a/kernel/cgroup.c b/kernel/cgroup.c
index a184470cf9b5..b37a0ea55114 100644
--- a/kernel/cgroup.c
+++ b/kernel/cgroup.c
@@ -760,7 +760,7 @@ EXPORT_SYMBOL_GPL(cgroup_unlock);
  * -> cgroup_mkdir.
  */
 
-static int cgroup_mkdir(struct inode *dir, struct dentry *dentry, int mode);
+static int cgroup_mkdir(struct inode *dir, struct dentry *dentry, umode_t mode);
 static struct dentry *cgroup_lookup(struct inode *, struct dentry *, struct nameidata *);
 static int cgroup_rmdir(struct inode *unused_dir, struct dentry *dentry);
 static int cgroup_populate_dir(struct cgroup *cgrp);
@@ -3846,7 +3846,7 @@ static long cgroup_create(struct cgroup *parent, struct dentry *dentry,
 	return err;
 }
 
-static int cgroup_mkdir(struct inode *dir, struct dentry *dentry, int mode)
+static int cgroup_mkdir(struct inode *dir, struct dentry *dentry, umode_t mode)
 {
 	struct cgroup *c_parent = dentry->d_parent->d_fsdata;
 
diff --git a/mm/shmem.c b/mm/shmem.c
index c58594c06569..b8a8ddf069d0 100644
--- a/mm/shmem.c
+++ b/mm/shmem.c
@@ -1489,7 +1489,7 @@ shmem_mknod(struct inode *dir, struct dentry *dentry, int mode, dev_t dev)
 	return error;
 }
 
-static int shmem_mkdir(struct inode *dir, struct dentry *dentry, int mode)
+static int shmem_mkdir(struct inode *dir, struct dentry *dentry, umode_t mode)
 {
 	int error;
 
diff --git a/security/capability.c b/security/capability.c
index 2984ea4f776f..ddd17892826a 100644
--- a/security/capability.c
+++ b/security/capability.c
@@ -148,7 +148,7 @@ static int cap_inode_symlink(struct inode *inode, struct dentry *dentry,
 }
 
 static int cap_inode_mkdir(struct inode *inode, struct dentry *dentry,
-			   int mask)
+			   umode_t mask)
 {
 	return 0;
 }
diff --git a/security/security.c b/security/security.c
index e2f684aeb70c..be49eb5768bc 100644
--- a/security/security.c
+++ b/security/security.c
@@ -506,7 +506,7 @@ int security_inode_symlink(struct inode *dir, struct dentry *dentry,
 	return security_ops->inode_symlink(dir, dentry, old_name);
 }
 
-int security_inode_mkdir(struct inode *dir, struct dentry *dentry, int mode)
+int security_inode_mkdir(struct inode *dir, struct dentry *dentry, umode_t mode)
 {
 	if (unlikely(IS_PRIVATE(dir)))
 		return 0;
diff --git a/security/selinux/hooks.c b/security/selinux/hooks.c
index 1126c10a5e82..ad74ad24ce2a 100644
--- a/security/selinux/hooks.c
+++ b/security/selinux/hooks.c
@@ -2618,7 +2618,7 @@ static int selinux_inode_symlink(struct inode *dir, struct dentry *dentry, const
 	return may_create(dir, dentry, SECCLASS_LNK_FILE);
 }
 
-static int selinux_inode_mkdir(struct inode *dir, struct dentry *dentry, int mask)
+static int selinux_inode_mkdir(struct inode *dir, struct dentry *dentry, umode_t mask)
 {
 	return may_create(dir, dentry, SECCLASS_DIR);
 }
-- 
cgit v1.2.3-71-gd317


From f4ae40a6a50a98ac23d4b285f739455e926a473e Mon Sep 17 00:00:00 2001
From: Al Viro <viro@zeniv.linux.org.uk>
Date: Sun, 24 Jul 2011 04:33:43 -0400
Subject: switch debugfs to umode_t

Signed-off-by: Al Viro <viro@zeniv.linux.org.uk>
---
 Documentation/filesystems/debugfs.txt              | 24 +++++------
 arch/arm/mach-msm/smd_debug.c                      |  2 +-
 arch/s390/include/asm/debug.h                      |  4 +-
 arch/s390/kernel/debug.c                           |  8 ++--
 arch/x86/xen/debugfs.c                             |  2 +-
 arch/x86/xen/debugfs.h                             |  2 +-
 drivers/acpi/ec_sys.c                              |  2 +-
 drivers/mmc/card/mmc_test.c                        |  2 +-
 .../net/ethernet/chelsio/cxgb4vf/cxgb4vf_main.c    |  2 +-
 drivers/net/wireless/ath/carl9170/debug.c          |  2 +-
 drivers/net/wireless/libertas/debugfs.c            |  2 +-
 drivers/s390/block/dasd.c                          |  4 +-
 drivers/scsi/bfa/bfad_debugfs.c                    |  2 +-
 fs/debugfs/file.c                                  | 22 +++++------
 fs/debugfs/inode.c                                 | 14 +++----
 fs/ocfs2/cluster/netdebug.c                        |  2 +-
 include/linux/debugfs.h                            | 46 +++++++++++-----------
 include/linux/relay.h                              |  2 +-
 kernel/relay.c                                     |  2 +-
 kernel/trace/blktrace.c                            |  2 +-
 kernel/trace/trace.c                               |  2 +-
 kernel/trace/trace.h                               |  2 +-
 lib/fault-inject.c                                 |  8 ++--
 mm/failslab.c                                      |  2 +-
 mm/page_alloc.c                                    |  2 +-
 25 files changed, 82 insertions(+), 82 deletions(-)

(limited to 'kernel')

diff --git a/Documentation/filesystems/debugfs.txt b/Documentation/filesystems/debugfs.txt
index 742cc06e138f..9281a95d689f 100644
--- a/Documentation/filesystems/debugfs.txt
+++ b/Documentation/filesystems/debugfs.txt
@@ -35,7 +35,7 @@ described below will work.
 
 The most general way to create a file within a debugfs directory is with:
 
-    struct dentry *debugfs_create_file(const char *name, mode_t mode,
+    struct dentry *debugfs_create_file(const char *name, umode_t mode,
 				       struct dentry *parent, void *data,
 				       const struct file_operations *fops);
 
@@ -53,13 +53,13 @@ actually necessary; the debugfs code provides a number of helper functions
 for simple situations.  Files containing a single integer value can be
 created with any of:
 
-    struct dentry *debugfs_create_u8(const char *name, mode_t mode,
+    struct dentry *debugfs_create_u8(const char *name, umode_t mode,
 				     struct dentry *parent, u8 *value);
-    struct dentry *debugfs_create_u16(const char *name, mode_t mode,
+    struct dentry *debugfs_create_u16(const char *name, umode_t mode,
 				      struct dentry *parent, u16 *value);
-    struct dentry *debugfs_create_u32(const char *name, mode_t mode,
+    struct dentry *debugfs_create_u32(const char *name, umode_t mode,
 				      struct dentry *parent, u32 *value);
-    struct dentry *debugfs_create_u64(const char *name, mode_t mode,
+    struct dentry *debugfs_create_u64(const char *name, umode_t mode,
 				      struct dentry *parent, u64 *value);
 
 These files support both reading and writing the given value; if a specific
@@ -67,13 +67,13 @@ file should not be written to, simply set the mode bits accordingly.  The
 values in these files are in decimal; if hexadecimal is more appropriate,
 the following functions can be used instead:
 
-    struct dentry *debugfs_create_x8(const char *name, mode_t mode,
+    struct dentry *debugfs_create_x8(const char *name, umode_t mode,
 				     struct dentry *parent, u8 *value);
-    struct dentry *debugfs_create_x16(const char *name, mode_t mode,
+    struct dentry *debugfs_create_x16(const char *name, umode_t mode,
 				      struct dentry *parent, u16 *value);
-    struct dentry *debugfs_create_x32(const char *name, mode_t mode,
+    struct dentry *debugfs_create_x32(const char *name, umode_t mode,
 				      struct dentry *parent, u32 *value);
-    struct dentry *debugfs_create_x64(const char *name, mode_t mode,
+    struct dentry *debugfs_create_x64(const char *name, umode_t mode,
 				      struct dentry *parent, u64 *value);
 
 These functions are useful as long as the developer knows the size of the
@@ -81,7 +81,7 @@ value to be exported.  Some types can have different widths on different
 architectures, though, complicating the situation somewhat.  There is a
 function meant to help out in one special case:
 
-    struct dentry *debugfs_create_size_t(const char *name, mode_t mode,
+    struct dentry *debugfs_create_size_t(const char *name, umode_t mode,
 				         struct dentry *parent, 
 					 size_t *value);
 
@@ -90,7 +90,7 @@ a variable of type size_t.
 
 Boolean values can be placed in debugfs with:
 
-    struct dentry *debugfs_create_bool(const char *name, mode_t mode,
+    struct dentry *debugfs_create_bool(const char *name, umode_t mode,
 				       struct dentry *parent, u32 *value);
 
 A read on the resulting file will yield either Y (for non-zero values) or
@@ -104,7 +104,7 @@ Finally, a block of arbitrary binary data can be exported with:
 	unsigned long size;
     };
 
-    struct dentry *debugfs_create_blob(const char *name, mode_t mode,
+    struct dentry *debugfs_create_blob(const char *name, umode_t mode,
 				       struct dentry *parent,
 				       struct debugfs_blob_wrapper *blob);
 
diff --git a/arch/arm/mach-msm/smd_debug.c b/arch/arm/mach-msm/smd_debug.c
index 8736afff82f3..0c56a5aaf588 100644
--- a/arch/arm/mach-msm/smd_debug.c
+++ b/arch/arm/mach-msm/smd_debug.c
@@ -215,7 +215,7 @@ static const struct file_operations debug_ops = {
 	.llseek = default_llseek,
 };
 
-static void debug_create(const char *name, mode_t mode,
+static void debug_create(const char *name, umode_t mode,
 			 struct dentry *dent,
 			 int (*fill)(char *buf, int max))
 {
diff --git a/arch/s390/include/asm/debug.h b/arch/s390/include/asm/debug.h
index 18124b75a7ab..9d88db1f55d0 100644
--- a/arch/s390/include/asm/debug.h
+++ b/arch/s390/include/asm/debug.h
@@ -73,7 +73,7 @@ typedef struct debug_info {
 	struct dentry* debugfs_entries[DEBUG_MAX_VIEWS];
 	struct debug_view* views[DEBUG_MAX_VIEWS];	
 	char name[DEBUG_MAX_NAME_LEN];
-	mode_t mode;
+	umode_t mode;
 } debug_info_t;
 
 typedef int (debug_header_proc_t) (debug_info_t* id,
@@ -124,7 +124,7 @@ debug_info_t *debug_register(const char *name, int pages, int nr_areas,
                              int buf_size);
 
 debug_info_t *debug_register_mode(const char *name, int pages, int nr_areas,
-				  int buf_size, mode_t mode, uid_t uid,
+				  int buf_size, umode_t mode, uid_t uid,
 				  gid_t gid);
 
 void debug_unregister(debug_info_t* id);
diff --git a/arch/s390/kernel/debug.c b/arch/s390/kernel/debug.c
index 5ad6bc078bfd..6848828b962e 100644
--- a/arch/s390/kernel/debug.c
+++ b/arch/s390/kernel/debug.c
@@ -74,7 +74,7 @@ static ssize_t debug_input(struct file *file, const char __user *user_buf,
 static int debug_open(struct inode *inode, struct file *file);
 static int debug_close(struct inode *inode, struct file *file);
 static debug_info_t *debug_info_create(const char *name, int pages_per_area,
-			int nr_areas, int buf_size, mode_t mode);
+			int nr_areas, int buf_size, umode_t mode);
 static void debug_info_get(debug_info_t *);
 static void debug_info_put(debug_info_t *);
 static int debug_prolog_level_fn(debug_info_t * id,
@@ -330,7 +330,7 @@ debug_info_free(debug_info_t* db_info){
 
 static debug_info_t*
 debug_info_create(const char *name, int pages_per_area, int nr_areas,
-		  int buf_size, mode_t mode)
+		  int buf_size, umode_t mode)
 {
 	debug_info_t* rc;
 
@@ -688,7 +688,7 @@ debug_close(struct inode *inode, struct file *file)
  */
 
 debug_info_t *debug_register_mode(const char *name, int pages_per_area,
-				  int nr_areas, int buf_size, mode_t mode,
+				  int nr_areas, int buf_size, umode_t mode,
 				  uid_t uid, gid_t gid)
 {
 	debug_info_t *rc = NULL;
@@ -1090,7 +1090,7 @@ debug_register_view(debug_info_t * id, struct debug_view *view)
 	int rc = 0;
 	int i;
 	unsigned long flags;
-	mode_t mode;
+	umode_t mode;
 	struct dentry *pde;
 
 	if (!id)
diff --git a/arch/x86/xen/debugfs.c b/arch/x86/xen/debugfs.c
index 7c0fedd98ea0..ef1db1900d86 100644
--- a/arch/x86/xen/debugfs.c
+++ b/arch/x86/xen/debugfs.c
@@ -109,7 +109,7 @@ static const struct file_operations u32_array_fops = {
 	.llseek = no_llseek,
 };
 
-struct dentry *xen_debugfs_create_u32_array(const char *name, mode_t mode,
+struct dentry *xen_debugfs_create_u32_array(const char *name, umode_t mode,
 					    struct dentry *parent,
 					    u32 *array, unsigned elements)
 {
diff --git a/arch/x86/xen/debugfs.h b/arch/x86/xen/debugfs.h
index e28132084832..78d25499be5b 100644
--- a/arch/x86/xen/debugfs.h
+++ b/arch/x86/xen/debugfs.h
@@ -3,7 +3,7 @@
 
 struct dentry * __init xen_init_debugfs(void);
 
-struct dentry *xen_debugfs_create_u32_array(const char *name, mode_t mode,
+struct dentry *xen_debugfs_create_u32_array(const char *name, umode_t mode,
 					    struct dentry *parent,
 					    u32 *array, unsigned elements);
 
diff --git a/drivers/acpi/ec_sys.c b/drivers/acpi/ec_sys.c
index 6c47ae9793a7..b258cab9061c 100644
--- a/drivers/acpi/ec_sys.c
+++ b/drivers/acpi/ec_sys.c
@@ -105,7 +105,7 @@ int acpi_ec_add_debugfs(struct acpi_ec *ec, unsigned int ec_device_count)
 {
 	struct dentry *dev_dir;
 	char name[64];
-	mode_t mode = 0400;
+	umode_t mode = 0400;
 
 	if (ec_device_count == 0) {
 		acpi_ec_debugfs_dir = debugfs_create_dir("ec", NULL);
diff --git a/drivers/mmc/card/mmc_test.c b/drivers/mmc/card/mmc_test.c
index b038c4a9468b..e99bdc18002d 100644
--- a/drivers/mmc/card/mmc_test.c
+++ b/drivers/mmc/card/mmc_test.c
@@ -2949,7 +2949,7 @@ static void mmc_test_free_dbgfs_file(struct mmc_card *card)
 }
 
 static int __mmc_test_register_dbgfs_file(struct mmc_card *card,
-	const char *name, mode_t mode, const struct file_operations *fops)
+	const char *name, umode_t mode, const struct file_operations *fops)
 {
 	struct dentry *file = NULL;
 	struct mmc_test_dbgfs_file *df;
diff --git a/drivers/net/ethernet/chelsio/cxgb4vf/cxgb4vf_main.c b/drivers/net/ethernet/chelsio/cxgb4vf/cxgb4vf_main.c
index da9072bfca8b..f5a24d99ef4f 100644
--- a/drivers/net/ethernet/chelsio/cxgb4vf/cxgb4vf_main.c
+++ b/drivers/net/ethernet/chelsio/cxgb4vf/cxgb4vf_main.c
@@ -2000,7 +2000,7 @@ static const struct file_operations interfaces_proc_fops = {
  */
 struct cxgb4vf_debugfs_entry {
 	const char *name;		/* name of debugfs node */
-	mode_t mode;			/* file system mode */
+	umode_t mode;			/* file system mode */
 	const struct file_operations *fops;
 };
 
diff --git a/drivers/net/wireless/ath/carl9170/debug.c b/drivers/net/wireless/ath/carl9170/debug.c
index de57f90e1d5f..3c164226687f 100644
--- a/drivers/net/wireless/ath/carl9170/debug.c
+++ b/drivers/net/wireless/ath/carl9170/debug.c
@@ -56,7 +56,7 @@ static int carl9170_debugfs_open(struct inode *inode, struct file *file)
 
 struct carl9170_debugfs_fops {
 	unsigned int read_bufsize;
-	mode_t attr;
+	umode_t attr;
 	char *(*read)(struct ar9170 *ar, char *buf, size_t bufsize,
 		      ssize_t *len);
 	ssize_t (*write)(struct ar9170 *aru, const char *buf, size_t size);
diff --git a/drivers/net/wireless/libertas/debugfs.c b/drivers/net/wireless/libertas/debugfs.c
index d8d8f0d0899f..c192671610fc 100644
--- a/drivers/net/wireless/libertas/debugfs.c
+++ b/drivers/net/wireless/libertas/debugfs.c
@@ -704,7 +704,7 @@ out_unlock:
 
 struct lbs_debugfs_files {
 	const char *name;
-	int perm;
+	umode_t perm;
 	struct file_operations fops;
 };
 
diff --git a/drivers/s390/block/dasd.c b/drivers/s390/block/dasd.c
index 65894f05a801..42986d7bcf9d 100644
--- a/drivers/s390/block/dasd.c
+++ b/drivers/s390/block/dasd.c
@@ -1073,7 +1073,7 @@ static const struct file_operations dasd_stats_global_fops = {
 static void dasd_profile_init(struct dasd_profile *profile,
 			      struct dentry *base_dentry)
 {
-	mode_t mode;
+	umode_t mode;
 	struct dentry *pde;
 
 	if (!base_dentry)
@@ -1112,7 +1112,7 @@ static void dasd_statistics_removeroot(void)
 
 static void dasd_statistics_createroot(void)
 {
-	mode_t mode;
+	umode_t mode;
 	struct dentry *pde;
 
 	dasd_debugfs_root_entry = NULL;
diff --git a/drivers/scsi/bfa/bfad_debugfs.c b/drivers/scsi/bfa/bfad_debugfs.c
index dee1a094c2c2..caca9b7c8309 100644
--- a/drivers/scsi/bfa/bfad_debugfs.c
+++ b/drivers/scsi/bfa/bfad_debugfs.c
@@ -472,7 +472,7 @@ static const struct file_operations bfad_debugfs_op_regwr = {
 
 struct bfad_debugfs_entry {
 	const char *name;
-	mode_t	mode;
+	umode_t	mode;
 	const struct file_operations *fops;
 };
 
diff --git a/fs/debugfs/file.c b/fs/debugfs/file.c
index 90f76575c056..d5016606fb27 100644
--- a/fs/debugfs/file.c
+++ b/fs/debugfs/file.c
@@ -95,7 +95,7 @@ DEFINE_SIMPLE_ATTRIBUTE(fops_u8_wo, NULL, debugfs_u8_set, "%llu\n");
  * %NULL or !%NULL instead as to eliminate the need for #ifdef in the calling
  * code.
  */
-struct dentry *debugfs_create_u8(const char *name, mode_t mode,
+struct dentry *debugfs_create_u8(const char *name, umode_t mode,
 				 struct dentry *parent, u8 *value)
 {
 	/* if there are no write bits set, make read only */
@@ -147,7 +147,7 @@ DEFINE_SIMPLE_ATTRIBUTE(fops_u16_wo, NULL, debugfs_u16_set, "%llu\n");
  * %NULL or !%NULL instead as to eliminate the need for #ifdef in the calling
  * code.
  */
-struct dentry *debugfs_create_u16(const char *name, mode_t mode,
+struct dentry *debugfs_create_u16(const char *name, umode_t mode,
 				  struct dentry *parent, u16 *value)
 {
 	/* if there are no write bits set, make read only */
@@ -199,7 +199,7 @@ DEFINE_SIMPLE_ATTRIBUTE(fops_u32_wo, NULL, debugfs_u32_set, "%llu\n");
  * %NULL or !%NULL instead as to eliminate the need for #ifdef in the calling
  * code.
  */
-struct dentry *debugfs_create_u32(const char *name, mode_t mode,
+struct dentry *debugfs_create_u32(const char *name, umode_t mode,
 				 struct dentry *parent, u32 *value)
 {
 	/* if there are no write bits set, make read only */
@@ -252,7 +252,7 @@ DEFINE_SIMPLE_ATTRIBUTE(fops_u64_wo, NULL, debugfs_u64_set, "%llu\n");
  * %NULL or !%NULL instead as to eliminate the need for #ifdef in the calling
  * code.
  */
-struct dentry *debugfs_create_u64(const char *name, mode_t mode,
+struct dentry *debugfs_create_u64(const char *name, umode_t mode,
 				 struct dentry *parent, u64 *value)
 {
 	/* if there are no write bits set, make read only */
@@ -298,7 +298,7 @@ DEFINE_SIMPLE_ATTRIBUTE(fops_x64, debugfs_u64_get, debugfs_u64_set, "0x%016llx\n
  * @value: a pointer to the variable that the file should read to and write
  *         from.
  */
-struct dentry *debugfs_create_x8(const char *name, mode_t mode,
+struct dentry *debugfs_create_x8(const char *name, umode_t mode,
 				 struct dentry *parent, u8 *value)
 {
 	/* if there are no write bits set, make read only */
@@ -322,7 +322,7 @@ EXPORT_SYMBOL_GPL(debugfs_create_x8);
  * @value: a pointer to the variable that the file should read to and write
  *         from.
  */
-struct dentry *debugfs_create_x16(const char *name, mode_t mode,
+struct dentry *debugfs_create_x16(const char *name, umode_t mode,
 				 struct dentry *parent, u16 *value)
 {
 	/* if there are no write bits set, make read only */
@@ -346,7 +346,7 @@ EXPORT_SYMBOL_GPL(debugfs_create_x16);
  * @value: a pointer to the variable that the file should read to and write
  *         from.
  */
-struct dentry *debugfs_create_x32(const char *name, mode_t mode,
+struct dentry *debugfs_create_x32(const char *name, umode_t mode,
 				 struct dentry *parent, u32 *value)
 {
 	/* if there are no write bits set, make read only */
@@ -370,7 +370,7 @@ EXPORT_SYMBOL_GPL(debugfs_create_x32);
  * @value: a pointer to the variable that the file should read to and write
  *         from.
  */
-struct dentry *debugfs_create_x64(const char *name, mode_t mode,
+struct dentry *debugfs_create_x64(const char *name, umode_t mode,
 				 struct dentry *parent, u64 *value)
 {
 	return debugfs_create_file(name, mode, parent, value, &fops_x64);
@@ -401,7 +401,7 @@ DEFINE_SIMPLE_ATTRIBUTE(fops_size_t, debugfs_size_t_get, debugfs_size_t_set,
  * @value: a pointer to the variable that the file should read to and write
  *         from.
  */
-struct dentry *debugfs_create_size_t(const char *name, mode_t mode,
+struct dentry *debugfs_create_size_t(const char *name, umode_t mode,
 				     struct dentry *parent, size_t *value)
 {
 	return debugfs_create_file(name, mode, parent, value, &fops_size_t);
@@ -473,7 +473,7 @@ static const struct file_operations fops_bool = {
  * %NULL or !%NULL instead as to eliminate the need for #ifdef in the calling
  * code.
  */
-struct dentry *debugfs_create_bool(const char *name, mode_t mode,
+struct dentry *debugfs_create_bool(const char *name, umode_t mode,
 				   struct dentry *parent, u32 *value)
 {
 	return debugfs_create_file(name, mode, parent, value, &fops_bool);
@@ -518,7 +518,7 @@ static const struct file_operations fops_blob = {
  * %NULL or !%NULL instead as to eliminate the need for #ifdef in the calling
  * code.
  */
-struct dentry *debugfs_create_blob(const char *name, mode_t mode,
+struct dentry *debugfs_create_blob(const char *name, umode_t mode,
 				   struct dentry *parent,
 				   struct debugfs_blob_wrapper *blob)
 {
diff --git a/fs/debugfs/inode.c b/fs/debugfs/inode.c
index c9dc08d0c100..956d5ddddf6e 100644
--- a/fs/debugfs/inode.c
+++ b/fs/debugfs/inode.c
@@ -30,7 +30,7 @@ static struct vfsmount *debugfs_mount;
 static int debugfs_mount_count;
 static bool debugfs_registered;
 
-static struct inode *debugfs_get_inode(struct super_block *sb, int mode, dev_t dev,
+static struct inode *debugfs_get_inode(struct super_block *sb, umode_t mode, dev_t dev,
 				       void *data, const struct file_operations *fops)
 
 {
@@ -69,7 +69,7 @@ static struct inode *debugfs_get_inode(struct super_block *sb, int mode, dev_t d
 
 /* SMP-safe */
 static int debugfs_mknod(struct inode *dir, struct dentry *dentry,
-			 int mode, dev_t dev, void *data,
+			 umode_t mode, dev_t dev, void *data,
 			 const struct file_operations *fops)
 {
 	struct inode *inode;
@@ -87,7 +87,7 @@ static int debugfs_mknod(struct inode *dir, struct dentry *dentry,
 	return error;
 }
 
-static int debugfs_mkdir(struct inode *dir, struct dentry *dentry, int mode,
+static int debugfs_mkdir(struct inode *dir, struct dentry *dentry, umode_t mode,
 			 void *data, const struct file_operations *fops)
 {
 	int res;
@@ -101,14 +101,14 @@ static int debugfs_mkdir(struct inode *dir, struct dentry *dentry, int mode,
 	return res;
 }
 
-static int debugfs_link(struct inode *dir, struct dentry *dentry, int mode,
+static int debugfs_link(struct inode *dir, struct dentry *dentry, umode_t mode,
 			void *data, const struct file_operations *fops)
 {
 	mode = (mode & S_IALLUGO) | S_IFLNK;
 	return debugfs_mknod(dir, dentry, mode, 0, data, fops);
 }
 
-static int debugfs_create(struct inode *dir, struct dentry *dentry, int mode,
+static int debugfs_create(struct inode *dir, struct dentry *dentry, umode_t mode,
 			  void *data, const struct file_operations *fops)
 {
 	int res;
@@ -146,7 +146,7 @@ static struct file_system_type debug_fs_type = {
 	.kill_sb =	kill_litter_super,
 };
 
-static int debugfs_create_by_name(const char *name, mode_t mode,
+static int debugfs_create_by_name(const char *name, umode_t mode,
 				  struct dentry *parent,
 				  struct dentry **dentry,
 				  void *data,
@@ -214,7 +214,7 @@ static int debugfs_create_by_name(const char *name, mode_t mode,
  * If debugfs is not enabled in the kernel, the value -%ENODEV will be
  * returned.
  */
-struct dentry *debugfs_create_file(const char *name, mode_t mode,
+struct dentry *debugfs_create_file(const char *name, umode_t mode,
 				   struct dentry *parent, void *data,
 				   const struct file_operations *fops)
 {
diff --git a/fs/ocfs2/cluster/netdebug.c b/fs/ocfs2/cluster/netdebug.c
index dc45deb19e68..73ba81928bce 100644
--- a/fs/ocfs2/cluster/netdebug.c
+++ b/fs/ocfs2/cluster/netdebug.c
@@ -553,7 +553,7 @@ void o2net_debugfs_exit(void)
 
 int o2net_debugfs_init(void)
 {
-	mode_t mode = S_IFREG|S_IRUSR;
+	umode_t mode = S_IFREG|S_IRUSR;
 
 	o2net_dentry = debugfs_create_dir(O2NET_DEBUG_DIR, NULL);
 	if (o2net_dentry)
diff --git a/include/linux/debugfs.h b/include/linux/debugfs.h
index e7d9b20ddc5b..d1ac841e8dc7 100644
--- a/include/linux/debugfs.h
+++ b/include/linux/debugfs.h
@@ -34,7 +34,7 @@ extern struct dentry *arch_debugfs_dir;
 extern const struct file_operations debugfs_file_operations;
 extern const struct inode_operations debugfs_link_operations;
 
-struct dentry *debugfs_create_file(const char *name, mode_t mode,
+struct dentry *debugfs_create_file(const char *name, umode_t mode,
 				   struct dentry *parent, void *data,
 				   const struct file_operations *fops);
 
@@ -49,28 +49,28 @@ void debugfs_remove_recursive(struct dentry *dentry);
 struct dentry *debugfs_rename(struct dentry *old_dir, struct dentry *old_dentry,
                 struct dentry *new_dir, const char *new_name);
 
-struct dentry *debugfs_create_u8(const char *name, mode_t mode,
+struct dentry *debugfs_create_u8(const char *name, umode_t mode,
 				 struct dentry *parent, u8 *value);
-struct dentry *debugfs_create_u16(const char *name, mode_t mode,
+struct dentry *debugfs_create_u16(const char *name, umode_t mode,
 				  struct dentry *parent, u16 *value);
-struct dentry *debugfs_create_u32(const char *name, mode_t mode,
+struct dentry *debugfs_create_u32(const char *name, umode_t mode,
 				  struct dentry *parent, u32 *value);
-struct dentry *debugfs_create_u64(const char *name, mode_t mode,
+struct dentry *debugfs_create_u64(const char *name, umode_t mode,
 				  struct dentry *parent, u64 *value);
-struct dentry *debugfs_create_x8(const char *name, mode_t mode,
+struct dentry *debugfs_create_x8(const char *name, umode_t mode,
 				 struct dentry *parent, u8 *value);
-struct dentry *debugfs_create_x16(const char *name, mode_t mode,
+struct dentry *debugfs_create_x16(const char *name, umode_t mode,
 				  struct dentry *parent, u16 *value);
-struct dentry *debugfs_create_x32(const char *name, mode_t mode,
+struct dentry *debugfs_create_x32(const char *name, umode_t mode,
 				  struct dentry *parent, u32 *value);
-struct dentry *debugfs_create_x64(const char *name, mode_t mode,
+struct dentry *debugfs_create_x64(const char *name, umode_t mode,
 				  struct dentry *parent, u64 *value);
-struct dentry *debugfs_create_size_t(const char *name, mode_t mode,
+struct dentry *debugfs_create_size_t(const char *name, umode_t mode,
 				     struct dentry *parent, size_t *value);
-struct dentry *debugfs_create_bool(const char *name, mode_t mode,
+struct dentry *debugfs_create_bool(const char *name, umode_t mode,
 				  struct dentry *parent, u32 *value);
 
-struct dentry *debugfs_create_blob(const char *name, mode_t mode,
+struct dentry *debugfs_create_blob(const char *name, umode_t mode,
 				  struct dentry *parent,
 				  struct debugfs_blob_wrapper *blob);
 
@@ -86,7 +86,7 @@ bool debugfs_initialized(void);
  * want to duplicate the design decision mistakes of procfs and devfs again.
  */
 
-static inline struct dentry *debugfs_create_file(const char *name, mode_t mode,
+static inline struct dentry *debugfs_create_file(const char *name, umode_t mode,
 					struct dentry *parent, void *data,
 					const struct file_operations *fops)
 {
@@ -118,70 +118,70 @@ static inline struct dentry *debugfs_rename(struct dentry *old_dir, struct dentr
 	return ERR_PTR(-ENODEV);
 }
 
-static inline struct dentry *debugfs_create_u8(const char *name, mode_t mode,
+static inline struct dentry *debugfs_create_u8(const char *name, umode_t mode,
 					       struct dentry *parent,
 					       u8 *value)
 {
 	return ERR_PTR(-ENODEV);
 }
 
-static inline struct dentry *debugfs_create_u16(const char *name, mode_t mode,
+static inline struct dentry *debugfs_create_u16(const char *name, umode_t mode,
 						struct dentry *parent,
 						u16 *value)
 {
 	return ERR_PTR(-ENODEV);
 }
 
-static inline struct dentry *debugfs_create_u32(const char *name, mode_t mode,
+static inline struct dentry *debugfs_create_u32(const char *name, umode_t mode,
 						struct dentry *parent,
 						u32 *value)
 {
 	return ERR_PTR(-ENODEV);
 }
 
-static inline struct dentry *debugfs_create_u64(const char *name, mode_t mode,
+static inline struct dentry *debugfs_create_u64(const char *name, umode_t mode,
 						struct dentry *parent,
 						u64 *value)
 {
 	return ERR_PTR(-ENODEV);
 }
 
-static inline struct dentry *debugfs_create_x8(const char *name, mode_t mode,
+static inline struct dentry *debugfs_create_x8(const char *name, umode_t mode,
 					       struct dentry *parent,
 					       u8 *value)
 {
 	return ERR_PTR(-ENODEV);
 }
 
-static inline struct dentry *debugfs_create_x16(const char *name, mode_t mode,
+static inline struct dentry *debugfs_create_x16(const char *name, umode_t mode,
 						struct dentry *parent,
 						u16 *value)
 {
 	return ERR_PTR(-ENODEV);
 }
 
-static inline struct dentry *debugfs_create_x32(const char *name, mode_t mode,
+static inline struct dentry *debugfs_create_x32(const char *name, umode_t mode,
 						struct dentry *parent,
 						u32 *value)
 {
 	return ERR_PTR(-ENODEV);
 }
 
-static inline struct dentry *debugfs_create_size_t(const char *name, mode_t mode,
+static inline struct dentry *debugfs_create_size_t(const char *name, umode_t mode,
 				     struct dentry *parent,
 				     size_t *value)
 {
 	return ERR_PTR(-ENODEV);
 }
 
-static inline struct dentry *debugfs_create_bool(const char *name, mode_t mode,
+static inline struct dentry *debugfs_create_bool(const char *name, umode_t mode,
 						 struct dentry *parent,
 						 u32 *value)
 {
 	return ERR_PTR(-ENODEV);
 }
 
-static inline struct dentry *debugfs_create_blob(const char *name, mode_t mode,
+static inline struct dentry *debugfs_create_blob(const char *name, umode_t mode,
 				  struct dentry *parent,
 				  struct debugfs_blob_wrapper *blob)
 {
diff --git a/include/linux/relay.h b/include/linux/relay.h
index 14a86bc7102b..a822fd71fd64 100644
--- a/include/linux/relay.h
+++ b/include/linux/relay.h
@@ -144,7 +144,7 @@ struct rchan_callbacks
 	 */
 	struct dentry *(*create_buf_file)(const char *filename,
 					  struct dentry *parent,
-					  int mode,
+					  umode_t mode,
 					  struct rchan_buf *buf,
 					  int *is_global);
 
diff --git a/kernel/relay.c b/kernel/relay.c
index 226fade4d727..4335e1d7ee2d 100644
--- a/kernel/relay.c
+++ b/kernel/relay.c
@@ -302,7 +302,7 @@ static void buf_unmapped_default_callback(struct rchan_buf *buf,
  */
 static struct dentry *create_buf_file_default_callback(const char *filename,
 						       struct dentry *parent,
-						       int mode,
+						       umode_t mode,
 						       struct rchan_buf *buf,
 						       int *is_global)
 {
diff --git a/kernel/trace/blktrace.c b/kernel/trace/blktrace.c
index 16fc34a0806f..cdea7b56b0c9 100644
--- a/kernel/trace/blktrace.c
+++ b/kernel/trace/blktrace.c
@@ -402,7 +402,7 @@ static int blk_remove_buf_file_callback(struct dentry *dentry)
 
 static struct dentry *blk_create_buf_file_callback(const char *filename,
 						   struct dentry *parent,
-						   int mode,
+						   umode_t mode,
 						   struct rchan_buf *buf,
 						   int *is_global)
 {
diff --git a/kernel/trace/trace.c b/kernel/trace/trace.c
index f2bd275bb60f..660b069a0f99 100644
--- a/kernel/trace/trace.c
+++ b/kernel/trace/trace.c
@@ -4385,7 +4385,7 @@ static const struct file_operations trace_options_core_fops = {
 };
 
 struct dentry *trace_create_file(const char *name,
-				 mode_t mode,
+				 umode_t mode,
 				 struct dentry *parent,
 				 void *data,
 				 const struct file_operations *fops)
diff --git a/kernel/trace/trace.h b/kernel/trace/trace.h
index 092e1f8d18dc..0154c0b850de 100644
--- a/kernel/trace/trace.h
+++ b/kernel/trace/trace.h
@@ -312,7 +312,7 @@ void tracing_reset_current(int cpu);
 void tracing_reset_current_online_cpus(void);
 int tracing_open_generic(struct inode *inode, struct file *filp);
 struct dentry *trace_create_file(const char *name,
-				 mode_t mode,
+				 umode_t mode,
 				 struct dentry *parent,
 				 void *data,
 				 const struct file_operations *fops);
diff --git a/lib/fault-inject.c b/lib/fault-inject.c
index 4f7554025e30..b4801f51b607 100644
--- a/lib/fault-inject.c
+++ b/lib/fault-inject.c
@@ -149,7 +149,7 @@ static int debugfs_ul_get(void *data, u64 *val)
 
 DEFINE_SIMPLE_ATTRIBUTE(fops_ul, debugfs_ul_get, debugfs_ul_set, "%llu\n");
 
-static struct dentry *debugfs_create_ul(const char *name, mode_t mode,
+static struct dentry *debugfs_create_ul(const char *name, umode_t mode,
 				struct dentry *parent, unsigned long *value)
 {
 	return debugfs_create_file(name, mode, parent, value, &fops_ul);
@@ -169,7 +169,7 @@ DEFINE_SIMPLE_ATTRIBUTE(fops_stacktrace_depth, debugfs_ul_get,
 			debugfs_stacktrace_depth_set, "%llu\n");
 
 static struct dentry *debugfs_create_stacktrace_depth(
-	const char *name, mode_t mode,
+	const char *name, umode_t mode,
 	struct dentry *parent, unsigned long *value)
 {
 	return debugfs_create_file(name, mode, parent, value,
@@ -193,7 +193,7 @@ static int debugfs_atomic_t_get(void *data, u64 *val)
 DEFINE_SIMPLE_ATTRIBUTE(fops_atomic_t, debugfs_atomic_t_get,
 			debugfs_atomic_t_set, "%lld\n");
 
-static struct dentry *debugfs_create_atomic_t(const char *name, mode_t mode,
+static struct dentry *debugfs_create_atomic_t(const char *name, umode_t mode,
 				struct dentry *parent, atomic_t *value)
 {
 	return debugfs_create_file(name, mode, parent, value, &fops_atomic_t);
@@ -202,7 +202,7 @@ static struct dentry *debugfs_create_atomic_t(const char *name, mode_t mode,
 struct dentry *fault_create_debugfs_attr(const char *name,
 			struct dentry *parent, struct fault_attr *attr)
 {
-	mode_t mode = S_IFREG | S_IRUSR | S_IWUSR;
+	umode_t mode = S_IFREG | S_IRUSR | S_IWUSR;
 	struct dentry *dir;
 
 	dir = debugfs_create_dir(name, parent);
diff --git a/mm/failslab.c b/mm/failslab.c
index 0dd7b8fec71c..fefaabaab76d 100644
--- a/mm/failslab.c
+++ b/mm/failslab.c
@@ -35,7 +35,7 @@ __setup("failslab=", setup_failslab);
 static int __init failslab_debugfs_init(void)
 {
 	struct dentry *dir;
-	mode_t mode = S_IFREG | S_IRUSR | S_IWUSR;
+	umode_t mode = S_IFREG | S_IRUSR | S_IWUSR;
 
 	dir = fault_create_debugfs_attr("failslab", NULL, &failslab.attr);
 	if (IS_ERR(dir))
diff --git a/mm/page_alloc.c b/mm/page_alloc.c
index 2b8ba3aebf6e..99930ec7d140 100644
--- a/mm/page_alloc.c
+++ b/mm/page_alloc.c
@@ -1408,7 +1408,7 @@ static int should_fail_alloc_page(gfp_t gfp_mask, unsigned int order)
 
 static int __init fail_page_alloc_debugfs(void)
 {
-	mode_t mode = S_IFREG | S_IRUSR | S_IWUSR;
+	umode_t mode = S_IFREG | S_IRUSR | S_IWUSR;
 	struct dentry *dir;
 
 	dir = fault_create_debugfs_attr("fail_page_alloc", NULL,
-- 
cgit v1.2.3-71-gd317


From a5e7ed3287e45f2eafbcf9e7e6fdc5a0191acf40 Mon Sep 17 00:00:00 2001
From: Al Viro <viro@zeniv.linux.org.uk>
Date: Tue, 26 Jul 2011 01:55:55 -0400
Subject: cgroup: propagate mode_t

Signed-off-by: Al Viro <viro@zeniv.linux.org.uk>
---
 include/linux/cgroup.h |  2 +-
 kernel/cgroup.c        | 14 +++++++-------
 2 files changed, 8 insertions(+), 8 deletions(-)

(limited to 'kernel')

diff --git a/include/linux/cgroup.h b/include/linux/cgroup.h
index 1b7f9d525013..a17becc36ca1 100644
--- a/include/linux/cgroup.h
+++ b/include/linux/cgroup.h
@@ -319,7 +319,7 @@ struct cftype {
 	 * If not 0, file mode is set to this value, otherwise it will
 	 * be figured out automatically
 	 */
-	mode_t mode;
+	umode_t mode;
 
 	/*
 	 * If non-zero, defines the maximum length of string that can
diff --git a/kernel/cgroup.c b/kernel/cgroup.c
index b37a0ea55114..86ebacfd9431 100644
--- a/kernel/cgroup.c
+++ b/kernel/cgroup.c
@@ -775,7 +775,7 @@ static struct backing_dev_info cgroup_backing_dev_info = {
 static int alloc_css_id(struct cgroup_subsys *ss,
 			struct cgroup *parent, struct cgroup *child);
 
-static struct inode *cgroup_new_inode(mode_t mode, struct super_block *sb)
+static struct inode *cgroup_new_inode(umode_t mode, struct super_block *sb)
 {
 	struct inode *inode = new_inode(sb);
 
@@ -2585,7 +2585,7 @@ static inline struct cftype *__file_cft(struct file *file)
 	return __d_cft(file->f_dentry);
 }
 
-static int cgroup_create_file(struct dentry *dentry, mode_t mode,
+static int cgroup_create_file(struct dentry *dentry, umode_t mode,
 				struct super_block *sb)
 {
 	struct inode *inode;
@@ -2626,7 +2626,7 @@ static int cgroup_create_file(struct dentry *dentry, mode_t mode,
  * @mode: mode to set on new directory.
  */
 static int cgroup_create_dir(struct cgroup *cgrp, struct dentry *dentry,
-				mode_t mode)
+				umode_t mode)
 {
 	struct dentry *parent;
 	int error = 0;
@@ -2653,9 +2653,9 @@ static int cgroup_create_dir(struct cgroup *cgrp, struct dentry *dentry,
  * returns S_IRUGO if it has only a read handler
  * returns S_IWUSR if it has only a write hander
  */
-static mode_t cgroup_file_mode(const struct cftype *cft)
+static umode_t cgroup_file_mode(const struct cftype *cft)
 {
-	mode_t mode = 0;
+	umode_t mode = 0;
 
 	if (cft->mode)
 		return cft->mode;
@@ -2678,7 +2678,7 @@ int cgroup_add_file(struct cgroup *cgrp,
 	struct dentry *dir = cgrp->dentry;
 	struct dentry *dentry;
 	int error;
-	mode_t mode;
+	umode_t mode;
 
 	char name[MAX_CGROUP_TYPE_NAMELEN + MAX_CFTYPE_NAME + 2] = { 0 };
 	if (subsys && !test_bit(ROOT_NOPREFIX, &cgrp->root->flags)) {
@@ -3752,7 +3752,7 @@ static void cgroup_unlock_hierarchy(struct cgroupfs_root *root)
  * Must be called with the mutex on the parent inode held
  */
 static long cgroup_create(struct cgroup *parent, struct dentry *dentry,
-			     mode_t mode)
+			     umode_t mode)
 {
 	struct cgroup *cgrp;
 	struct cgroupfs_root *root = parent->root;
-- 
cgit v1.2.3-71-gd317


From 36fcb589e752fa9c71f8a447db94126d102fd937 Mon Sep 17 00:00:00 2001
From: Al Viro <viro@zeniv.linux.org.uk>
Date: Tue, 26 Jul 2011 03:47:31 -0400
Subject: sysctl: use umode_t for table permissions

Signed-off-by: Al Viro <viro@zeniv.linux.org.uk>
---
 include/linux/sysctl.h | 2 +-
 kernel/sched.c         | 2 +-
 2 files changed, 2 insertions(+), 2 deletions(-)

(limited to 'kernel')

diff --git a/include/linux/sysctl.h b/include/linux/sysctl.h
index 703cfa33a3ca..bb9127dd814b 100644
--- a/include/linux/sysctl.h
+++ b/include/linux/sysctl.h
@@ -1038,7 +1038,7 @@ struct ctl_table
 	const char *procname;		/* Text ID for /proc/sys, or zero */
 	void *data;
 	int maxlen;
-	mode_t mode;
+	umode_t mode;
 	struct ctl_table *child;
 	struct ctl_table *parent;	/* Automatically set */
 	proc_handler *proc_handler;	/* Callback for text formatting */
diff --git a/kernel/sched.c b/kernel/sched.c
index d6b149ccf925..e64f45741e0e 100644
--- a/kernel/sched.c
+++ b/kernel/sched.c
@@ -6480,7 +6480,7 @@ static void sd_free_ctl_entry(struct ctl_table **tablep)
 static void
 set_table_entry(struct ctl_table *entry,
 		const char *procname, void *data, int maxlen,
-		mode_t mode, proc_handler *proc_handler)
+		umode_t mode, proc_handler *proc_handler)
 {
 	entry->procname = procname;
 	entry->data = data;
-- 
cgit v1.2.3-71-gd317


From df0a42837b86567a130c44515ab620d23e7f182b Mon Sep 17 00:00:00 2001
From: Al Viro <viro@zeniv.linux.org.uk>
Date: Tue, 26 Jul 2011 05:26:10 -0400
Subject: switch mq_open() to umode_t

---
 include/linux/audit.h    | 4 ++--
 include/linux/syscalls.h | 2 +-
 ipc/mqueue.c             | 2 +-
 kernel/auditsc.c         | 6 +++---
 4 files changed, 7 insertions(+), 7 deletions(-)

(limited to 'kernel')

diff --git a/include/linux/audit.h b/include/linux/audit.h
index 2f81c6f3b630..75ed193b11f8 100644
--- a/include/linux/audit.h
+++ b/include/linux/audit.h
@@ -474,7 +474,7 @@ extern void audit_socketcall(int nargs, unsigned long *args);
 extern int audit_sockaddr(int len, void *addr);
 extern void __audit_fd_pair(int fd1, int fd2);
 extern int audit_set_macxattr(const char *name);
-extern void __audit_mq_open(int oflag, mode_t mode, struct mq_attr *attr);
+extern void __audit_mq_open(int oflag, umode_t mode, struct mq_attr *attr);
 extern void __audit_mq_sendrecv(mqd_t mqdes, size_t msg_len, unsigned int msg_prio, const struct timespec *abs_timeout);
 extern void __audit_mq_notify(mqd_t mqdes, const struct sigevent *notification);
 extern void __audit_mq_getsetattr(mqd_t mqdes, struct mq_attr *mqstat);
@@ -499,7 +499,7 @@ static inline void audit_ipc_set_perm(unsigned long qbytes, uid_t uid, gid_t gid
 	if (unlikely(!audit_dummy_context()))
 		__audit_ipc_set_perm(qbytes, uid, gid, mode);
 }
-static inline void audit_mq_open(int oflag, mode_t mode, struct mq_attr *attr)
+static inline void audit_mq_open(int oflag, umode_t mode, struct mq_attr *attr)
 {
 	if (unlikely(!audit_dummy_context()))
 		__audit_mq_open(oflag, mode, attr);
diff --git a/include/linux/syscalls.h b/include/linux/syscalls.h
index e1a4b9b81cf2..d86e5253f84f 100644
--- a/include/linux/syscalls.h
+++ b/include/linux/syscalls.h
@@ -679,7 +679,7 @@ asmlinkage long sys_shmctl(int shmid, int cmd, struct shmid_ds __user *buf);
 asmlinkage long sys_ipc(unsigned int call, int first, unsigned long second,
 		unsigned long third, void __user *ptr, long fifth);
 
-asmlinkage long sys_mq_open(const char __user *name, int oflag, mode_t mode, struct mq_attr __user *attr);
+asmlinkage long sys_mq_open(const char __user *name, int oflag, umode_t mode, struct mq_attr __user *attr);
 asmlinkage long sys_mq_unlink(const char __user *name);
 asmlinkage long sys_mq_timedsend(mqd_t mqdes, const char __user *msg_ptr, size_t msg_len, unsigned int msg_prio, const struct timespec __user *abs_timeout);
 asmlinkage long sys_mq_timedreceive(mqd_t mqdes, char __user *msg_ptr, size_t msg_len, unsigned int __user *msg_prio, const struct timespec __user *abs_timeout);
diff --git a/ipc/mqueue.c b/ipc/mqueue.c
index 5eaecf4fc04e..9a142a290749 100644
--- a/ipc/mqueue.c
+++ b/ipc/mqueue.c
@@ -679,7 +679,7 @@ err:
 	return ERR_PTR(ret);
 }
 
-SYSCALL_DEFINE4(mq_open, const char __user *, u_name, int, oflag, mode_t, mode,
+SYSCALL_DEFINE4(mq_open, const char __user *, u_name, int, oflag, umode_t, mode,
 		struct mq_attr __user *, u_attr)
 {
 	struct dentry *dentry;
diff --git a/kernel/auditsc.c b/kernel/auditsc.c
index 47b7fc1ea893..9849213e501c 100644
--- a/kernel/auditsc.c
+++ b/kernel/auditsc.c
@@ -234,7 +234,7 @@ struct audit_context {
 		} mq_sendrecv;
 		struct {
 			int			oflag;
-			mode_t			mode;
+			umode_t			mode;
 			struct mq_attr		attr;
 		} mq_open;
 		struct {
@@ -1278,7 +1278,7 @@ static void show_special(struct audit_context *context, int *call_panic)
 		break; }
 	case AUDIT_MQ_OPEN: {
 		audit_log_format(ab,
-			"oflag=0x%x mode=%#o mq_flags=0x%lx mq_maxmsg=%ld "
+			"oflag=0x%x mode=%#ho mq_flags=0x%lx mq_maxmsg=%ld "
 			"mq_msgsize=%ld mq_curmsgs=%ld",
 			context->mq_open.oflag, context->mq_open.mode,
 			context->mq_open.attr.mq_flags,
@@ -2160,7 +2160,7 @@ int audit_set_loginuid(struct task_struct *task, uid_t loginuid)
  * @attr: queue attributes
  *
  */
-void __audit_mq_open(int oflag, mode_t mode, struct mq_attr *attr)
+void __audit_mq_open(int oflag, umode_t mode, struct mq_attr *attr)
 {
 	struct audit_context *context = current->audit_context;
 
-- 
cgit v1.2.3-71-gd317


From 2570ebbd1f1ce1ef31f568b0660354fc59424be2 Mon Sep 17 00:00:00 2001
From: Al Viro <viro@zeniv.linux.org.uk>
Date: Wed, 27 Jul 2011 14:03:22 -0400
Subject: switch kern_ipc_perm to umode_t

Signed-off-by: Al Viro <viro@zeniv.linux.org.uk>
---
 include/linux/audit.h |  4 ++--
 include/linux/ipc.h   |  2 +-
 kernel/auditsc.c      | 10 +++++-----
 3 files changed, 8 insertions(+), 8 deletions(-)

(limited to 'kernel')

diff --git a/include/linux/audit.h b/include/linux/audit.h
index 75ed193b11f8..426ab9f4dd85 100644
--- a/include/linux/audit.h
+++ b/include/linux/audit.h
@@ -468,7 +468,7 @@ extern int  audit_set_loginuid(struct task_struct *task, uid_t loginuid);
 #define audit_get_sessionid(t) ((t)->sessionid)
 extern void audit_log_task_context(struct audit_buffer *ab);
 extern void __audit_ipc_obj(struct kern_ipc_perm *ipcp);
-extern void __audit_ipc_set_perm(unsigned long qbytes, uid_t uid, gid_t gid, mode_t mode);
+extern void __audit_ipc_set_perm(unsigned long qbytes, uid_t uid, gid_t gid, umode_t mode);
 extern int audit_bprm(struct linux_binprm *bprm);
 extern void audit_socketcall(int nargs, unsigned long *args);
 extern int audit_sockaddr(int len, void *addr);
@@ -494,7 +494,7 @@ static inline void audit_fd_pair(int fd1, int fd2)
 	if (unlikely(!audit_dummy_context()))
 		__audit_fd_pair(fd1, fd2);
 }
-static inline void audit_ipc_set_perm(unsigned long qbytes, uid_t uid, gid_t gid, mode_t mode)
+static inline void audit_ipc_set_perm(unsigned long qbytes, uid_t uid, gid_t gid, umode_t mode)
 {
 	if (unlikely(!audit_dummy_context()))
 		__audit_ipc_set_perm(qbytes, uid, gid, mode);
diff --git a/include/linux/ipc.h b/include/linux/ipc.h
index 3b1594d662b0..30e816148df4 100644
--- a/include/linux/ipc.h
+++ b/include/linux/ipc.h
@@ -93,7 +93,7 @@ struct kern_ipc_perm
 	gid_t		gid;
 	uid_t		cuid;
 	gid_t		cgid;
-	mode_t		mode; 
+	umode_t		mode; 
 	unsigned long	seq;
 	void		*security;
 };
diff --git a/kernel/auditsc.c b/kernel/auditsc.c
index 9849213e501c..7a074d65fff4 100644
--- a/kernel/auditsc.c
+++ b/kernel/auditsc.c
@@ -210,12 +210,12 @@ struct audit_context {
 		struct {
 			uid_t			uid;
 			gid_t			gid;
-			mode_t			mode;
+			umode_t			mode;
 			u32			osid;
 			int			has_perm;
 			uid_t			perm_uid;
 			gid_t			perm_gid;
-			mode_t			perm_mode;
+			umode_t			perm_mode;
 			unsigned long		qbytes;
 		} ipc;
 		struct {
@@ -1249,7 +1249,7 @@ static void show_special(struct audit_context *context, int *call_panic)
 	case AUDIT_IPC: {
 		u32 osid = context->ipc.osid;
 
-		audit_log_format(ab, "ouid=%u ogid=%u mode=%#o",
+		audit_log_format(ab, "ouid=%u ogid=%u mode=%#ho",
 			 context->ipc.uid, context->ipc.gid, context->ipc.mode);
 		if (osid) {
 			char *ctx = NULL;
@@ -1267,7 +1267,7 @@ static void show_special(struct audit_context *context, int *call_panic)
 			ab = audit_log_start(context, GFP_KERNEL,
 					     AUDIT_IPC_SET_PERM);
 			audit_log_format(ab,
-				"qbytes=%lx ouid=%u ogid=%u mode=%#o",
+				"qbytes=%lx ouid=%u ogid=%u mode=%#ho",
 				context->ipc.qbytes,
 				context->ipc.perm_uid,
 				context->ipc.perm_gid,
@@ -2260,7 +2260,7 @@ void __audit_ipc_obj(struct kern_ipc_perm *ipcp)
  *
  * Called only after audit_ipc_obj().
  */
-void __audit_ipc_set_perm(unsigned long qbytes, uid_t uid, gid_t gid, mode_t mode)
+void __audit_ipc_set_perm(unsigned long qbytes, uid_t uid, gid_t gid, umode_t mode)
 {
 	struct audit_context *context = current->audit_context;
 
-- 
cgit v1.2.3-71-gd317


From 93d3a10ef4fdfd4b6d1a3f09b645cd08f74a8115 Mon Sep 17 00:00:00 2001
From: Al Viro <viro@zeniv.linux.org.uk>
Date: Wed, 27 Jul 2011 14:04:25 -0400
Subject: auditsc: propage umode_t

Signed-off-by: Al Viro <viro@zeniv.linux.org.uk>
---
 kernel/auditsc.c | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

(limited to 'kernel')

diff --git a/kernel/auditsc.c b/kernel/auditsc.c
index 7a074d65fff4..e7fe2b0d29b3 100644
--- a/kernel/auditsc.c
+++ b/kernel/auditsc.c
@@ -308,7 +308,7 @@ static int audit_match_perm(struct audit_context *ctx, int mask)
 static int audit_match_filetype(struct audit_context *ctx, int which)
 {
 	unsigned index = which & ~S_IFMT;
-	mode_t mode = which & S_IFMT;
+	umode_t mode = which & S_IFMT;
 
 	if (unlikely(!ctx))
 		return 0;
@@ -1502,7 +1502,7 @@ static void audit_log_exit(struct audit_context *context, struct task_struct *ts
 
 		if (n->ino != (unsigned long)-1) {
 			audit_log_format(ab, " inode=%lu"
-					 " dev=%02x:%02x mode=%#o"
+					 " dev=%02x:%02x mode=%#ho"
 					 " ouid=%u ogid=%u rdev=%02x:%02x",
 					 n->ino,
 					 MAJOR(n->dev),
-- 
cgit v1.2.3-71-gd317


From 305f3c8b20ba1ca94829329acdbf22e689304dba Mon Sep 17 00:00:00 2001
From: Dan Carpenter <dan.carpenter@oracle.com>
Date: Wed, 4 Jan 2012 10:24:29 +0300
Subject: cgroup: move assignement out of condition in cgroup_attach_proc()

Gcc complains about this: "kernel/cgroup.c:2179:4: warning: suggest
parentheses around assignment used as truth value [-Wparentheses]"

Signed-off-by: Dan Carpenter <dan.carpenter@oracle.com>
Signed-off-by: Tejun Heo <tj@kernel.org>
---
 kernel/cgroup.c | 7 +++++--
 1 file changed, 5 insertions(+), 2 deletions(-)

(limited to 'kernel')

diff --git a/kernel/cgroup.c b/kernel/cgroup.c
index 548d8d4e86d0..bab5c17e7781 100644
--- a/kernel/cgroup.c
+++ b/kernel/cgroup.c
@@ -2175,9 +2175,12 @@ static int cgroup_attach_proc(struct cgroup *cgrp, struct task_struct *leader)
 		oldcg = tc->task->cgroups;
 
 		/* if we don't already have it in the list get a new one */
-		if (!css_set_check_fetched(cgrp, tc->task, oldcg, &newcg_list))
-			if (retval = css_set_prefetch(cgrp, oldcg, &newcg_list))
+		if (!css_set_check_fetched(cgrp, tc->task, oldcg,
+					   &newcg_list)) {
+			retval = css_set_prefetch(cgrp, oldcg, &newcg_list);
+			if (retval)
 				goto out_list_teardown;
+		}
 	}
 
 	/*
-- 
cgit v1.2.3-71-gd317


From c336078bf65c4d38caa9a4b8b7b7261c778e622c Mon Sep 17 00:00:00 2001
From: Ben Hutchings <ben@decadent.org.uk>
Date: Tue, 27 Dec 2011 22:54:52 +0100
Subject: PM / Hibernate: Implement compat_ioctl for /dev/snapshot

This allows uswsusp built for i386 to run on an x86_64 kernel (tested
with Debian package version 1.0+20110509-2).

References: http://bugs.debian.org/502816
Signed-off-by: Ben Hutchings <ben@decadent.org.uk>
Signed-off-by: Rafael J. Wysocki <rjw@sisk.pl>
---
 kernel/power/user.c | 64 +++++++++++++++++++++++++++++++++++++++++++++++++++++
 1 file changed, 64 insertions(+)

(limited to 'kernel')

diff --git a/kernel/power/user.c b/kernel/power/user.c
index 78bdb4404aab..6b1ab7a88522 100644
--- a/kernel/power/user.c
+++ b/kernel/power/user.c
@@ -21,6 +21,7 @@
 #include <linux/swapops.h>
 #include <linux/pm.h>
 #include <linux/fs.h>
+#include <linux/compat.h>
 #include <linux/console.h>
 #include <linux/cpu.h>
 #include <linux/freezer.h>
@@ -380,6 +381,66 @@ static long snapshot_ioctl(struct file *filp, unsigned int cmd,
 	return error;
 }
 
+#ifdef CONFIG_COMPAT
+
+struct compat_resume_swap_area {
+	compat_loff_t offset;
+	u32 dev;
+} __packed;
+
+static long
+snapshot_compat_ioctl(struct file *file, unsigned int cmd, unsigned long arg)
+{
+	BUILD_BUG_ON(sizeof(loff_t) != sizeof(compat_loff_t));
+
+	switch (cmd) {
+	case SNAPSHOT_GET_IMAGE_SIZE:
+	case SNAPSHOT_AVAIL_SWAP_SIZE:
+	case SNAPSHOT_ALLOC_SWAP_PAGE: {
+		compat_loff_t __user *uoffset = compat_ptr(arg);
+		loff_t offset;
+		mm_segment_t old_fs;
+		int err;
+
+		old_fs = get_fs();
+		set_fs(KERNEL_DS);
+		err = snapshot_ioctl(file, cmd, (unsigned long) &offset);
+		set_fs(old_fs);
+		if (!err && put_user(offset, uoffset))
+			err = -EFAULT;
+		return err;
+	}
+
+	case SNAPSHOT_CREATE_IMAGE:
+		return snapshot_ioctl(file, cmd,
+				      (unsigned long) compat_ptr(arg));
+
+	case SNAPSHOT_SET_SWAP_AREA: {
+		struct compat_resume_swap_area __user *u_swap_area =
+			compat_ptr(arg);
+		struct resume_swap_area swap_area;
+		mm_segment_t old_fs;
+		int err;
+
+		err = get_user(swap_area.offset, &u_swap_area->offset);
+		err |= get_user(swap_area.dev, &u_swap_area->dev);
+		if (err)
+			return -EFAULT;
+		old_fs = get_fs();
+		set_fs(KERNEL_DS);
+		err = snapshot_ioctl(file, SNAPSHOT_SET_SWAP_AREA,
+				     (unsigned long) &swap_area);
+		set_fs(old_fs);
+		return err;
+	}
+
+	default:
+		return snapshot_ioctl(file, cmd, arg);
+	}
+}
+
+#endif /* CONFIG_COMPAT */
+
 static const struct file_operations snapshot_fops = {
 	.open = snapshot_open,
 	.release = snapshot_release,
@@ -387,6 +448,9 @@ static const struct file_operations snapshot_fops = {
 	.write = snapshot_write,
 	.llseek = no_llseek,
 	.unlocked_ioctl = snapshot_ioctl,
+#ifdef CONFIG_COMPAT
+	.compat_ioctl = snapshot_compat_ioctl,
+#endif
 };
 
 static struct miscdevice snapshot_device = {
-- 
cgit v1.2.3-71-gd317


From 0d19ea866562e46989412a0676412fa0983c9ce7 Mon Sep 17 00:00:00 2001
From: Li Zefan <lizf@cn.fujitsu.com>
Date: Tue, 27 Dec 2011 14:25:55 +0800
Subject: cgroup: fix to allow mounting a hierarchy by name

If we mount a hierarchy with a specified name, the name is unique,
and we can use it to mount the hierarchy without specifying its
set of subsystem names. This feature is documented is
Documentation/cgroups/cgroups.txt section 2.3

Here's an example:

	# mount -t cgroup -o cpuset,name=myhier xxx /cgroup1
	# mount -t cgroup -o name=myhier xxx /cgroup2

But it was broken by commit 32a8cf235e2f192eb002755076994525cdbaa35a
(cgroup: make the mount options parsing more accurate)

This fixes the regression.

Signed-off-by: Li Zefan <lizf@cn.fujitsu.com>
Signed-off-by: Tejun Heo <tj@kernel.org>
Cc: stable@vger.kernel.org
---
 kernel/cgroup.c | 6 +++---
 1 file changed, 3 insertions(+), 3 deletions(-)

(limited to 'kernel')

diff --git a/kernel/cgroup.c b/kernel/cgroup.c
index bab5c17e7781..39c7caef085a 100644
--- a/kernel/cgroup.c
+++ b/kernel/cgroup.c
@@ -1193,10 +1193,10 @@ static int parse_cgroupfs_options(char *data, struct cgroup_sb_opts *opts)
 
 	/*
 	 * If the 'all' option was specified select all the subsystems,
-	 * otherwise 'all, 'none' and a subsystem name options were not
-	 * specified, let's default to 'all'
+	 * otherwise if 'none', 'name=' and a subsystem name options
+	 * were not specified, let's default to 'all'
 	 */
-	if (all_ss || (!all_ss && !one_ss && !opts->none)) {
+	if (all_ss || (!one_ss && !opts->none && !opts->name)) {
 		for (i = 0; i < CGROUP_SUBSYS_COUNT; i++) {
 			struct cgroup_subsys *ss = subsys[i];
 			if (ss == NULL)
-- 
cgit v1.2.3-71-gd317


From b7e724d303b684655e4ca3dabd5a6840ad19012d Mon Sep 17 00:00:00 2001
From: Eric Paris <eparis@redhat.com>
Date: Tue, 3 Jan 2012 12:25:15 -0500
Subject: capabilities: reverse arguments to security_capable

security_capable takes ns, cred, cap.  But the LSM capable() hook takes
cred, ns, cap.  The capability helper functions also take cred, ns, cap.
Rather than flip argument order just to flip it back, leave them alone.
Heck, this should be a little faster since argument will be in the right
place!

Signed-off-by: Eric Paris <eparis@redhat.com>
---
 drivers/pci/pci-sysfs.c  | 2 +-
 include/linux/security.h | 6 +++---
 kernel/capability.c      | 2 +-
 security/security.c      | 2 +-
 4 files changed, 6 insertions(+), 6 deletions(-)

(limited to 'kernel')

diff --git a/drivers/pci/pci-sysfs.c b/drivers/pci/pci-sysfs.c
index 7bcf12adced7..a4457ab61342 100644
--- a/drivers/pci/pci-sysfs.c
+++ b/drivers/pci/pci-sysfs.c
@@ -431,7 +431,7 @@ pci_read_config(struct file *filp, struct kobject *kobj,
 	u8 *data = (u8*) buf;
 
 	/* Several chips lock up trying to read undefined config space */
-	if (security_capable(&init_user_ns, filp->f_cred, CAP_SYS_ADMIN) == 0) {
+	if (security_capable(filp->f_cred, &init_user_ns, CAP_SYS_ADMIN) == 0) {
 		size = dev->cfg_size;
 	} else if (dev->hdr_type == PCI_HEADER_TYPE_CARDBUS) {
 		size = 128;
diff --git a/include/linux/security.h b/include/linux/security.h
index 4921163b2752..ee969ff40a26 100644
--- a/include/linux/security.h
+++ b/include/linux/security.h
@@ -1666,7 +1666,7 @@ int security_capset(struct cred *new, const struct cred *old,
 		    const kernel_cap_t *effective,
 		    const kernel_cap_t *inheritable,
 		    const kernel_cap_t *permitted);
-int security_capable(struct user_namespace *ns, const struct cred *cred,
+int security_capable(const struct cred *cred, struct user_namespace *ns,
 			int cap);
 int security_real_capable(struct task_struct *tsk, struct user_namespace *ns,
 			int cap);
@@ -1863,8 +1863,8 @@ static inline int security_capset(struct cred *new,
 	return cap_capset(new, old, effective, inheritable, permitted);
 }
 
-static inline int security_capable(struct user_namespace *ns,
-				   const struct cred *cred, int cap)
+static inline int security_capable(const struct cred *cred,
+				   struct user_namespace *ns, int cap)
 {
 	return cap_capable(cred, ns, cap, SECURITY_CAP_AUDIT);
 }
diff --git a/kernel/capability.c b/kernel/capability.c
index 283c529f8b1c..d98392719adb 100644
--- a/kernel/capability.c
+++ b/kernel/capability.c
@@ -374,7 +374,7 @@ bool ns_capable(struct user_namespace *ns, int cap)
 		BUG();
 	}
 
-	if (security_capable(ns, current_cred(), cap) == 0) {
+	if (security_capable(current_cred(), ns, cap) == 0) {
 		current->flags |= PF_SUPERPRIV;
 		return true;
 	}
diff --git a/security/security.c b/security/security.c
index 9ae68c64455e..b9e57f4fc44a 100644
--- a/security/security.c
+++ b/security/security.c
@@ -154,7 +154,7 @@ int security_capset(struct cred *new, const struct cred *old,
 				    effective, inheritable, permitted);
 }
 
-int security_capable(struct user_namespace *ns, const struct cred *cred,
+int security_capable(const struct cred *cred, struct user_namespace *ns,
 		     int cap)
 {
 	return security_ops->capable(cred, ns, cap, SECURITY_CAP_AUDIT);
-- 
cgit v1.2.3-71-gd317


From 2920a8409de5a51575d03deca07e5bb2be6fc98d Mon Sep 17 00:00:00 2001
From: Eric Paris <eparis@redhat.com>
Date: Tue, 3 Jan 2012 12:25:15 -0500
Subject: capabilities: remove all _real_ interfaces

The name security_real_capable and security_real_capable_noaudit just don't
make much sense to me.  Convert them to use security_capable and
security_capable_noaudit.

Signed-off-by: Eric Paris <eparis@redhat.com>
Acked-by: Serge E. Hallyn <serge.hallyn@canonical.com>
---
 include/linux/security.h | 25 -------------------------
 kernel/capability.c      | 18 +++++++++++++++---
 security/security.c      | 24 ------------------------
 3 files changed, 15 insertions(+), 52 deletions(-)

(limited to 'kernel')

diff --git a/include/linux/security.h b/include/linux/security.h
index caff54eee686..e345a9313a60 100644
--- a/include/linux/security.h
+++ b/include/linux/security.h
@@ -1670,10 +1670,6 @@ int security_capable(const struct cred *cred, struct user_namespace *ns,
 			int cap);
 int security_capable_noaudit(const struct cred *cred, struct user_namespace *ns,
 			     int cap);
-int security_real_capable(struct task_struct *tsk, struct user_namespace *ns,
-			int cap);
-int security_real_capable_noaudit(struct task_struct *tsk,
-			struct user_namespace *ns, int cap);
 int security_quotactl(int cmds, int type, int id, struct super_block *sb);
 int security_quota_on(struct dentry *dentry);
 int security_syslog(int type);
@@ -1876,27 +1872,6 @@ static inline int security_capable_noaudit(const struct cred *cred,
 	return cap_capable(cred, ns, cap, SECURITY_CAP_NOAUDIT);
 }
 
-static inline int security_real_capable(struct task_struct *tsk, struct user_namespace *ns, int cap)
-{
-	int ret;
-
-	rcu_read_lock();
-	ret = cap_capable(__task_cred(tsk), ns, cap, SECURITY_CAP_AUDIT);
-	rcu_read_unlock();
-	return ret;
-}
-
-static inline
-int security_real_capable_noaudit(struct task_struct *tsk, struct user_namespace *ns, int cap)
-{
-	int ret;
-
-	rcu_read_lock();
-	ret = cap_capable(__task_cred(tsk), ns, cap, SECURITY_CAP_NOAUDIT);
-	rcu_read_unlock();
-	return ret;
-}
-
 static inline int security_quotactl(int cmds, int type, int id,
 				     struct super_block *sb)
 {
diff --git a/kernel/capability.c b/kernel/capability.c
index d98392719adb..ff50ab62cfca 100644
--- a/kernel/capability.c
+++ b/kernel/capability.c
@@ -298,7 +298,11 @@ error:
  */
 bool has_capability(struct task_struct *t, int cap)
 {
-	int ret = security_real_capable(t, &init_user_ns, cap);
+	int ret;
+
+	rcu_read_lock();
+	ret = security_capable(__task_cred(t), &init_user_ns, cap);
+	rcu_read_unlock();
 
 	return (ret == 0);
 }
@@ -317,7 +321,11 @@ bool has_capability(struct task_struct *t, int cap)
 bool has_ns_capability(struct task_struct *t,
 		       struct user_namespace *ns, int cap)
 {
-	int ret = security_real_capable(t, ns, cap);
+	int ret;
+
+	rcu_read_lock();
+	ret = security_capable(__task_cred(t), ns, cap);
+	rcu_read_unlock();
 
 	return (ret == 0);
 }
@@ -335,7 +343,11 @@ bool has_ns_capability(struct task_struct *t,
  */
 bool has_capability_noaudit(struct task_struct *t, int cap)
 {
-	int ret = security_real_capable_noaudit(t, &init_user_ns, cap);
+	int ret;
+
+	rcu_read_lock();
+	ret = security_capable_noaudit(__task_cred(t), &init_user_ns, cap);
+	rcu_read_unlock();
 
 	return (ret == 0);
 }
diff --git a/security/security.c b/security/security.c
index b7edaae77d1d..8900c5c4db5c 100644
--- a/security/security.c
+++ b/security/security.c
@@ -166,30 +166,6 @@ int security_capable_noaudit(const struct cred *cred, struct user_namespace *ns,
 	return security_ops->capable(cred, ns, cap, SECURITY_CAP_NOAUDIT);
 }
 
-int security_real_capable(struct task_struct *tsk, struct user_namespace *ns,
-			  int cap)
-{
-	const struct cred *cred;
-	int ret;
-
-	cred = get_task_cred(tsk);
-	ret = security_ops->capable(cred, ns, cap, SECURITY_CAP_AUDIT);
-	put_cred(cred);
-	return ret;
-}
-
-int security_real_capable_noaudit(struct task_struct *tsk,
-				  struct user_namespace *ns, int cap)
-{
-	const struct cred *cred;
-	int ret;
-
-	cred = get_task_cred(tsk);
-	ret = security_ops->capable(cred, ns, cap, SECURITY_CAP_NOAUDIT);
-	put_cred(cred);
-	return ret;
-}
-
 int security_quotactl(int cmds, int type, int id, struct super_block *sb)
 {
 	return security_ops->quotactl(cmds, type, id, sb);
-- 
cgit v1.2.3-71-gd317


From 25e75703410a84b80623da3653db6b70282e5c6a Mon Sep 17 00:00:00 2001
From: Eric Paris <eparis@redhat.com>
Date: Tue, 3 Jan 2012 12:25:15 -0500
Subject: capabilities: call has_ns_capability from has_capability

Declare the more specific has_ns_capability first in the code and then call it
from has_capability.  The declaration reversal isn't stricty necessary since
they are both declared in header files, but it just makes sense to put more
specific functions first in the code.

Signed-off-by: Eric Paris <eparis@redhat.com>
Acked-by: Serge E. Hallyn <serge.hallyn@canonical.com>
---
 kernel/capability.c | 26 ++++++++++----------------
 1 file changed, 10 insertions(+), 16 deletions(-)

(limited to 'kernel')

diff --git a/kernel/capability.c b/kernel/capability.c
index ff50ab62cfca..fb815d1b9ea2 100644
--- a/kernel/capability.c
+++ b/kernel/capability.c
@@ -287,47 +287,41 @@ error:
 }
 
 /**
- * has_capability - Does a task have a capability in init_user_ns
+ * has_ns_capability - Does a task have a capability in a specific user ns
  * @t: The task in question
+ * @ns: target user namespace
  * @cap: The capability to be tested for
  *
  * Return true if the specified task has the given superior capability
- * currently in effect to the initial user namespace, false if not.
+ * currently in effect to the specified user namespace, false if not.
  *
  * Note that this does not set PF_SUPERPRIV on the task.
  */
-bool has_capability(struct task_struct *t, int cap)
+bool has_ns_capability(struct task_struct *t,
+		       struct user_namespace *ns, int cap)
 {
 	int ret;
 
 	rcu_read_lock();
-	ret = security_capable(__task_cred(t), &init_user_ns, cap);
+	ret = security_capable(__task_cred(t), ns, cap);
 	rcu_read_unlock();
 
 	return (ret == 0);
 }
 
 /**
- * has_capability - Does a task have a capability in a specific user ns
+ * has_capability - Does a task have a capability in init_user_ns
  * @t: The task in question
- * @ns: target user namespace
  * @cap: The capability to be tested for
  *
  * Return true if the specified task has the given superior capability
- * currently in effect to the specified user namespace, false if not.
+ * currently in effect to the initial user namespace, false if not.
  *
  * Note that this does not set PF_SUPERPRIV on the task.
  */
-bool has_ns_capability(struct task_struct *t,
-		       struct user_namespace *ns, int cap)
+bool has_capability(struct task_struct *t, int cap)
 {
-	int ret;
-
-	rcu_read_lock();
-	ret = security_capable(__task_cred(t), ns, cap);
-	rcu_read_unlock();
-
-	return (ret == 0);
+	return has_ns_capability(t, &init_user_ns, cap);
 }
 
 /**
-- 
cgit v1.2.3-71-gd317


From 7b61d648499e74dbec3d4ce645675e0ae040ae78 Mon Sep 17 00:00:00 2001
From: Eric Paris <eparis@redhat.com>
Date: Tue, 3 Jan 2012 12:25:15 -0500
Subject: capabilites: introduce new has_ns_capabilities_noaudit

For consistency in interfaces, introduce a new interface called
has_ns_capabilities_noaudit.  It checks if the given task has the given
capability in the given namespace.  Use this new function by
has_capabilities_noaudit.

Signed-off-by: Eric Paris <eparis@redhat.com>
Acked-by: Serge E. Hallyn <serge.hallyn@canonical.com>
---
 include/linux/capability.h |  2 ++
 kernel/capability.c        | 30 +++++++++++++++++++++++++-----
 2 files changed, 27 insertions(+), 5 deletions(-)

(limited to 'kernel')

diff --git a/include/linux/capability.h b/include/linux/capability.h
index c42112350003..63f59fa8769d 100644
--- a/include/linux/capability.h
+++ b/include/linux/capability.h
@@ -543,6 +543,8 @@ extern bool has_capability(struct task_struct *t, int cap);
 extern bool has_ns_capability(struct task_struct *t,
 			      struct user_namespace *ns, int cap);
 extern bool has_capability_noaudit(struct task_struct *t, int cap);
+extern bool has_ns_capability_noaudit(struct task_struct *t,
+				      struct user_namespace *ns, int cap);
 extern bool capable(int cap);
 extern bool ns_capable(struct user_namespace *ns, int cap);
 extern bool task_ns_capable(struct task_struct *t, int cap);
diff --git a/kernel/capability.c b/kernel/capability.c
index fb815d1b9ea2..d8398e962470 100644
--- a/kernel/capability.c
+++ b/kernel/capability.c
@@ -325,27 +325,47 @@ bool has_capability(struct task_struct *t, int cap)
 }
 
 /**
- * has_capability_noaudit - Does a task have a capability (unaudited)
+ * has_ns_capability_noaudit - Does a task have a capability (unaudited)
+ * in a specific user ns.
  * @t: The task in question
+ * @ns: target user namespace
  * @cap: The capability to be tested for
  *
  * Return true if the specified task has the given superior capability
- * currently in effect to init_user_ns, false if not.  Don't write an
- * audit message for the check.
+ * currently in effect to the specified user namespace, false if not.
+ * Do not write an audit message for the check.
  *
  * Note that this does not set PF_SUPERPRIV on the task.
  */
-bool has_capability_noaudit(struct task_struct *t, int cap)
+bool has_ns_capability_noaudit(struct task_struct *t,
+			       struct user_namespace *ns, int cap)
 {
 	int ret;
 
 	rcu_read_lock();
-	ret = security_capable_noaudit(__task_cred(t), &init_user_ns, cap);
+	ret = security_capable_noaudit(__task_cred(t), ns, cap);
 	rcu_read_unlock();
 
 	return (ret == 0);
 }
 
+/**
+ * has_capability_noaudit - Does a task have a capability (unaudited) in the
+ * initial user ns
+ * @t: The task in question
+ * @cap: The capability to be tested for
+ *
+ * Return true if the specified task has the given superior capability
+ * currently in effect to init_user_ns, false if not.  Don't write an
+ * audit message for the check.
+ *
+ * Note that this does not set PF_SUPERPRIV on the task.
+ */
+bool has_capability_noaudit(struct task_struct *t, int cap)
+{
+	return has_ns_capability_noaudit(t, &init_user_ns, cap);
+}
+
 /**
  * capable - Determine if the current task has a superior capability in effect
  * @cap: The capability to be tested for
-- 
cgit v1.2.3-71-gd317


From 105ddf49cd301b7929a92f269440e8e562ef19db Mon Sep 17 00:00:00 2001
From: Eric Paris <eparis@redhat.com>
Date: Tue, 3 Jan 2012 12:25:15 -0500
Subject: capabilities: style only - move capable below ns_capable

Although the current code is fine for consistency this moves the capable
code below the function it calls in the c file.  It doesn't actually change
code.

Signed-off-by: Eric Paris <eparis@redhat.com>
Acked-by: Serge E. Hallyn <serge.hallyn@canonical.com>
---
 kernel/capability.c | 32 ++++++++++++++++----------------
 1 file changed, 16 insertions(+), 16 deletions(-)

(limited to 'kernel')

diff --git a/kernel/capability.c b/kernel/capability.c
index d8398e962470..5f99e5d68e1f 100644
--- a/kernel/capability.c
+++ b/kernel/capability.c
@@ -366,22 +366,6 @@ bool has_capability_noaudit(struct task_struct *t, int cap)
 	return has_ns_capability_noaudit(t, &init_user_ns, cap);
 }
 
-/**
- * capable - Determine if the current task has a superior capability in effect
- * @cap: The capability to be tested for
- *
- * Return true if the current task has the given superior capability currently
- * available for use, false if not.
- *
- * This sets PF_SUPERPRIV on the task if the capability is available on the
- * assumption that it's about to be used.
- */
-bool capable(int cap)
-{
-	return ns_capable(&init_user_ns, cap);
-}
-EXPORT_SYMBOL(capable);
-
 /**
  * ns_capable - Determine if the current task has a superior capability in effect
  * @ns:  The usernamespace we want the capability in
@@ -408,6 +392,22 @@ bool ns_capable(struct user_namespace *ns, int cap)
 }
 EXPORT_SYMBOL(ns_capable);
 
+/**
+ * capable - Determine if the current task has a superior capability in effect
+ * @cap: The capability to be tested for
+ *
+ * Return true if the current task has the given superior capability currently
+ * available for use, false if not.
+ *
+ * This sets PF_SUPERPRIV on the task if the capability is available on the
+ * assumption that it's about to be used.
+ */
+bool capable(int cap)
+{
+	return ns_capable(&init_user_ns, cap);
+}
+EXPORT_SYMBOL(capable);
+
 /**
  * task_ns_capable - Determine whether current task has a superior
  * capability targeted at a specific task's user namespace.
-- 
cgit v1.2.3-71-gd317


From d2a7009f0bb03fa22ad08dd25472efa0568126b9 Mon Sep 17 00:00:00 2001
From: Eric Paris <eparis@redhat.com>
Date: Tue, 3 Jan 2012 12:25:15 -0500
Subject: capabitlies: ns_capable can use the cap helpers rather than lsm call

Just to reduce the number of places to change if we every change the LSM
hook, use the capability helpers internally when possible.

Signed-off-by: Eric Paris <eparis@redhat.com>
Acked-by: Serge E. Hallyn <serge.hallyn@canonical.com>
---
 kernel/capability.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

(limited to 'kernel')

diff --git a/kernel/capability.c b/kernel/capability.c
index 5f99e5d68e1f..47626446c39a 100644
--- a/kernel/capability.c
+++ b/kernel/capability.c
@@ -384,7 +384,7 @@ bool ns_capable(struct user_namespace *ns, int cap)
 		BUG();
 	}
 
-	if (security_capable(current_cred(), ns, cap) == 0) {
+	if (has_ns_capability(current, ns, cap)) {
 		current->flags |= PF_SUPERPRIV;
 		return true;
 	}
-- 
cgit v1.2.3-71-gd317


From f1c84dae0ecc51aa35c81f19a0ebcd6c0921ddcb Mon Sep 17 00:00:00 2001
From: Eric Paris <eparis@redhat.com>
Date: Tue, 3 Jan 2012 12:25:15 -0500
Subject: capabilities: remove task_ns_* functions

task_ in the front of a function, in the security subsystem anyway, means
to me at least, that we are operating with that task as the subject of the
security decision.  In this case what it means is that we are using current as
the subject but we use the task to get the right namespace.  Who in the world
would ever realize that's what task_ns_capability means just by the name?  This
patch eliminates the task_ns functions entirely and uses the has_ns_capability
function instead.  This means we explicitly open code the ns in question in
the caller.  I think it makes the caller a LOT more clear what is going on.

Signed-off-by: Eric Paris <eparis@redhat.com>
Acked-by: Serge E. Hallyn <serge.hallyn@canonical.com>
---
 include/linux/capability.h |  1 -
 include/linux/cred.h       |  6 ++++--
 kernel/capability.c        | 14 --------------
 kernel/ptrace.c            |  4 ++--
 kernel/sched.c             |  2 +-
 5 files changed, 7 insertions(+), 20 deletions(-)

(limited to 'kernel')

diff --git a/include/linux/capability.h b/include/linux/capability.h
index 63f59fa8769d..e3e8d9cb9b08 100644
--- a/include/linux/capability.h
+++ b/include/linux/capability.h
@@ -547,7 +547,6 @@ extern bool has_ns_capability_noaudit(struct task_struct *t,
 				      struct user_namespace *ns, int cap);
 extern bool capable(int cap);
 extern bool ns_capable(struct user_namespace *ns, int cap);
-extern bool task_ns_capable(struct task_struct *t, int cap);
 extern bool nsown_capable(int cap);
 
 /* audit system wants to get cap info from files as well */
diff --git a/include/linux/cred.h b/include/linux/cred.h
index 40308969ed00..adadf71a7327 100644
--- a/include/linux/cred.h
+++ b/include/linux/cred.h
@@ -358,10 +358,12 @@ static inline void put_cred(const struct cred *_cred)
 #define current_security()	(current_cred_xxx(security))
 
 #ifdef CONFIG_USER_NS
-#define current_user_ns() (current_cred_xxx(user_ns))
+#define current_user_ns()	(current_cred_xxx(user_ns))
+#define task_user_ns(task)	(task_cred_xxx((task), user_ns))
 #else
 extern struct user_namespace init_user_ns;
-#define current_user_ns() (&init_user_ns)
+#define current_user_ns()	(&init_user_ns)
+#define task_user_ns(task)	(&init_user_ns)
 #endif
 
 
diff --git a/kernel/capability.c b/kernel/capability.c
index 47626446c39a..74fb3b603045 100644
--- a/kernel/capability.c
+++ b/kernel/capability.c
@@ -408,20 +408,6 @@ bool capable(int cap)
 }
 EXPORT_SYMBOL(capable);
 
-/**
- * task_ns_capable - Determine whether current task has a superior
- * capability targeted at a specific task's user namespace.
- * @t: The task whose user namespace is targeted.
- * @cap: The capability in question.
- *
- *  Return true if it does, false otherwise.
- */
-bool task_ns_capable(struct task_struct *t, int cap)
-{
-	return ns_capable(task_cred_xxx(t, user)->user_ns, cap);
-}
-EXPORT_SYMBOL(task_ns_capable);
-
 /**
  * nsown_capable - Check superior capability to one's own user_ns
  * @cap: The capability in question
diff --git a/kernel/ptrace.c b/kernel/ptrace.c
index a70d2a5d8c7b..210bbf045ee9 100644
--- a/kernel/ptrace.c
+++ b/kernel/ptrace.c
@@ -196,7 +196,7 @@ ok:
 	smp_rmb();
 	if (task->mm)
 		dumpable = get_dumpable(task->mm);
-	if (!dumpable && !task_ns_capable(task, CAP_SYS_PTRACE))
+	if (!dumpable && !ns_capable(task_user_ns(task), CAP_SYS_PTRACE))
 		return -EPERM;
 
 	return security_ptrace_access_check(task, mode);
@@ -266,7 +266,7 @@ static int ptrace_attach(struct task_struct *task, long request,
 	task->ptrace = PT_PTRACED;
 	if (seize)
 		task->ptrace |= PT_SEIZED;
-	if (task_ns_capable(task, CAP_SYS_PTRACE))
+	if (ns_capable(task_user_ns(task), CAP_SYS_PTRACE))
 		task->ptrace |= PT_PTRACE_CAP;
 
 	__ptrace_link(task, current);
diff --git a/kernel/sched.c b/kernel/sched.c
index b50b0f0c9aa9..5670028a9c16 100644
--- a/kernel/sched.c
+++ b/kernel/sched.c
@@ -5409,7 +5409,7 @@ long sched_setaffinity(pid_t pid, const struct cpumask *in_mask)
 		goto out_free_cpus_allowed;
 	}
 	retval = -EPERM;
-	if (!check_same_owner(p) && !task_ns_capable(p, CAP_SYS_NICE))
+	if (!check_same_owner(p) && !ns_capable(task_user_ns(p), CAP_SYS_NICE))
 		goto out_unlock;
 
 	retval = security_task_setscheduler(p);
-- 
cgit v1.2.3-71-gd317


From 69f594a38967f4540ce7a29b3fd214e68a8330bd Mon Sep 17 00:00:00 2001
From: Eric Paris <eparis@redhat.com>
Date: Tue, 3 Jan 2012 12:25:15 -0500
Subject: ptrace: do not audit capability check when outputing /proc/pid/stat

Reading /proc/pid/stat of another process checks if one has ptrace permissions
on that process.  If one does have permissions it outputs some data about the
process which might have security and attack implications.  If the current
task does not have ptrace permissions the read still works, but those fields
are filled with inocuous (0) values.  Since this check and a subsequent denial
is not a violation of the security policy we should not audit such denials.

This can be quite useful to removing ptrace broadly across a system without
flooding the logs when ps is run or something which harmlessly walks proc.

Signed-off-by: Eric Paris <eparis@redhat.com>
Acked-by: Serge E. Hallyn <serge.hallyn@canonical.com>
---
 fs/proc/array.c          |  2 +-
 include/linux/ptrace.h   |  5 +++--
 kernel/ptrace.c          | 12 ++++++++++--
 security/selinux/hooks.c |  2 +-
 4 files changed, 15 insertions(+), 6 deletions(-)

(limited to 'kernel')

diff --git a/fs/proc/array.c b/fs/proc/array.c
index 3a1dafd228d1..ddffd7a88b97 100644
--- a/fs/proc/array.c
+++ b/fs/proc/array.c
@@ -380,7 +380,7 @@ static int do_task_stat(struct seq_file *m, struct pid_namespace *ns,
 
 	state = *get_task_state(task);
 	vsize = eip = esp = 0;
-	permitted = ptrace_may_access(task, PTRACE_MODE_READ);
+	permitted = ptrace_may_access(task, PTRACE_MODE_READ | PTRACE_MODE_NOAUDIT);
 	mm = get_task_mm(task);
 	if (mm) {
 		vsize = task_vsize(mm);
diff --git a/include/linux/ptrace.h b/include/linux/ptrace.h
index 800f113bea66..a27e56ca41a4 100644
--- a/include/linux/ptrace.h
+++ b/include/linux/ptrace.h
@@ -127,8 +127,9 @@ extern void __ptrace_link(struct task_struct *child,
 			  struct task_struct *new_parent);
 extern void __ptrace_unlink(struct task_struct *child);
 extern void exit_ptrace(struct task_struct *tracer);
-#define PTRACE_MODE_READ   1
-#define PTRACE_MODE_ATTACH 2
+#define PTRACE_MODE_READ	0x01
+#define PTRACE_MODE_ATTACH	0x02
+#define PTRACE_MODE_NOAUDIT	0x04
 /* Returns 0 on success, -errno on denial. */
 extern int __ptrace_may_access(struct task_struct *task, unsigned int mode);
 /* Returns true on success, false on denial. */
diff --git a/kernel/ptrace.c b/kernel/ptrace.c
index 210bbf045ee9..c890ac9a7962 100644
--- a/kernel/ptrace.c
+++ b/kernel/ptrace.c
@@ -161,6 +161,14 @@ int ptrace_check_attach(struct task_struct *child, bool ignore_state)
 	return ret;
 }
 
+static int ptrace_has_cap(struct user_namespace *ns, unsigned int mode)
+{
+	if (mode & PTRACE_MODE_NOAUDIT)
+		return has_ns_capability_noaudit(current, ns, CAP_SYS_PTRACE);
+	else
+		return has_ns_capability(current, ns, CAP_SYS_PTRACE);
+}
+
 int __ptrace_may_access(struct task_struct *task, unsigned int mode)
 {
 	const struct cred *cred = current_cred(), *tcred;
@@ -187,7 +195,7 @@ int __ptrace_may_access(struct task_struct *task, unsigned int mode)
 	     cred->gid == tcred->sgid &&
 	     cred->gid == tcred->gid))
 		goto ok;
-	if (ns_capable(tcred->user->user_ns, CAP_SYS_PTRACE))
+	if (ptrace_has_cap(tcred->user->user_ns, mode))
 		goto ok;
 	rcu_read_unlock();
 	return -EPERM;
@@ -196,7 +204,7 @@ ok:
 	smp_rmb();
 	if (task->mm)
 		dumpable = get_dumpable(task->mm);
-	if (!dumpable && !ns_capable(task_user_ns(task), CAP_SYS_PTRACE))
+	if (!dumpable  && !ptrace_has_cap(task_user_ns(task), mode))
 		return -EPERM;
 
 	return security_ptrace_access_check(task, mode);
diff --git a/security/selinux/hooks.c b/security/selinux/hooks.c
index c9605c4a2e08..14f94cd29c80 100644
--- a/security/selinux/hooks.c
+++ b/security/selinux/hooks.c
@@ -1809,7 +1809,7 @@ static int selinux_ptrace_access_check(struct task_struct *child,
 	if (rc)
 		return rc;
 
-	if (mode == PTRACE_MODE_READ) {
+	if (mode & PTRACE_MODE_READ) {
 		u32 sid = current_sid();
 		u32 csid = task_sid(child);
 		return avc_has_perm(sid, csid, SECCLASS_FILE, FILE__READ, NULL);
-- 
cgit v1.2.3-71-gd317


From fd778461524849afd035679030ae8e8873c72b81 Mon Sep 17 00:00:00 2001
From: Eric Paris <eparis@redhat.com>
Date: Tue, 3 Jan 2012 12:25:16 -0500
Subject: security: remove the security_netlink_recv hook as it is equivalent
 to capable()

Once upon a time netlink was not sync and we had to get the effective
capabilities from the skb that was being received.  Today we instead get
the capabilities from the current task.  This has rendered the entire
purpose of the hook moot as it is now functionally equivalent to the
capable() call.

Signed-off-by: Eric Paris <eparis@redhat.com>
---
 drivers/scsi/scsi_netlink.c     |  2 +-
 include/linux/security.h        | 14 --------------
 kernel/audit.c                  |  4 ++--
 net/core/rtnetlink.c            |  2 +-
 net/decnet/netfilter/dn_rtmsg.c |  2 +-
 net/ipv4/netfilter/ip_queue.c   |  2 +-
 net/ipv6/netfilter/ip6_queue.c  |  2 +-
 net/netfilter/nfnetlink.c       |  2 +-
 net/netlink/genetlink.c         |  2 +-
 net/xfrm/xfrm_user.c            |  2 +-
 security/capability.c           |  1 -
 security/commoncap.c            |  8 --------
 security/security.c             |  6 ------
 security/selinux/hooks.c        | 19 -------------------
 14 files changed, 10 insertions(+), 58 deletions(-)

(limited to 'kernel')

diff --git a/drivers/scsi/scsi_netlink.c b/drivers/scsi/scsi_netlink.c
index 26a8a45584ef..feee1cc39ea0 100644
--- a/drivers/scsi/scsi_netlink.c
+++ b/drivers/scsi/scsi_netlink.c
@@ -111,7 +111,7 @@ scsi_nl_rcv_msg(struct sk_buff *skb)
 			goto next_msg;
 		}
 
-		if (security_netlink_recv(skb, CAP_SYS_ADMIN)) {
+		if (!capable(CAP_SYS_ADMIN)) {
 			err = -EPERM;
 			goto next_msg;
 		}
diff --git a/include/linux/security.h b/include/linux/security.h
index e345a9313a60..ba2d531c123f 100644
--- a/include/linux/security.h
+++ b/include/linux/security.h
@@ -95,7 +95,6 @@ struct xfrm_user_sec_ctx;
 struct seq_file;
 
 extern int cap_netlink_send(struct sock *sk, struct sk_buff *skb);
-extern int cap_netlink_recv(struct sk_buff *skb, int cap);
 
 void reset_security_ops(void);
 
@@ -792,12 +791,6 @@ static inline void security_free_mnt_opts(struct security_mnt_opts *opts)
  *	@skb contains the sk_buff structure for the netlink message.
  *	Return 0 if the information was successfully saved and message
  *	is allowed to be transmitted.
- * @netlink_recv:
- *	Check permission before processing the received netlink message in
- *	@skb.
- *	@skb contains the sk_buff structure for the netlink message.
- *	@cap indicates the capability required
- *	Return 0 if permission is granted.
  *
  * Security hooks for Unix domain networking.
  *
@@ -1556,7 +1549,6 @@ struct security_operations {
 			  struct sembuf *sops, unsigned nsops, int alter);
 
 	int (*netlink_send) (struct sock *sk, struct sk_buff *skb);
-	int (*netlink_recv) (struct sk_buff *skb, int cap);
 
 	void (*d_instantiate) (struct dentry *dentry, struct inode *inode);
 
@@ -1803,7 +1795,6 @@ void security_d_instantiate(struct dentry *dentry, struct inode *inode);
 int security_getprocattr(struct task_struct *p, char *name, char **value);
 int security_setprocattr(struct task_struct *p, char *name, void *value, size_t size);
 int security_netlink_send(struct sock *sk, struct sk_buff *skb);
-int security_netlink_recv(struct sk_buff *skb, int cap);
 int security_secid_to_secctx(u32 secid, char **secdata, u32 *seclen);
 int security_secctx_to_secid(const char *secdata, u32 seclen, u32 *secid);
 void security_release_secctx(char *secdata, u32 seclen);
@@ -2478,11 +2469,6 @@ static inline int security_netlink_send(struct sock *sk, struct sk_buff *skb)
 	return cap_netlink_send(sk, skb);
 }
 
-static inline int security_netlink_recv(struct sk_buff *skb, int cap)
-{
-	return cap_netlink_recv(skb, cap);
-}
-
 static inline int security_secid_to_secctx(u32 secid, char **secdata, u32 *seclen)
 {
 	return -EOPNOTSUPP;
diff --git a/kernel/audit.c b/kernel/audit.c
index 0a1355ca3d79..f3ba55fa0b70 100644
--- a/kernel/audit.c
+++ b/kernel/audit.c
@@ -601,13 +601,13 @@ static int audit_netlink_ok(struct sk_buff *skb, u16 msg_type)
 	case AUDIT_TTY_SET:
 	case AUDIT_TRIM:
 	case AUDIT_MAKE_EQUIV:
-		if (security_netlink_recv(skb, CAP_AUDIT_CONTROL))
+		if (!capable(CAP_AUDIT_CONTROL))
 			err = -EPERM;
 		break;
 	case AUDIT_USER:
 	case AUDIT_FIRST_USER_MSG ... AUDIT_LAST_USER_MSG:
 	case AUDIT_FIRST_USER_MSG2 ... AUDIT_LAST_USER_MSG2:
-		if (security_netlink_recv(skb, CAP_AUDIT_WRITE))
+		if (!capable(CAP_AUDIT_WRITE))
 			err = -EPERM;
 		break;
 	default:  /* bad msg */
diff --git a/net/core/rtnetlink.c b/net/core/rtnetlink.c
index 99d9e953fe39..d3a628196716 100644
--- a/net/core/rtnetlink.c
+++ b/net/core/rtnetlink.c
@@ -1931,7 +1931,7 @@ static int rtnetlink_rcv_msg(struct sk_buff *skb, struct nlmsghdr *nlh)
 	sz_idx = type>>2;
 	kind = type&3;
 
-	if (kind != 2 && security_netlink_recv(skb, CAP_NET_ADMIN))
+	if (kind != 2 && !capable(CAP_NET_ADMIN))
 		return -EPERM;
 
 	if (kind == 2 && nlh->nlmsg_flags&NLM_F_DUMP) {
diff --git a/net/decnet/netfilter/dn_rtmsg.c b/net/decnet/netfilter/dn_rtmsg.c
index 69975e0bcdea..1531135130db 100644
--- a/net/decnet/netfilter/dn_rtmsg.c
+++ b/net/decnet/netfilter/dn_rtmsg.c
@@ -108,7 +108,7 @@ static inline void dnrmg_receive_user_skb(struct sk_buff *skb)
 	if (nlh->nlmsg_len < sizeof(*nlh) || skb->len < nlh->nlmsg_len)
 		return;
 
-	if (security_netlink_recv(skb, CAP_NET_ADMIN))
+	if (!capable(CAP_NET_ADMIN))
 		RCV_SKB_FAIL(-EPERM);
 
 	/* Eventually we might send routing messages too */
diff --git a/net/ipv4/netfilter/ip_queue.c b/net/ipv4/netfilter/ip_queue.c
index e59aabd0eae4..ffabb2674718 100644
--- a/net/ipv4/netfilter/ip_queue.c
+++ b/net/ipv4/netfilter/ip_queue.c
@@ -430,7 +430,7 @@ __ipq_rcv_skb(struct sk_buff *skb)
 	if (type <= IPQM_BASE)
 		return;
 
-	if (security_netlink_recv(skb, CAP_NET_ADMIN))
+	if (!capable(CAP_NET_ADMIN))
 		RCV_SKB_FAIL(-EPERM);
 
 	spin_lock_bh(&queue_lock);
diff --git a/net/ipv6/netfilter/ip6_queue.c b/net/ipv6/netfilter/ip6_queue.c
index e63c3972a739..5e5ce778be7f 100644
--- a/net/ipv6/netfilter/ip6_queue.c
+++ b/net/ipv6/netfilter/ip6_queue.c
@@ -431,7 +431,7 @@ __ipq_rcv_skb(struct sk_buff *skb)
 	if (type <= IPQM_BASE)
 		return;
 
-	if (security_netlink_recv(skb, CAP_NET_ADMIN))
+	if (!capable(CAP_NET_ADMIN))
 		RCV_SKB_FAIL(-EPERM);
 
 	spin_lock_bh(&queue_lock);
diff --git a/net/netfilter/nfnetlink.c b/net/netfilter/nfnetlink.c
index 1905976b5135..e6c2b8f32180 100644
--- a/net/netfilter/nfnetlink.c
+++ b/net/netfilter/nfnetlink.c
@@ -130,7 +130,7 @@ static int nfnetlink_rcv_msg(struct sk_buff *skb, struct nlmsghdr *nlh)
 	const struct nfnetlink_subsystem *ss;
 	int type, err;
 
-	if (security_netlink_recv(skb, CAP_NET_ADMIN))
+	if (!capable(CAP_NET_ADMIN))
 		return -EPERM;
 
 	/* All the messages must at least contain nfgenmsg */
diff --git a/net/netlink/genetlink.c b/net/netlink/genetlink.c
index 482fa571b4ee..05fedbf489a5 100644
--- a/net/netlink/genetlink.c
+++ b/net/netlink/genetlink.c
@@ -516,7 +516,7 @@ static int genl_rcv_msg(struct sk_buff *skb, struct nlmsghdr *nlh)
 		return -EOPNOTSUPP;
 
 	if ((ops->flags & GENL_ADMIN_PERM) &&
-	    security_netlink_recv(skb, CAP_NET_ADMIN))
+	    !capable(CAP_NET_ADMIN))
 		return -EPERM;
 
 	if (nlh->nlmsg_flags & NLM_F_DUMP) {
diff --git a/net/xfrm/xfrm_user.c b/net/xfrm/xfrm_user.c
index 0256b8a0a7cf..71de86698efa 100644
--- a/net/xfrm/xfrm_user.c
+++ b/net/xfrm/xfrm_user.c
@@ -2290,7 +2290,7 @@ static int xfrm_user_rcv_msg(struct sk_buff *skb, struct nlmsghdr *nlh)
 	link = &xfrm_dispatch[type];
 
 	/* All operations require privileges, even GET */
-	if (security_netlink_recv(skb, CAP_NET_ADMIN))
+	if (!capable(CAP_NET_ADMIN))
 		return -EPERM;
 
 	if ((type == (XFRM_MSG_GETSA - XFRM_MSG_BASE) ||
diff --git a/security/capability.c b/security/capability.c
index 2984ea4f776f..a2c064d10448 100644
--- a/security/capability.c
+++ b/security/capability.c
@@ -999,7 +999,6 @@ void __init security_fixup_ops(struct security_operations *ops)
 	set_to_cap_if_null(ops, sem_semctl);
 	set_to_cap_if_null(ops, sem_semop);
 	set_to_cap_if_null(ops, netlink_send);
-	set_to_cap_if_null(ops, netlink_recv);
 	set_to_cap_if_null(ops, d_instantiate);
 	set_to_cap_if_null(ops, getprocattr);
 	set_to_cap_if_null(ops, setprocattr);
diff --git a/security/commoncap.c b/security/commoncap.c
index 89f02ff66af9..7817a763444d 100644
--- a/security/commoncap.c
+++ b/security/commoncap.c
@@ -56,14 +56,6 @@ int cap_netlink_send(struct sock *sk, struct sk_buff *skb)
 	return 0;
 }
 
-int cap_netlink_recv(struct sk_buff *skb, int cap)
-{
-	if (!cap_raised(current_cap(), cap))
-		return -EPERM;
-	return 0;
-}
-EXPORT_SYMBOL(cap_netlink_recv);
-
 /**
  * cap_capable - Determine whether a task has a particular effective capability
  * @cred: The credentials to use
diff --git a/security/security.c b/security/security.c
index 8900c5c4db5c..85481a9c5632 100644
--- a/security/security.c
+++ b/security/security.c
@@ -922,12 +922,6 @@ int security_netlink_send(struct sock *sk, struct sk_buff *skb)
 	return security_ops->netlink_send(sk, skb);
 }
 
-int security_netlink_recv(struct sk_buff *skb, int cap)
-{
-	return security_ops->netlink_recv(skb, cap);
-}
-EXPORT_SYMBOL(security_netlink_recv);
-
 int security_secid_to_secctx(u32 secid, char **secdata, u32 *seclen)
 {
 	return security_ops->secid_to_secctx(secid, secdata, seclen);
diff --git a/security/selinux/hooks.c b/security/selinux/hooks.c
index 14f94cd29c80..3e37d25a9bbe 100644
--- a/security/selinux/hooks.c
+++ b/security/selinux/hooks.c
@@ -4713,24 +4713,6 @@ static int selinux_netlink_send(struct sock *sk, struct sk_buff *skb)
 	return selinux_nlmsg_perm(sk, skb);
 }
 
-static int selinux_netlink_recv(struct sk_buff *skb, int capability)
-{
-	int err;
-	struct common_audit_data ad;
-	u32 sid;
-
-	err = cap_netlink_recv(skb, capability);
-	if (err)
-		return err;
-
-	COMMON_AUDIT_DATA_INIT(&ad, CAP);
-	ad.u.cap = capability;
-
-	security_task_getsecid(current, &sid);
-	return avc_has_perm(sid, sid, SECCLASS_CAPABILITY,
-			    CAP_TO_MASK(capability), &ad);
-}
-
 static int ipc_alloc_security(struct task_struct *task,
 			      struct kern_ipc_perm *perm,
 			      u16 sclass)
@@ -5459,7 +5441,6 @@ static struct security_operations selinux_ops = {
 	.vm_enough_memory =		selinux_vm_enough_memory,
 
 	.netlink_send =			selinux_netlink_send,
-	.netlink_recv =			selinux_netlink_recv,
 
 	.bprm_set_creds =		selinux_bprm_set_creds,
 	.bprm_committing_creds =	selinux_bprm_committing_creds,
-- 
cgit v1.2.3-71-gd317


From d8c9584ea2a92879f471fd3a2be3af6c534fb035 Mon Sep 17 00:00:00 2001
From: Al Viro <viro@zeniv.linux.org.uk>
Date: Wed, 7 Dec 2011 18:16:57 -0500
Subject: vfs: prefer ->dentry->d_sb to ->mnt->mnt_sb

Signed-off-by: Al Viro <viro@zeniv.linux.org.uk>
---
 fs/autofs4/dev-ioctl.c     | 10 +++++-----
 fs/binfmt_misc.c           |  6 +++---
 fs/ext3/super.c            |  2 +-
 fs/ext4/super.c            |  2 +-
 fs/fhandle.c               |  4 ++--
 fs/lockd/svcsubs.c         |  2 +-
 fs/nfsd/nfsctl.c           |  2 +-
 fs/quota/dquot.c           |  2 +-
 fs/reiserfs/super.c        |  2 +-
 fs/sysv/itree.c            |  2 +-
 init/do_mounts.c           | 10 ++++++----
 kernel/acct.c              |  2 +-
 security/selinux/hooks.c   |  2 +-
 security/smack/smack_lsm.c |  4 ++--
 14 files changed, 27 insertions(+), 25 deletions(-)

(limited to 'kernel')

diff --git a/fs/autofs4/dev-ioctl.c b/fs/autofs4/dev-ioctl.c
index 509fe1eb66ae..76741d8d7786 100644
--- a/fs/autofs4/dev-ioctl.c
+++ b/fs/autofs4/dev-ioctl.c
@@ -194,7 +194,7 @@ static int find_autofs_mount(const char *pathname,
 		return err;
 	err = -ENOENT;
 	while (path.dentry == path.mnt->mnt_root) {
-		if (path.mnt->mnt_sb->s_magic == AUTOFS_SUPER_MAGIC) {
+		if (path.dentry->d_sb->s_magic == AUTOFS_SUPER_MAGIC) {
 			if (test(&path, data)) {
 				path_get(&path);
 				if (!err) /* already found some */
@@ -212,7 +212,7 @@ static int find_autofs_mount(const char *pathname,
 
 static int test_by_dev(struct path *path, void *p)
 {
-	return path->mnt->mnt_sb->s_dev == *(dev_t *)p;
+	return path->dentry->d_sb->s_dev == *(dev_t *)p;
 }
 
 static int test_by_type(struct path *path, void *p)
@@ -538,11 +538,11 @@ static int autofs_dev_ioctl_ismountpoint(struct file *fp,
 			err = find_autofs_mount(name, &path, test_by_type, &type);
 		if (err)
 			goto out;
-		devid = new_encode_dev(path.mnt->mnt_sb->s_dev);
+		devid = new_encode_dev(path.dentry->d_sb->s_dev);
 		err = 0;
 		if (path.mnt->mnt_root == path.dentry) {
 			err = 1;
-			magic = path.mnt->mnt_sb->s_magic;
+			magic = path.dentry->d_sb->s_magic;
 		}
 	} else {
 		dev_t dev = sbi->sb->s_dev;
@@ -556,7 +556,7 @@ static int autofs_dev_ioctl_ismountpoint(struct file *fp,
 		err = have_submounts(path.dentry);
 
 		if (follow_down_one(&path))
-			magic = path.mnt->mnt_sb->s_magic;
+			magic = path.dentry->d_sb->s_magic;
 	}
 
 	param->ismountpoint.out.devid = devid;
diff --git a/fs/binfmt_misc.c b/fs/binfmt_misc.c
index 1e9edbdeda7e..a9198dfd5f85 100644
--- a/fs/binfmt_misc.c
+++ b/fs/binfmt_misc.c
@@ -560,7 +560,7 @@ static ssize_t bm_entry_write(struct file *file, const char __user *buffer,
 			break;
 		case 2: set_bit(Enabled, &e->flags);
 			break;
-		case 3: root = dget(file->f_path.mnt->mnt_sb->s_root);
+		case 3: root = dget(file->f_path.dentry->d_sb->s_root);
 			mutex_lock(&root->d_inode->i_mutex);
 
 			kill_node(e);
@@ -587,7 +587,7 @@ static ssize_t bm_register_write(struct file *file, const char __user *buffer,
 	Node *e;
 	struct inode *inode;
 	struct dentry *root, *dentry;
-	struct super_block *sb = file->f_path.mnt->mnt_sb;
+	struct super_block *sb = file->f_path.dentry->d_sb;
 	int err = 0;
 
 	e = create_entry(buffer, count);
@@ -666,7 +666,7 @@ static ssize_t bm_status_write(struct file * file, const char __user * buffer,
 	switch (res) {
 		case 1: enabled = 0; break;
 		case 2: enabled = 1; break;
-		case 3: root = dget(file->f_path.mnt->mnt_sb->s_root);
+		case 3: root = dget(file->f_path.dentry->d_sb->s_root);
 			mutex_lock(&root->d_inode->i_mutex);
 
 			while (!list_empty(&entries))
diff --git a/fs/ext3/super.c b/fs/ext3/super.c
index 668c931b2214..7e8944ee67c6 100644
--- a/fs/ext3/super.c
+++ b/fs/ext3/super.c
@@ -2909,7 +2909,7 @@ static int ext3_quota_on(struct super_block *sb, int type, int format_id,
 		return -EINVAL;
 
 	/* Quotafile not on the same filesystem? */
-	if (path->mnt->mnt_sb != sb)
+	if (path->dentry->d_sb != sb)
 		return -EXDEV;
 	/* Journaling quota? */
 	if (EXT3_SB(sb)->s_qf_names[type]) {
diff --git a/fs/ext4/super.c b/fs/ext4/super.c
index 2a1a9e63cffa..b739b210a616 100644
--- a/fs/ext4/super.c
+++ b/fs/ext4/super.c
@@ -4781,7 +4781,7 @@ static int ext4_quota_on(struct super_block *sb, int type, int format_id,
 		return -EINVAL;
 
 	/* Quotafile not on the same filesystem? */
-	if (path->mnt->mnt_sb != sb)
+	if (path->dentry->d_sb != sb)
 		return -EXDEV;
 	/* Journaling quota? */
 	if (EXT4_SB(sb)->s_qf_names[type]) {
diff --git a/fs/fhandle.c b/fs/fhandle.c
index 5eff7116951e..a48e4a139be1 100644
--- a/fs/fhandle.c
+++ b/fs/fhandle.c
@@ -25,8 +25,8 @@ static long do_sys_name_to_handle(struct path *path,
 	 * We need t make sure wether the file system
 	 * support decoding of the file handle
 	 */
-	if (!path->mnt->mnt_sb->s_export_op ||
-	    !path->mnt->mnt_sb->s_export_op->fh_to_dentry)
+	if (!path->dentry->d_sb->s_export_op ||
+	    !path->dentry->d_sb->s_export_op->fh_to_dentry)
 		return -EOPNOTSUPP;
 
 	if (copy_from_user(&f_handle, ufh, sizeof(struct file_handle)))
diff --git a/fs/lockd/svcsubs.c b/fs/lockd/svcsubs.c
index 1ca0679c80bf..2240d384d787 100644
--- a/fs/lockd/svcsubs.c
+++ b/fs/lockd/svcsubs.c
@@ -403,7 +403,7 @@ nlmsvc_match_sb(void *datap, struct nlm_file *file)
 {
 	struct super_block *sb = datap;
 
-	return sb == file->f_file->f_path.mnt->mnt_sb;
+	return sb == file->f_file->f_path.dentry->d_sb;
 }
 
 /**
diff --git a/fs/nfsd/nfsctl.c b/fs/nfsd/nfsctl.c
index c45a2ea4a090..bb4a11d58a5a 100644
--- a/fs/nfsd/nfsctl.c
+++ b/fs/nfsd/nfsctl.c
@@ -272,7 +272,7 @@ static ssize_t write_unlock_fs(struct file *file, char *buf, size_t size)
 	 * 2.  Is that directory a mount point, or
 	 * 3.  Is that directory the root of an exported file system?
 	 */
-	error = nlmsvc_unlock_all_by_sb(path.mnt->mnt_sb);
+	error = nlmsvc_unlock_all_by_sb(path.dentry->d_sb);
 
 	path_put(&path);
 	return error;
diff --git a/fs/quota/dquot.c b/fs/quota/dquot.c
index 5d81e92daf83..5ec59b20cf76 100644
--- a/fs/quota/dquot.c
+++ b/fs/quota/dquot.c
@@ -2198,7 +2198,7 @@ int dquot_quota_on(struct super_block *sb, int type, int format_id,
 	if (error)
 		return error;
 	/* Quota file not on the same filesystem? */
-	if (path->mnt->mnt_sb != sb)
+	if (path->dentry->d_sb != sb)
 		error = -EXDEV;
 	else
 		error = vfs_load_quota_inode(path->dentry->d_inode, type,
diff --git a/fs/reiserfs/super.c b/fs/reiserfs/super.c
index 5a4cae7efc43..1abffa451529 100644
--- a/fs/reiserfs/super.c
+++ b/fs/reiserfs/super.c
@@ -2058,7 +2058,7 @@ static int reiserfs_quota_on(struct super_block *sb, int type, int format_id,
 		return -EINVAL;
 
 	/* Quotafile not on the same filesystem? */
-	if (path->mnt->mnt_sb != sb) {
+	if (path->dentry->d_sb != sb) {
 		err = -EXDEV;
 		goto out;
 	}
diff --git a/fs/sysv/itree.c b/fs/sysv/itree.c
index fa8d43c92bb8..90b54b438789 100644
--- a/fs/sysv/itree.c
+++ b/fs/sysv/itree.c
@@ -442,7 +442,7 @@ static unsigned sysv_nblocks(struct super_block *s, loff_t size)
 
 int sysv_getattr(struct vfsmount *mnt, struct dentry *dentry, struct kstat *stat)
 {
-	struct super_block *s = mnt->mnt_sb;
+	struct super_block *s = dentry->d_sb;
 	generic_fillattr(dentry->d_inode, stat);
 	stat->blocks = (s->s_blocksize / 512) * sysv_nblocks(s, stat->size);
 	stat->blksize = s->s_blocksize;
diff --git a/init/do_mounts.c b/init/do_mounts.c
index 0f6e1d985a3b..b2eee02e0f83 100644
--- a/init/do_mounts.c
+++ b/init/do_mounts.c
@@ -325,17 +325,19 @@ static void __init get_fs_names(char *page)
 
 static int __init do_mount_root(char *name, char *fs, int flags, void *data)
 {
+	struct super_block *s;
 	int err = sys_mount(name, "/root", fs, flags, data);
 	if (err)
 		return err;
 
 	sys_chdir((const char __user __force *)"/root");
-	ROOT_DEV = current->fs->pwd.mnt->mnt_sb->s_dev;
+	s = current->fs->pwd.dentry->d_sb;
+	ROOT_DEV = s->s_dev;
 	printk(KERN_INFO
 	       "VFS: Mounted root (%s filesystem)%s on device %u:%u.\n",
-	       current->fs->pwd.mnt->mnt_sb->s_type->name,
-	       current->fs->pwd.mnt->mnt_sb->s_flags & MS_RDONLY ?
-	       " readonly" : "", MAJOR(ROOT_DEV), MINOR(ROOT_DEV));
+	       s->s_type->name,
+	       s->s_flags & MS_RDONLY ?  " readonly" : "",
+	       MAJOR(ROOT_DEV), MINOR(ROOT_DEV));
 	return 0;
 }
 
diff --git a/kernel/acct.c b/kernel/acct.c
index 8cba12429d82..9663eb8058f9 100644
--- a/kernel/acct.c
+++ b/kernel/acct.c
@@ -315,7 +315,7 @@ void acct_auto_close(struct super_block *sb)
 	spin_lock(&acct_lock);
 restart:
 	list_for_each_entry(acct, &acct_list, list)
-		if (acct->file && acct->file->f_path.mnt->mnt_sb == sb) {
+		if (acct->file && acct->file->f_path.dentry->d_sb == sb) {
 			acct_file_reopen(acct, NULL, NULL);
 			goto restart;
 		}
diff --git a/security/selinux/hooks.c b/security/selinux/hooks.c
index 4def4d92aaee..57546cfb18ec 100644
--- a/security/selinux/hooks.c
+++ b/security/selinux/hooks.c
@@ -2507,7 +2507,7 @@ static int selinux_mount(char *dev_name,
 	const struct cred *cred = current_cred();
 
 	if (flags & MS_REMOUNT)
-		return superblock_has_perm(cred, path->mnt->mnt_sb,
+		return superblock_has_perm(cred, path->dentry->d_sb,
 					   FILESYSTEM__REMOUNT, NULL);
 	else
 		return path_has_perm(cred, path, FILE__MOUNTON);
diff --git a/security/smack/smack_lsm.c b/security/smack/smack_lsm.c
index 7db62b48eb42..e8af5b0ba80f 100644
--- a/security/smack/smack_lsm.c
+++ b/security/smack/smack_lsm.c
@@ -406,7 +406,7 @@ static int smack_sb_statfs(struct dentry *dentry)
 static int smack_sb_mount(char *dev_name, struct path *path,
 			  char *type, unsigned long flags, void *data)
 {
-	struct superblock_smack *sbp = path->mnt->mnt_sb->s_security;
+	struct superblock_smack *sbp = path->dentry->d_sb->s_security;
 	struct smk_audit_info ad;
 
 	smk_ad_init(&ad, __func__, LSM_AUDIT_DATA_PATH);
@@ -435,7 +435,7 @@ static int smack_sb_umount(struct vfsmount *mnt, int flags)
 	smk_ad_init(&ad, __func__, LSM_AUDIT_DATA_PATH);
 	smk_ad_setfield_u_fs_path(&ad, path);
 
-	sbp = mnt->mnt_sb->s_security;
+	sbp = path.dentry->d_sb->s_security;
 	return smk_curacc(sbp->smk_floor, MAY_WRITE, &ad);
 }
 
-- 
cgit v1.2.3-71-gd317


From 34c80b1d93e6e20ca9dea0baf583a5b5510d92d4 Mon Sep 17 00:00:00 2001
From: Al Viro <viro@zeniv.linux.org.uk>
Date: Thu, 8 Dec 2011 21:32:45 -0500
Subject: vfs: switch ->show_options() to struct dentry *

Signed-off-by: Al Viro <viro@zeniv.linux.org.uk>
---
 Documentation/filesystems/Locking |  2 +-
 Documentation/filesystems/vfs.txt |  2 +-
 arch/s390/hypfs/inode.c           |  4 ++--
 drivers/staging/pohmelfs/inode.c  |  4 ++--
 drivers/usb/core/inode.c          |  2 +-
 fs/adfs/super.c                   |  4 ++--
 fs/autofs4/inode.c                |  6 +++---
 fs/btrfs/super.c                  |  4 ++--
 fs/ceph/super.c                   |  6 +++---
 fs/cifs/cifsfs.c                  |  6 +++---
 fs/devpts/inode.c                 |  4 ++--
 fs/ecryptfs/super.c               |  4 ++--
 fs/ext2/super.c                   |  4 ++--
 fs/ext3/super.c                   |  4 ++--
 fs/ext4/super.c                   |  4 ++--
 fs/fat/inode.c                    |  6 +++---
 fs/fuse/inode.c                   | 10 +++++-----
 fs/gfs2/super.c                   |  8 ++++----
 fs/hfs/super.c                    |  4 ++--
 fs/hfsplus/hfsplus_fs.h           |  2 +-
 fs/hfsplus/options.c              |  4 ++--
 fs/hostfs/hostfs_kern.c           |  4 ++--
 fs/jffs2/super.c                  |  4 ++--
 fs/jfs/super.c                    |  4 ++--
 fs/namespace.c                    |  4 ++--
 fs/ncpfs/inode.c                  |  6 +++---
 fs/nfs/super.c                    |  6 +++---
 fs/nilfs2/super.c                 |  6 +++---
 fs/ntfs/inode.c                   |  8 ++++----
 fs/ntfs/inode.h                   |  2 +-
 fs/ocfs2/super.c                  |  9 ++++-----
 fs/proc_namespace.c               |  4 ++--
 fs/ubifs/super.c                  |  4 ++--
 fs/udf/super.c                    |  6 +++---
 fs/ufs/super.c                    |  4 ++--
 fs/xfs/xfs_super.c                |  4 ++--
 include/linux/fs.h                |  4 ++--
 kernel/cgroup.c                   |  4 ++--
 mm/shmem.c                        |  4 ++--
 39 files changed, 90 insertions(+), 91 deletions(-)

(limited to 'kernel')

diff --git a/Documentation/filesystems/Locking b/Documentation/filesystems/Locking
index 9e9f30b9f46b..4fca82e5276e 100644
--- a/Documentation/filesystems/Locking
+++ b/Documentation/filesystems/Locking
@@ -117,7 +117,7 @@ prototypes:
 	int (*statfs) (struct dentry *, struct kstatfs *);
 	int (*remount_fs) (struct super_block *, int *, char *);
 	void (*umount_begin) (struct super_block *);
-	int (*show_options)(struct seq_file *, struct vfsmount *);
+	int (*show_options)(struct seq_file *, struct dentry *);
 	ssize_t (*quota_read)(struct super_block *, int, char *, size_t, loff_t);
 	ssize_t (*quota_write)(struct super_block *, int, const char *, size_t, loff_t);
 	int (*bdev_try_to_free_page)(struct super_block*, struct page*, gfp_t);
diff --git a/Documentation/filesystems/vfs.txt b/Documentation/filesystems/vfs.txt
index 4b9f0d092a79..3d9393b845b8 100644
--- a/Documentation/filesystems/vfs.txt
+++ b/Documentation/filesystems/vfs.txt
@@ -225,7 +225,7 @@ struct super_operations {
         void (*clear_inode) (struct inode *);
         void (*umount_begin) (struct super_block *);
 
-        int (*show_options)(struct seq_file *, struct vfsmount *);
+        int (*show_options)(struct seq_file *, struct dentry *);
 
         ssize_t (*quota_read)(struct super_block *, int, char *, size_t, loff_t);
         ssize_t (*quota_write)(struct super_block *, int, const char *, size_t, loff_t);
diff --git a/arch/s390/hypfs/inode.c b/arch/s390/hypfs/inode.c
index 98efd2d6207a..8a2a887478cc 100644
--- a/arch/s390/hypfs/inode.c
+++ b/arch/s390/hypfs/inode.c
@@ -259,9 +259,9 @@ static int hypfs_parse_options(char *options, struct super_block *sb)
 	return 0;
 }
 
-static int hypfs_show_options(struct seq_file *s, struct vfsmount *mnt)
+static int hypfs_show_options(struct seq_file *s, struct dentry *root)
 {
-	struct hypfs_sb_info *hypfs_info = mnt->mnt_sb->s_fs_info;
+	struct hypfs_sb_info *hypfs_info = root->d_sb->s_fs_info;
 
 	seq_printf(s, ",uid=%u", hypfs_info->uid);
 	seq_printf(s, ",gid=%u", hypfs_info->gid);
diff --git a/drivers/staging/pohmelfs/inode.c b/drivers/staging/pohmelfs/inode.c
index 91ec29e112bc..807e3f324113 100644
--- a/drivers/staging/pohmelfs/inode.c
+++ b/drivers/staging/pohmelfs/inode.c
@@ -1369,9 +1369,9 @@ static int pohmelfs_statfs(struct dentry *dentry, struct kstatfs *buf)
 	return 0;
 }
 
-static int pohmelfs_show_options(struct seq_file *seq, struct vfsmount *vfs)
+static int pohmelfs_show_options(struct seq_file *seq, struct dentry *root)
 {
-	struct pohmelfs_sb *psb = POHMELFS_SB(vfs->mnt_sb);
+	struct pohmelfs_sb *psb = POHMELFS_SB(root->d_sb);
 
 	seq_printf(seq, ",idx=%u", psb->idx);
 	seq_printf(seq, ",trans_scan_timeout=%u", jiffies_to_msecs(psb->trans_scan_timeout));
diff --git a/drivers/usb/core/inode.c b/drivers/usb/core/inode.c
index 2b60af2ce3ba..9e186f3da839 100644
--- a/drivers/usb/core/inode.c
+++ b/drivers/usb/core/inode.c
@@ -65,7 +65,7 @@ static umode_t devmode = USBFS_DEFAULT_DEVMODE;
 static umode_t busmode = USBFS_DEFAULT_BUSMODE;
 static umode_t listmode = USBFS_DEFAULT_LISTMODE;
 
-static int usbfs_show_options(struct seq_file *seq, struct vfsmount *mnt)
+static int usbfs_show_options(struct seq_file *seq, struct dentry *root)
 {
 	if (devuid != 0)
 		seq_printf(seq, ",devuid=%u", devuid);
diff --git a/fs/adfs/super.c b/fs/adfs/super.c
index c8bf36a1996a..8e3b36ace305 100644
--- a/fs/adfs/super.c
+++ b/fs/adfs/super.c
@@ -126,9 +126,9 @@ static void adfs_put_super(struct super_block *sb)
 	sb->s_fs_info = NULL;
 }
 
-static int adfs_show_options(struct seq_file *seq, struct vfsmount *mnt)
+static int adfs_show_options(struct seq_file *seq, struct dentry *root)
 {
-	struct adfs_sb_info *asb = ADFS_SB(mnt->mnt_sb);
+	struct adfs_sb_info *asb = ADFS_SB(root->d_sb);
 
 	if (asb->s_uid != 0)
 		seq_printf(seq, ",uid=%u", asb->s_uid);
diff --git a/fs/autofs4/inode.c b/fs/autofs4/inode.c
index f799efad52a8..2ba44c79d548 100644
--- a/fs/autofs4/inode.c
+++ b/fs/autofs4/inode.c
@@ -70,10 +70,10 @@ out_kill_sb:
 	kill_litter_super(sb);
 }
 
-static int autofs4_show_options(struct seq_file *m, struct vfsmount *mnt)
+static int autofs4_show_options(struct seq_file *m, struct dentry *root)
 {
-	struct autofs_sb_info *sbi = autofs4_sbi(mnt->mnt_sb);
-	struct inode *root_inode = mnt->mnt_sb->s_root->d_inode;
+	struct autofs_sb_info *sbi = autofs4_sbi(root->d_sb);
+	struct inode *root_inode = root->d_sb->s_root->d_inode;
 
 	if (!sbi)
 		return 0;
diff --git a/fs/btrfs/super.c b/fs/btrfs/super.c
index dc62d3cc68fd..ae488aa1966a 100644
--- a/fs/btrfs/super.c
+++ b/fs/btrfs/super.c
@@ -661,9 +661,9 @@ int btrfs_sync_fs(struct super_block *sb, int wait)
 	return ret;
 }
 
-static int btrfs_show_options(struct seq_file *seq, struct vfsmount *vfs)
+static int btrfs_show_options(struct seq_file *seq, struct dentry *dentry)
 {
-	struct btrfs_root *root = btrfs_sb(vfs->mnt_sb);
+	struct btrfs_root *root = btrfs_sb(dentry->d_sb);
 	struct btrfs_fs_info *info = root->fs_info;
 	char *compress_type;
 
diff --git a/fs/ceph/super.c b/fs/ceph/super.c
index b48f15f101a0..11bd0fc4853f 100644
--- a/fs/ceph/super.c
+++ b/fs/ceph/super.c
@@ -341,11 +341,11 @@ out:
 /**
  * ceph_show_options - Show mount options in /proc/mounts
  * @m: seq_file to write to
- * @mnt: mount descriptor
+ * @root: root of that (sub)tree
  */
-static int ceph_show_options(struct seq_file *m, struct vfsmount *mnt)
+static int ceph_show_options(struct seq_file *m, struct dentry *root)
 {
-	struct ceph_fs_client *fsc = ceph_sb_to_client(mnt->mnt_sb);
+	struct ceph_fs_client *fsc = ceph_sb_to_client(root->d_sb);
 	struct ceph_mount_options *fsopt = fsc->mount_options;
 	struct ceph_options *opt = fsc->client->options;
 
diff --git a/fs/cifs/cifsfs.c b/fs/cifs/cifsfs.c
index 0cb89dc6526c..b1fd382d1952 100644
--- a/fs/cifs/cifsfs.c
+++ b/fs/cifs/cifsfs.c
@@ -343,9 +343,9 @@ cifs_show_security(struct seq_file *s, struct TCP_Server_Info *server)
  * ones are.
  */
 static int
-cifs_show_options(struct seq_file *s, struct vfsmount *m)
+cifs_show_options(struct seq_file *s, struct dentry *root)
 {
-	struct cifs_sb_info *cifs_sb = CIFS_SB(m->mnt_sb);
+	struct cifs_sb_info *cifs_sb = CIFS_SB(root->d_sb);
 	struct cifs_tcon *tcon = cifs_sb_master_tcon(cifs_sb);
 	struct sockaddr *srcaddr;
 	srcaddr = (struct sockaddr *)&tcon->ses->server->srcaddr;
@@ -430,7 +430,7 @@ cifs_show_options(struct seq_file *s, struct vfsmount *m)
 		seq_printf(s, ",cifsacl");
 	if (cifs_sb->mnt_cifs_flags & CIFS_MOUNT_DYNPERM)
 		seq_printf(s, ",dynperm");
-	if (m->mnt_sb->s_flags & MS_POSIXACL)
+	if (root->d_sb->s_flags & MS_POSIXACL)
 		seq_printf(s, ",acl");
 	if (cifs_sb->mnt_cifs_flags & CIFS_MOUNT_MF_SYMLINKS)
 		seq_printf(s, ",mfsymlinks");
diff --git a/fs/devpts/inode.c b/fs/devpts/inode.c
index d5d5297efe97..79673eb71151 100644
--- a/fs/devpts/inode.c
+++ b/fs/devpts/inode.c
@@ -246,9 +246,9 @@ static int devpts_remount(struct super_block *sb, int *flags, char *data)
 	return err;
 }
 
-static int devpts_show_options(struct seq_file *seq, struct vfsmount *vfs)
+static int devpts_show_options(struct seq_file *seq, struct dentry *root)
 {
-	struct pts_fs_info *fsi = DEVPTS_SB(vfs->mnt_sb);
+	struct pts_fs_info *fsi = DEVPTS_SB(root->d_sb);
 	struct pts_mount_opts *opts = &fsi->mount_opts;
 
 	if (opts->setuid)
diff --git a/fs/ecryptfs/super.c b/fs/ecryptfs/super.c
index da485f0b4d1e..9df7fd6e0c39 100644
--- a/fs/ecryptfs/super.c
+++ b/fs/ecryptfs/super.c
@@ -131,9 +131,9 @@ static void ecryptfs_evict_inode(struct inode *inode)
  * Prints the mount options for a given superblock.
  * Returns zero; does not fail.
  */
-static int ecryptfs_show_options(struct seq_file *m, struct vfsmount *mnt)
+static int ecryptfs_show_options(struct seq_file *m, struct dentry *root)
 {
-	struct super_block *sb = mnt->mnt_sb;
+	struct super_block *sb = root->d_sb;
 	struct ecryptfs_mount_crypt_stat *mount_crypt_stat =
 		&ecryptfs_superblock_to_private(sb)->mount_crypt_stat;
 	struct ecryptfs_global_auth_tok *walker;
diff --git a/fs/ext2/super.c b/fs/ext2/super.c
index 67b5e752ec9d..9b403f064ce0 100644
--- a/fs/ext2/super.c
+++ b/fs/ext2/super.c
@@ -210,9 +210,9 @@ static void destroy_inodecache(void)
 	kmem_cache_destroy(ext2_inode_cachep);
 }
 
-static int ext2_show_options(struct seq_file *seq, struct vfsmount *vfs)
+static int ext2_show_options(struct seq_file *seq, struct dentry *root)
 {
-	struct super_block *sb = vfs->mnt_sb;
+	struct super_block *sb = root->d_sb;
 	struct ext2_sb_info *sbi = EXT2_SB(sb);
 	struct ext2_super_block *es = sbi->s_es;
 	unsigned long def_mount_opts;
diff --git a/fs/ext3/super.c b/fs/ext3/super.c
index 7e8944ee67c6..3a10b884e1be 100644
--- a/fs/ext3/super.c
+++ b/fs/ext3/super.c
@@ -610,9 +610,9 @@ static char *data_mode_string(unsigned long mode)
  *  - it's set to a non-default value OR
  *  - if the per-sb default is different from the global default
  */
-static int ext3_show_options(struct seq_file *seq, struct vfsmount *vfs)
+static int ext3_show_options(struct seq_file *seq, struct dentry *root)
 {
-	struct super_block *sb = vfs->mnt_sb;
+	struct super_block *sb = root->d_sb;
 	struct ext3_sb_info *sbi = EXT3_SB(sb);
 	struct ext3_super_block *es = sbi->s_es;
 	unsigned long def_mount_opts;
diff --git a/fs/ext4/super.c b/fs/ext4/super.c
index b739b210a616..6733b3736b3b 100644
--- a/fs/ext4/super.c
+++ b/fs/ext4/super.c
@@ -1032,11 +1032,11 @@ static inline void ext4_show_quota_options(struct seq_file *seq,
  *  - it's set to a non-default value OR
  *  - if the per-sb default is different from the global default
  */
-static int ext4_show_options(struct seq_file *seq, struct vfsmount *vfs)
+static int ext4_show_options(struct seq_file *seq, struct dentry *root)
 {
 	int def_errors;
 	unsigned long def_mount_opts;
-	struct super_block *sb = vfs->mnt_sb;
+	struct super_block *sb = root->d_sb;
 	struct ext4_sb_info *sbi = EXT4_SB(sb);
 	struct ext4_super_block *es = sbi->s_es;
 
diff --git a/fs/fat/inode.c b/fs/fat/inode.c
index ef44e5f98ced..7873797cc76a 100644
--- a/fs/fat/inode.c
+++ b/fs/fat/inode.c
@@ -671,7 +671,7 @@ int fat_sync_inode(struct inode *inode)
 
 EXPORT_SYMBOL_GPL(fat_sync_inode);
 
-static int fat_show_options(struct seq_file *m, struct vfsmount *mnt);
+static int fat_show_options(struct seq_file *m, struct dentry *root);
 static const struct super_operations fat_sops = {
 	.alloc_inode	= fat_alloc_inode,
 	.destroy_inode	= fat_destroy_inode,
@@ -810,9 +810,9 @@ static const struct export_operations fat_export_ops = {
 	.get_parent	= fat_get_parent,
 };
 
-static int fat_show_options(struct seq_file *m, struct vfsmount *mnt)
+static int fat_show_options(struct seq_file *m, struct dentry *root)
 {
-	struct msdos_sb_info *sbi = MSDOS_SB(mnt->mnt_sb);
+	struct msdos_sb_info *sbi = MSDOS_SB(root->d_sb);
 	struct fat_mount_options *opts = &sbi->options;
 	int isvfat = opts->isvfat;
 
diff --git a/fs/fuse/inode.c b/fs/fuse/inode.c
index 3d3622a1ceac..64cf8d07393e 100644
--- a/fs/fuse/inode.c
+++ b/fs/fuse/inode.c
@@ -497,9 +497,10 @@ static int parse_fuse_opt(char *opt, struct fuse_mount_data *d, int is_bdev)
 	return 1;
 }
 
-static int fuse_show_options(struct seq_file *m, struct vfsmount *mnt)
+static int fuse_show_options(struct seq_file *m, struct dentry *root)
 {
-	struct fuse_conn *fc = get_fuse_conn_super(mnt->mnt_sb);
+	struct super_block *sb = root->d_sb;
+	struct fuse_conn *fc = get_fuse_conn_super(sb);
 
 	seq_printf(m, ",user_id=%u", fc->user_id);
 	seq_printf(m, ",group_id=%u", fc->group_id);
@@ -509,9 +510,8 @@ static int fuse_show_options(struct seq_file *m, struct vfsmount *mnt)
 		seq_puts(m, ",allow_other");
 	if (fc->max_read != ~0)
 		seq_printf(m, ",max_read=%u", fc->max_read);
-	if (mnt->mnt_sb->s_bdev &&
-	    mnt->mnt_sb->s_blocksize != FUSE_DEFAULT_BLKSIZE)
-		seq_printf(m, ",blksize=%lu", mnt->mnt_sb->s_blocksize);
+	if (sb->s_bdev && sb->s_blocksize != FUSE_DEFAULT_BLKSIZE)
+		seq_printf(m, ",blksize=%lu", sb->s_blocksize);
 	return 0;
 }
 
diff --git a/fs/gfs2/super.c b/fs/gfs2/super.c
index 9e89d94be003..10c7733a899b 100644
--- a/fs/gfs2/super.c
+++ b/fs/gfs2/super.c
@@ -1284,18 +1284,18 @@ static int is_ancestor(const struct dentry *d1, const struct dentry *d2)
 /**
  * gfs2_show_options - Show mount options for /proc/mounts
  * @s: seq_file structure
- * @mnt: vfsmount
+ * @root: root of this (sub)tree
  *
  * Returns: 0 on success or error code
  */
 
-static int gfs2_show_options(struct seq_file *s, struct vfsmount *mnt)
+static int gfs2_show_options(struct seq_file *s, struct dentry *root)
 {
-	struct gfs2_sbd *sdp = mnt->mnt_sb->s_fs_info;
+	struct gfs2_sbd *sdp = root->d_sb->s_fs_info;
 	struct gfs2_args *args = &sdp->sd_args;
 	int val;
 
-	if (is_ancestor(mnt->mnt_root, sdp->sd_master_dir))
+	if (is_ancestor(root, sdp->sd_master_dir))
 		seq_printf(s, ",meta");
 	if (args->ar_lockproto[0])
 		seq_printf(s, ",lockproto=%s", args->ar_lockproto);
diff --git a/fs/hfs/super.c b/fs/hfs/super.c
index 32dc2fbb26d5..8137fb3e6780 100644
--- a/fs/hfs/super.c
+++ b/fs/hfs/super.c
@@ -133,9 +133,9 @@ static int hfs_remount(struct super_block *sb, int *flags, char *data)
 	return 0;
 }
 
-static int hfs_show_options(struct seq_file *seq, struct vfsmount *mnt)
+static int hfs_show_options(struct seq_file *seq, struct dentry *root)
 {
-	struct hfs_sb_info *sbi = HFS_SB(mnt->mnt_sb);
+	struct hfs_sb_info *sbi = HFS_SB(root->d_sb);
 
 	if (sbi->s_creator != cpu_to_be32(0x3f3f3f3f))
 		seq_printf(seq, ",creator=%.4s", (char *)&sbi->s_creator);
diff --git a/fs/hfsplus/hfsplus_fs.h b/fs/hfsplus/hfsplus_fs.h
index 3a6c025414e2..21a5b7fc6db4 100644
--- a/fs/hfsplus/hfsplus_fs.h
+++ b/fs/hfsplus/hfsplus_fs.h
@@ -419,7 +419,7 @@ ssize_t hfsplus_listxattr(struct dentry *dentry, char *buffer, size_t size);
 int hfsplus_parse_options(char *, struct hfsplus_sb_info *);
 int hfsplus_parse_options_remount(char *input, int *force);
 void hfsplus_fill_defaults(struct hfsplus_sb_info *);
-int hfsplus_show_options(struct seq_file *, struct vfsmount *);
+int hfsplus_show_options(struct seq_file *, struct dentry *);
 
 /* super.c */
 struct inode *hfsplus_iget(struct super_block *, unsigned long);
diff --git a/fs/hfsplus/options.c b/fs/hfsplus/options.c
index bb62a5882147..06fa5618600c 100644
--- a/fs/hfsplus/options.c
+++ b/fs/hfsplus/options.c
@@ -206,9 +206,9 @@ done:
 	return 1;
 }
 
-int hfsplus_show_options(struct seq_file *seq, struct vfsmount *mnt)
+int hfsplus_show_options(struct seq_file *seq, struct dentry *root)
 {
-	struct hfsplus_sb_info *sbi = HFSPLUS_SB(mnt->mnt_sb);
+	struct hfsplus_sb_info *sbi = HFSPLUS_SB(root->d_sb);
 
 	if (sbi->creator != HFSPLUS_DEF_CR_TYPE)
 		seq_printf(seq, ",creator=%.4s", (char *)&sbi->creator);
diff --git a/fs/hostfs/hostfs_kern.c b/fs/hostfs/hostfs_kern.c
index a7340e710a90..e130bd46d671 100644
--- a/fs/hostfs/hostfs_kern.c
+++ b/fs/hostfs/hostfs_kern.c
@@ -258,9 +258,9 @@ static void hostfs_destroy_inode(struct inode *inode)
 	call_rcu(&inode->i_rcu, hostfs_i_callback);
 }
 
-static int hostfs_show_options(struct seq_file *seq, struct vfsmount *vfs)
+static int hostfs_show_options(struct seq_file *seq, struct dentry *root)
 {
-	const char *root_path = vfs->mnt_sb->s_fs_info;
+	const char *root_path = root->d_sb->s_fs_info;
 	size_t offset = strlen(root_ino) + 1;
 
 	if (strlen(root_path) > offset)
diff --git a/fs/jffs2/super.c b/fs/jffs2/super.c
index 804e1292d63e..8be4925296cf 100644
--- a/fs/jffs2/super.c
+++ b/fs/jffs2/super.c
@@ -96,9 +96,9 @@ static const char *jffs2_compr_name(unsigned int compr)
 	}
 }
 
-static int jffs2_show_options(struct seq_file *s, struct vfsmount *mnt)
+static int jffs2_show_options(struct seq_file *s, struct dentry *root)
 {
-	struct jffs2_sb_info *c = JFFS2_SB_INFO(mnt->mnt_sb);
+	struct jffs2_sb_info *c = JFFS2_SB_INFO(root->d_sb);
 	struct jffs2_mount_opts *opts = &c->mount_opts;
 
 	if (opts->override_compr)
diff --git a/fs/jfs/super.c b/fs/jfs/super.c
index 1b8f4ca29adf..682bca642f38 100644
--- a/fs/jfs/super.c
+++ b/fs/jfs/super.c
@@ -608,9 +608,9 @@ static int jfs_sync_fs(struct super_block *sb, int wait)
 	return 0;
 }
 
-static int jfs_show_options(struct seq_file *seq, struct vfsmount *vfs)
+static int jfs_show_options(struct seq_file *seq, struct dentry *root)
 {
-	struct jfs_sb_info *sbi = JFS_SBI(vfs->mnt_sb);
+	struct jfs_sb_info *sbi = JFS_SBI(root->d_sb);
 
 	if (sbi->uid != -1)
 		seq_printf(seq, ",uid=%d", sbi->uid);
diff --git a/fs/namespace.c b/fs/namespace.c
index 773435ca300d..db65e2e4921f 100644
--- a/fs/namespace.c
+++ b/fs/namespace.c
@@ -836,12 +836,12 @@ static inline void mangle(struct seq_file *m, const char *s)
  *
  * See also save_mount_options().
  */
-int generic_show_options(struct seq_file *m, struct vfsmount *mnt)
+int generic_show_options(struct seq_file *m, struct dentry *root)
 {
 	const char *options;
 
 	rcu_read_lock();
-	options = rcu_dereference(mnt->mnt_sb->s_options);
+	options = rcu_dereference(root->d_sb->s_options);
 
 	if (options != NULL && options[0]) {
 		seq_putc(m, ',');
diff --git a/fs/ncpfs/inode.c b/fs/ncpfs/inode.c
index f3f07cd392b3..3d1e34f8a68e 100644
--- a/fs/ncpfs/inode.c
+++ b/fs/ncpfs/inode.c
@@ -44,7 +44,7 @@
 static void ncp_evict_inode(struct inode *);
 static void ncp_put_super(struct super_block *);
 static int  ncp_statfs(struct dentry *, struct kstatfs *);
-static int  ncp_show_options(struct seq_file *, struct vfsmount *);
+static int  ncp_show_options(struct seq_file *, struct dentry *);
 
 static struct kmem_cache * ncp_inode_cachep;
 
@@ -322,9 +322,9 @@ static void ncp_stop_tasks(struct ncp_server *server) {
 		flush_work_sync(&server->timeout_tq);
 }
 
-static int  ncp_show_options(struct seq_file *seq, struct vfsmount *mnt)
+static int  ncp_show_options(struct seq_file *seq, struct dentry *root)
 {
-	struct ncp_server *server = NCP_SBP(mnt->mnt_sb);
+	struct ncp_server *server = NCP_SBP(root->d_sb);
 	unsigned int tmp;
 
 	if (server->m.uid != 0)
diff --git a/fs/nfs/super.c b/fs/nfs/super.c
index 02c693c77ab7..e463967aafb8 100644
--- a/fs/nfs/super.c
+++ b/fs/nfs/super.c
@@ -262,7 +262,7 @@ static match_table_t nfs_local_lock_tokens = {
 
 static void nfs_umount_begin(struct super_block *);
 static int  nfs_statfs(struct dentry *, struct kstatfs *);
-static int  nfs_show_options(struct seq_file *, struct vfsmount *);
+static int  nfs_show_options(struct seq_file *, struct dentry *);
 static int  nfs_show_devname(struct seq_file *, struct dentry *);
 static int  nfs_show_path(struct seq_file *, struct dentry *);
 static int  nfs_show_stats(struct seq_file *, struct dentry *);
@@ -720,9 +720,9 @@ static void nfs_show_mount_options(struct seq_file *m, struct nfs_server *nfss,
 /*
  * Describe the mount options on this VFS mountpoint
  */
-static int nfs_show_options(struct seq_file *m, struct vfsmount *mnt)
+static int nfs_show_options(struct seq_file *m, struct dentry *root)
 {
-	struct nfs_server *nfss = NFS_SB(mnt->mnt_sb);
+	struct nfs_server *nfss = NFS_SB(root->d_sb);
 
 	nfs_show_mount_options(m, nfss, 0);
 
diff --git a/fs/nilfs2/super.c b/fs/nilfs2/super.c
index 5356c7169d50..08e3d4f9df18 100644
--- a/fs/nilfs2/super.c
+++ b/fs/nilfs2/super.c
@@ -648,11 +648,11 @@ static int nilfs_statfs(struct dentry *dentry, struct kstatfs *buf)
 	return 0;
 }
 
-static int nilfs_show_options(struct seq_file *seq, struct vfsmount *vfs)
+static int nilfs_show_options(struct seq_file *seq, struct dentry *dentry)
 {
-	struct super_block *sb = vfs->mnt_sb;
+	struct super_block *sb = dentry->d_sb;
 	struct the_nilfs *nilfs = sb->s_fs_info;
-	struct nilfs_root *root = NILFS_I(vfs->mnt_root->d_inode)->i_root;
+	struct nilfs_root *root = NILFS_I(dentry->d_inode)->i_root;
 
 	if (!nilfs_test_opt(nilfs, BARRIER))
 		seq_puts(seq, ",nobarrier");
diff --git a/fs/ntfs/inode.c b/fs/ntfs/inode.c
index fea40bb6fb68..2eaa66652944 100644
--- a/fs/ntfs/inode.c
+++ b/fs/ntfs/inode.c
@@ -2300,16 +2300,16 @@ void ntfs_evict_big_inode(struct inode *vi)
 /**
  * ntfs_show_options - show mount options in /proc/mounts
  * @sf:		seq_file in which to write our mount options
- * @mnt:	vfs mount whose mount options to display
+ * @root:	root of the mounted tree whose mount options to display
  *
  * Called by the VFS once for each mounted ntfs volume when someone reads
  * /proc/mounts in order to display the NTFS specific mount options of each
- * mount. The mount options of the vfs mount @mnt are written to the seq file
+ * mount. The mount options of fs specified by @root are written to the seq file
  * @sf and success is returned.
  */
-int ntfs_show_options(struct seq_file *sf, struct vfsmount *mnt)
+int ntfs_show_options(struct seq_file *sf, struct dentry *root)
 {
-	ntfs_volume *vol = NTFS_SB(mnt->mnt_sb);
+	ntfs_volume *vol = NTFS_SB(root->d_sb);
 	int i;
 
 	seq_printf(sf, ",uid=%i", vol->uid);
diff --git a/fs/ntfs/inode.h b/fs/ntfs/inode.h
index fe8e7e928889..db29695f845c 100644
--- a/fs/ntfs/inode.h
+++ b/fs/ntfs/inode.h
@@ -298,7 +298,7 @@ extern void ntfs_clear_extent_inode(ntfs_inode *ni);
 
 extern int ntfs_read_inode_mount(struct inode *vi);
 
-extern int ntfs_show_options(struct seq_file *sf, struct vfsmount *mnt);
+extern int ntfs_show_options(struct seq_file *sf, struct dentry *root);
 
 #ifdef NTFS_RW
 
diff --git a/fs/ocfs2/super.c b/fs/ocfs2/super.c
index c05ff25c356c..604e12c4e979 100644
--- a/fs/ocfs2/super.c
+++ b/fs/ocfs2/super.c
@@ -108,7 +108,7 @@ static int ocfs2_parse_options(struct super_block *sb, char *options,
 			       int is_remount);
 static int ocfs2_check_set_options(struct super_block *sb,
 				   struct mount_options *options);
-static int ocfs2_show_options(struct seq_file *s, struct vfsmount *mnt);
+static int ocfs2_show_options(struct seq_file *s, struct dentry *root);
 static void ocfs2_put_super(struct super_block *sb);
 static int ocfs2_mount_volume(struct super_block *sb);
 static int ocfs2_remount(struct super_block *sb, int *flags, char *data);
@@ -1533,9 +1533,9 @@ bail:
 	return status;
 }
 
-static int ocfs2_show_options(struct seq_file *s, struct vfsmount *mnt)
+static int ocfs2_show_options(struct seq_file *s, struct dentry *root)
 {
-	struct ocfs2_super *osb = OCFS2_SB(mnt->mnt_sb);
+	struct ocfs2_super *osb = OCFS2_SB(root->d_sb);
 	unsigned long opts = osb->s_mount_opt;
 	unsigned int local_alloc_megs;
 
@@ -1567,8 +1567,7 @@ static int ocfs2_show_options(struct seq_file *s, struct vfsmount *mnt)
 	if (osb->preferred_slot != OCFS2_INVALID_SLOT)
 		seq_printf(s, ",preferred_slot=%d", osb->preferred_slot);
 
-	if (!(mnt->mnt_flags & MNT_NOATIME) && !(mnt->mnt_flags & MNT_RELATIME))
-		seq_printf(s, ",atime_quantum=%u", osb->s_atime_quantum);
+	seq_printf(s, ",atime_quantum=%u", osb->s_atime_quantum);
 
 	if (osb->osb_commit_interval)
 		seq_printf(s, ",commit=%u",
diff --git a/fs/proc_namespace.c b/fs/proc_namespace.c
index 8f8304b3f98a..12412852d88a 100644
--- a/fs/proc_namespace.c
+++ b/fs/proc_namespace.c
@@ -113,7 +113,7 @@ static int show_vfsmnt(struct seq_file *m, struct vfsmount *mnt)
 		goto out;
 	show_mnt_opts(m, mnt);
 	if (sb->s_op->show_options)
-		err = sb->s_op->show_options(m, mnt);
+		err = sb->s_op->show_options(m, mnt_path.dentry);
 	seq_puts(m, " 0 0\n");
 out:
 	return err;
@@ -174,7 +174,7 @@ static int show_mountinfo(struct seq_file *m, struct vfsmount *mnt)
 	if (err)
 		goto out;
 	if (sb->s_op->show_options)
-		err = sb->s_op->show_options(m, mnt);
+		err = sb->s_op->show_options(m, mnt->mnt_root);
 	seq_putc(m, '\n');
 out:
 	return err;
diff --git a/fs/ubifs/super.c b/fs/ubifs/super.c
index d93a3fadf53c..63765d58445b 100644
--- a/fs/ubifs/super.c
+++ b/fs/ubifs/super.c
@@ -419,9 +419,9 @@ static int ubifs_statfs(struct dentry *dentry, struct kstatfs *buf)
 	return 0;
 }
 
-static int ubifs_show_options(struct seq_file *s, struct vfsmount *mnt)
+static int ubifs_show_options(struct seq_file *s, struct dentry *root)
 {
-	struct ubifs_info *c = mnt->mnt_sb->s_fs_info;
+	struct ubifs_info *c = root->d_sb->s_fs_info;
 
 	if (c->mount_opts.unmount_mode == 2)
 		seq_printf(s, ",fast_unmount");
diff --git a/fs/udf/super.c b/fs/udf/super.c
index c94fc889a486..0c33225647a0 100644
--- a/fs/udf/super.c
+++ b/fs/udf/super.c
@@ -89,7 +89,7 @@ static void udf_open_lvid(struct super_block *);
 static void udf_close_lvid(struct super_block *);
 static unsigned int udf_count_free(struct super_block *);
 static int udf_statfs(struct dentry *, struct kstatfs *);
-static int udf_show_options(struct seq_file *, struct vfsmount *);
+static int udf_show_options(struct seq_file *, struct dentry *);
 
 struct logicalVolIntegrityDescImpUse *udf_sb_lvidiu(struct udf_sb_info *sbi)
 {
@@ -249,9 +249,9 @@ static int udf_sb_alloc_partition_maps(struct super_block *sb, u32 count)
 	return 0;
 }
 
-static int udf_show_options(struct seq_file *seq, struct vfsmount *mnt)
+static int udf_show_options(struct seq_file *seq, struct dentry *root)
 {
-	struct super_block *sb = mnt->mnt_sb;
+	struct super_block *sb = root->d_sb;
 	struct udf_sb_info *sbi = UDF_SB(sb);
 
 	if (!UDF_QUERY_FLAG(sb, UDF_FLAG_STRICT))
diff --git a/fs/ufs/super.c b/fs/ufs/super.c
index d6961eb5b774..5246ee3e5607 100644
--- a/fs/ufs/super.c
+++ b/fs/ufs/super.c
@@ -1351,9 +1351,9 @@ static int ufs_remount (struct super_block *sb, int *mount_flags, char *data)
 	return 0;
 }
 
-static int ufs_show_options(struct seq_file *seq, struct vfsmount *vfs)
+static int ufs_show_options(struct seq_file *seq, struct dentry *root)
 {
-	struct ufs_sb_info *sbi = UFS_SB(vfs->mnt_sb);
+	struct ufs_sb_info *sbi = UFS_SB(root->d_sb);
 	unsigned mval = sbi->s_mount_opt & UFS_MOUNT_UFSTYPE;
 	const struct match_token *tp = tokens;
 
diff --git a/fs/xfs/xfs_super.c b/fs/xfs/xfs_super.c
index 8a899496fd5f..7b7669507ee3 100644
--- a/fs/xfs/xfs_super.c
+++ b/fs/xfs/xfs_super.c
@@ -1238,9 +1238,9 @@ xfs_fs_unfreeze(
 STATIC int
 xfs_fs_show_options(
 	struct seq_file		*m,
-	struct vfsmount		*mnt)
+	struct dentry		*root)
 {
-	return -xfs_showargs(XFS_M(mnt->mnt_sb), m);
+	return -xfs_showargs(XFS_M(root->d_sb), m);
 }
 
 /*
diff --git a/include/linux/fs.h b/include/linux/fs.h
index 13721b073407..cc1021fd19ef 100644
--- a/include/linux/fs.h
+++ b/include/linux/fs.h
@@ -1672,7 +1672,7 @@ struct super_operations {
 	int (*remount_fs) (struct super_block *, int *, char *);
 	void (*umount_begin) (struct super_block *);
 
-	int (*show_options)(struct seq_file *, struct vfsmount *);
+	int (*show_options)(struct seq_file *, struct dentry *);
 	int (*show_devname)(struct seq_file *, struct dentry *);
 	int (*show_path)(struct seq_file *, struct dentry *);
 	int (*show_stats)(struct seq_file *, struct dentry *);
@@ -2592,7 +2592,7 @@ extern void setattr_copy(struct inode *inode, const struct iattr *attr);
 
 extern void file_update_time(struct file *file);
 
-extern int generic_show_options(struct seq_file *m, struct vfsmount *mnt);
+extern int generic_show_options(struct seq_file *m, struct dentry *root);
 extern void save_mount_options(struct super_block *sb, char *options);
 extern void replace_mount_options(struct super_block *sb, char *options);
 
diff --git a/kernel/cgroup.c b/kernel/cgroup.c
index 86ebacfd9431..7cab65f83f1d 100644
--- a/kernel/cgroup.c
+++ b/kernel/cgroup.c
@@ -1038,9 +1038,9 @@ static int rebind_subsystems(struct cgroupfs_root *root,
 	return 0;
 }
 
-static int cgroup_show_options(struct seq_file *seq, struct vfsmount *vfs)
+static int cgroup_show_options(struct seq_file *seq, struct dentry *dentry)
 {
-	struct cgroupfs_root *root = vfs->mnt_sb->s_fs_info;
+	struct cgroupfs_root *root = dentry->d_sb->s_fs_info;
 	struct cgroup_subsys *ss;
 
 	mutex_lock(&cgroup_mutex);
diff --git a/mm/shmem.c b/mm/shmem.c
index 86a19efc36fb..feead1943d92 100644
--- a/mm/shmem.c
+++ b/mm/shmem.c
@@ -2118,9 +2118,9 @@ out:
 	return error;
 }
 
-static int shmem_show_options(struct seq_file *seq, struct vfsmount *vfs)
+static int shmem_show_options(struct seq_file *seq, struct dentry *root)
 {
-	struct shmem_sb_info *sbinfo = SHMEM_SB(vfs->mnt_sb);
+	struct shmem_sb_info *sbinfo = SHMEM_SB(root->d_sb);
 
 	if (sbinfo->max_blocks != shmem_default_max_blocks())
 		seq_printf(seq, ",size=%luk",
-- 
cgit v1.2.3-71-gd317


From a0e86bd4252519321b0d102dc4ed90557aa7bee9 Mon Sep 17 00:00:00 2001
From: Jesper Juhl <jj@chaosbits.net>
Date: Sun, 8 Jan 2012 22:44:29 +0100
Subject: audit: always follow va_copy() with va_end()

A call to va_copy() should always be followed by a call to va_end() in
the same function.  In kernel/autit.c::audit_log_vformat() this is not
always done.  This patch makes sure va_end() is always called.

Signed-off-by: Jesper Juhl <jj@chaosbits.net>
Cc: Al Viro <viro@zeniv.linux.org.uk>
Cc: Eric Paris <eparis@redhat.com>
Cc: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 kernel/audit.c | 5 +++--
 1 file changed, 3 insertions(+), 2 deletions(-)

(limited to 'kernel')

diff --git a/kernel/audit.c b/kernel/audit.c
index 09fae2677a45..2c1d6ab7106e 100644
--- a/kernel/audit.c
+++ b/kernel/audit.c
@@ -1260,12 +1260,13 @@ static void audit_log_vformat(struct audit_buffer *ab, const char *fmt,
 		avail = audit_expand(ab,
 			max_t(unsigned, AUDIT_BUFSIZ, 1+len-avail));
 		if (!avail)
-			goto out;
+			goto out_va_end;
 		len = vsnprintf(skb_tail_pointer(skb), avail, fmt, args2);
 	}
-	va_end(args2);
 	if (len > 0)
 		skb_put(skb, len);
+out_va_end:
+	va_end(args2);
 out:
 	return;
 }
-- 
cgit v1.2.3-71-gd317


From 9b9fb610f6800e0db46cccd8618dd7e609c9bb5a Mon Sep 17 00:00:00 2001
From: Hiroshi Shimamoto <h-shimamoto@ct.jp.nec.com>
Date: Tue, 10 Jan 2012 09:24:05 +0900
Subject: sched: Remove empty #ifdefs

Signed-off-by: Hiroshi Shimamoto <h-shimamoto@ct.jp.nec.com>
Cc: Peter Zijlstra <a.p.zijlstra@chello.nl>
Link: http://lkml.kernel.org/r/4F0B8525.8070901@ct.jp.nec.com
Signed-off-by: Ingo Molnar <mingo@elte.hu>
---
 kernel/sched/core.c | 7 -------
 1 file changed, 7 deletions(-)

(limited to 'kernel')

diff --git a/kernel/sched/core.c b/kernel/sched/core.c
index 4dbfd04a2148..457c881873cb 100644
--- a/kernel/sched/core.c
+++ b/kernel/sched/core.c
@@ -7136,10 +7136,6 @@ void set_curr_task(int cpu, struct task_struct *p)
 
 #endif
 
-#ifdef CONFIG_RT_GROUP_SCHED
-#else /* !CONFIG_RT_GROUP_SCHED */
-#endif /* CONFIG_RT_GROUP_SCHED */
-
 #ifdef CONFIG_CGROUP_SCHED
 /* task_group_lock serializes the addition/removal of task groups */
 static DEFINE_SPINLOCK(task_group_lock);
@@ -7248,9 +7244,6 @@ void sched_move_task(struct task_struct *tsk)
 }
 #endif /* CONFIG_CGROUP_SCHED */
 
-#ifdef CONFIG_FAIR_GROUP_SCHED
-#endif
-
 #if defined(CONFIG_RT_GROUP_SCHED) || defined(CONFIG_CFS_BANDWIDTH)
 static unsigned long to_ratio(u64 period, u64 runtime)
 {
-- 
cgit v1.2.3-71-gd317


From c6968e73b90c2a2fb9a32d4bad249f8f70f70125 Mon Sep 17 00:00:00 2001
From: Stanislaw Gruszka <sgruszka@redhat.com>
Date: Tue, 10 Jan 2012 15:07:31 -0800
Subject: PM/Hibernate: do not count debug pages as savable

When debugging with CONFIG_DEBUG_PAGEALLOC and debug_guardpage_minorder >
0, we have lot of free pages that are not marked so.  Snapshot code
account them as savable, what cause hibernate memory preallocation
failure.

It is pretty hard to make hibernate allocation succeed with
debug_guardpage_minorder=1.  This change at least make it possible when
system has relatively big amount of RAM.

Signed-off-by: Stanislaw Gruszka <sgruszka@redhat.com>
Acked-by: Rafael J. Wysocki <rjw@sisk.pl>
Cc: Andrea Arcangeli <aarcange@redhat.com>
Cc: Christoph Lameter <cl@linux-foundation.org>
Cc: Mel Gorman <mgorman@suse.de>
Cc: Pekka Enberg <penberg@cs.helsinki.fi>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 kernel/power/snapshot.c | 6 ++++++
 1 file changed, 6 insertions(+)

(limited to 'kernel')

diff --git a/kernel/power/snapshot.c b/kernel/power/snapshot.c
index cbe2c1441392..1cf88900ec4f 100644
--- a/kernel/power/snapshot.c
+++ b/kernel/power/snapshot.c
@@ -858,6 +858,9 @@ static struct page *saveable_highmem_page(struct zone *zone, unsigned long pfn)
 	    PageReserved(page))
 		return NULL;
 
+	if (page_is_guard(page))
+		return NULL;
+
 	return page;
 }
 
@@ -920,6 +923,9 @@ static struct page *saveable_page(struct zone *zone, unsigned long pfn)
 	    && (!kernel_page_present(page) || pfn_is_nosave(pfn)))
 		return NULL;
 
+	if (page_is_guard(page))
+		return NULL;
+
 	return page;
 }
 
-- 
cgit v1.2.3-71-gd317


From 43d2b113241d6797b890318767e0af78e313414b Mon Sep 17 00:00:00 2001
From: KAMEZAWA Hiroyuki <kamezawa.hiroyu@jp.fujitsu.com>
Date: Tue, 10 Jan 2012 15:08:09 -0800
Subject: tracepoint: add tracepoints for debugging oom_score_adj

oom_score_adj is used for guarding processes from OOM-Killer.  One of
problem is that it's inherited at fork().  When a daemon set oom_score_adj
and make children, it's hard to know where the value is set.

This patch adds some tracepoints useful for debugging. This patch adds
3 trace points.
  - creating new task
  - renaming a task (exec)
  - set oom_score_adj

To debug, users need to enable some trace pointer. Maybe filtering is useful as

# EVENT=/sys/kernel/debug/tracing/events/task/
# echo "oom_score_adj != 0" > $EVENT/task_newtask/filter
# echo "oom_score_adj != 0" > $EVENT/task_rename/filter
# echo 1 > $EVENT/enable
# EVENT=/sys/kernel/debug/tracing/events/oom/
# echo 1 > $EVENT/enable

output will be like this.
# grep oom /sys/kernel/debug/tracing/trace
bash-7699  [007] d..3  5140.744510: oom_score_adj_update: pid=7699 comm=bash oom_score_adj=-1000
bash-7699  [007] ...1  5151.818022: task_newtask: pid=7729 comm=bash clone_flags=1200011 oom_score_adj=-1000
ls-7729  [003] ...2  5151.818504: task_rename: pid=7729 oldcomm=bash newcomm=ls oom_score_adj=-1000
bash-7699  [002] ...1  5175.701468: task_newtask: pid=7730 comm=bash clone_flags=1200011 oom_score_adj=-1000
grep-7730  [007] ...2  5175.701993: task_rename: pid=7730 oldcomm=bash newcomm=grep oom_score_adj=-1000

Signed-off-by: KAMEZAWA Hiroyuki <kamezawa.hiroyu@jp.fujitsu.com>
Cc: KOSAKI Motohiro <kosaki.motohiro@jp.fujitsu.com>
Acked-by: David Rientjes <rientjes@google.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 fs/exec.c                   |  4 +++
 fs/proc/base.c              |  3 +++
 include/trace/events/oom.h  | 33 ++++++++++++++++++++++++
 include/trace/events/task.h | 61 +++++++++++++++++++++++++++++++++++++++++++++
 kernel/fork.c               |  6 +++++
 mm/oom_kill.c               |  6 +++++
 6 files changed, 113 insertions(+)
 create mode 100644 include/trace/events/oom.h
 create mode 100644 include/trace/events/task.h

(limited to 'kernel')

diff --git a/fs/exec.c b/fs/exec.c
index 3f64b9f26e7d..aeb135c7ff5c 100644
--- a/fs/exec.c
+++ b/fs/exec.c
@@ -59,6 +59,8 @@
 #include <asm/uaccess.h>
 #include <asm/mmu_context.h>
 #include <asm/tlb.h>
+
+#include <trace/events/task.h>
 #include "internal.h"
 
 int core_uses_pid;
@@ -1054,6 +1056,8 @@ void set_task_comm(struct task_struct *tsk, char *buf)
 {
 	task_lock(tsk);
 
+	trace_task_rename(tsk, buf);
+
 	/*
 	 * Threads may access current->comm without holding
 	 * the task lock, so write the string carefully.
diff --git a/fs/proc/base.c b/fs/proc/base.c
index a1dddda999f2..1aab5fe05a1b 100644
--- a/fs/proc/base.c
+++ b/fs/proc/base.c
@@ -86,6 +86,7 @@
 #ifdef CONFIG_HARDWALL
 #include <asm/hardwall.h>
 #endif
+#include <trace/events/oom.h>
 #include "internal.h"
 
 /* NOTE:
@@ -1010,6 +1011,7 @@ static ssize_t oom_adjust_write(struct file *file, const char __user *buf,
 	else
 		task->signal->oom_score_adj = (oom_adjust * OOM_SCORE_ADJ_MAX) /
 								-OOM_DISABLE;
+	trace_oom_score_adj_update(task);
 err_sighand:
 	unlock_task_sighand(task, &flags);
 err_task_lock:
@@ -1097,6 +1099,7 @@ static ssize_t oom_score_adj_write(struct file *file, const char __user *buf,
 	task->signal->oom_score_adj = oom_score_adj;
 	if (has_capability_noaudit(current, CAP_SYS_RESOURCE))
 		task->signal->oom_score_adj_min = oom_score_adj;
+	trace_oom_score_adj_update(task);
 	/*
 	 * Scale /proc/pid/oom_adj appropriately ensuring that OOM_DISABLE is
 	 * always attainable.
diff --git a/include/trace/events/oom.h b/include/trace/events/oom.h
new file mode 100644
index 000000000000..dd4ba3b92002
--- /dev/null
+++ b/include/trace/events/oom.h
@@ -0,0 +1,33 @@
+#undef TRACE_SYSTEM
+#define TRACE_SYSTEM oom
+
+#if !defined(_TRACE_OOM_H) || defined(TRACE_HEADER_MULTI_READ)
+#define _TRACE_OOM_H
+#include <linux/tracepoint.h>
+
+TRACE_EVENT(oom_score_adj_update,
+
+	TP_PROTO(struct task_struct *task),
+
+	TP_ARGS(task),
+
+	TP_STRUCT__entry(
+		__field(	pid_t,	pid)
+		__array(	char,	comm,	TASK_COMM_LEN )
+		__field(	 int,	oom_score_adj)
+	),
+
+	TP_fast_assign(
+		__entry->pid = task->pid;
+		memcpy(__entry->comm, task->comm, TASK_COMM_LEN);
+		__entry->oom_score_adj = task->signal->oom_score_adj;
+	),
+
+	TP_printk("pid=%d comm=%s oom_score_adj=%d",
+		__entry->pid, __entry->comm, __entry->oom_score_adj)
+);
+
+#endif
+
+/* This part must be outside protection */
+#include <trace/define_trace.h>
diff --git a/include/trace/events/task.h b/include/trace/events/task.h
new file mode 100644
index 000000000000..b53add02e929
--- /dev/null
+++ b/include/trace/events/task.h
@@ -0,0 +1,61 @@
+#undef TRACE_SYSTEM
+#define TRACE_SYSTEM task
+
+#if !defined(_TRACE_TASK_H) || defined(TRACE_HEADER_MULTI_READ)
+#define _TRACE_TASK_H
+#include <linux/tracepoint.h>
+
+TRACE_EVENT(task_newtask,
+
+	TP_PROTO(struct task_struct *task, unsigned long clone_flags),
+
+	TP_ARGS(task, clone_flags),
+
+	TP_STRUCT__entry(
+		__field(	pid_t,	pid)
+		__array(	char,	comm, TASK_COMM_LEN)
+		__field( unsigned long, clone_flags)
+		__field(	int,    oom_score_adj)
+	),
+
+	TP_fast_assign(
+		__entry->pid = task->pid;
+		memcpy(__entry->comm, task->comm, TASK_COMM_LEN);
+		__entry->clone_flags = clone_flags;
+		__entry->oom_score_adj = task->signal->oom_score_adj;
+	),
+
+	TP_printk("pid=%d comm=%s clone_flags=%lx oom_score_adj=%d",
+		__entry->pid, __entry->comm,
+		__entry->clone_flags, __entry->oom_score_adj)
+);
+
+TRACE_EVENT(task_rename,
+
+	TP_PROTO(struct task_struct *task, char *comm),
+
+	TP_ARGS(task, comm),
+
+	TP_STRUCT__entry(
+		__field(	pid_t,	pid)
+		__array(	char, oldcomm,  TASK_COMM_LEN)
+		__array(	char, newcomm,  TASK_COMM_LEN)
+		__field(	int, oom_score_adj)
+	),
+
+	TP_fast_assign(
+		__entry->pid = task->pid;
+		memcpy(entry->oldcomm, task->comm, TASK_COMM_LEN);
+		memcpy(entry->newcomm, comm, TASK_COMM_LEN);
+		__entry->oom_score_adj = task->signal->oom_score_adj;
+	),
+
+	TP_printk("pid=%d oldcomm=%s newcomm=%s oom_score_adj=%d",
+		__entry->pid, __entry->oldcomm,
+		__entry->newcomm, __entry->oom_score_adj)
+);
+
+#endif
+
+/* This part must be outside protection */
+#include <trace/define_trace.h>
diff --git a/kernel/fork.c b/kernel/fork.c
index b00711ce7c13..5e1391b5ade0 100644
--- a/kernel/fork.c
+++ b/kernel/fork.c
@@ -76,6 +76,9 @@
 
 #include <trace/events/sched.h>
 
+#define CREATE_TRACE_POINTS
+#include <trace/events/task.h>
+
 /*
  * Protected counters by write_lock_irq(&tasklist_lock)
  */
@@ -1370,6 +1373,9 @@ static struct task_struct *copy_process(unsigned long clone_flags,
 	if (clone_flags & CLONE_THREAD)
 		threadgroup_change_end(current);
 	perf_event_fork(p);
+
+	trace_task_newtask(p, clone_flags);
+
 	return p;
 
 bad_fork_free_pid:
diff --git a/mm/oom_kill.c b/mm/oom_kill.c
index eeb27e27dce3..7c122faa05c5 100644
--- a/mm/oom_kill.c
+++ b/mm/oom_kill.c
@@ -33,6 +33,10 @@
 #include <linux/security.h>
 #include <linux/ptrace.h>
 #include <linux/freezer.h>
+#include <linux/ftrace.h>
+
+#define CREATE_TRACE_POINTS
+#include <trace/events/oom.h>
 
 int sysctl_panic_on_oom;
 int sysctl_oom_kill_allocating_task;
@@ -55,6 +59,7 @@ void compare_swap_oom_score_adj(int old_val, int new_val)
 	spin_lock_irq(&sighand->siglock);
 	if (current->signal->oom_score_adj == old_val)
 		current->signal->oom_score_adj = new_val;
+	trace_oom_score_adj_update(current);
 	spin_unlock_irq(&sighand->siglock);
 }
 
@@ -74,6 +79,7 @@ int test_set_oom_score_adj(int new_val)
 	spin_lock_irq(&sighand->siglock);
 	old_val = current->signal->oom_score_adj;
 	current->signal->oom_score_adj = new_val;
+	trace_oom_score_adj_update(current);
 	spin_unlock_irq(&sighand->siglock);
 
 	return old_val;
-- 
cgit v1.2.3-71-gd317


From 5e6292c0f28f03dfdb8ea3d685f0b838a23bfba4 Mon Sep 17 00:00:00 2001
From: Matt Fleming <matt.fleming@intel.com>
Date: Tue, 10 Jan 2012 15:11:17 -0800
Subject: signal: add block_sigmask() for adding sigmask to current->blocked

Abstract the code sequence for adding a signal handler's sa_mask to
current->blocked because the sequence is identical for all architectures.
Furthermore, in the past some architectures actually got this code wrong,
so introduce a wrapper that all architectures can use.

Signed-off-by: Matt Fleming <matt.fleming@intel.com>
Signed-off-by: Oleg Nesterov <oleg@redhat.com>
Cc: Thomas Gleixner <tglx@linutronix.de>
Cc: Ingo Molnar <mingo@elte.hu>
Cc: H. Peter Anvin <hpa@zytor.com>
Cc: Tejun Heo <tj@kernel.org>
Cc: "David S. Miller" <davem@davemloft.net>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 arch/x86/kernel/signal.c |  6 +-----
 include/linux/signal.h   |  1 +
 kernel/signal.c          | 21 +++++++++++++++++++++
 3 files changed, 23 insertions(+), 5 deletions(-)

(limited to 'kernel')

diff --git a/arch/x86/kernel/signal.c b/arch/x86/kernel/signal.c
index 54ddaeb221c1..46a01bdc27e2 100644
--- a/arch/x86/kernel/signal.c
+++ b/arch/x86/kernel/signal.c
@@ -682,7 +682,6 @@ static int
 handle_signal(unsigned long sig, siginfo_t *info, struct k_sigaction *ka,
 		struct pt_regs *regs)
 {
-	sigset_t blocked;
 	int ret;
 
 	/* Are we from a system call? */
@@ -733,10 +732,7 @@ handle_signal(unsigned long sig, siginfo_t *info, struct k_sigaction *ka,
 	 */
 	regs->flags &= ~X86_EFLAGS_TF;
 
-	sigorsets(&blocked, &current->blocked, &ka->sa.sa_mask);
-	if (!(ka->sa.sa_flags & SA_NODEFER))
-		sigaddset(&blocked, sig);
-	set_current_blocked(&blocked);
+	block_sigmask(ka, sig);
 
 	tracehook_signal_handler(sig, info, ka, regs,
 				 test_thread_flag(TIF_SINGLESTEP));
diff --git a/include/linux/signal.h b/include/linux/signal.h
index a822300a253b..7987ce74874b 100644
--- a/include/linux/signal.h
+++ b/include/linux/signal.h
@@ -254,6 +254,7 @@ extern void set_current_blocked(const sigset_t *);
 extern int show_unhandled_signals;
 
 extern int get_signal_to_deliver(siginfo_t *info, struct k_sigaction *return_ka, struct pt_regs *regs, void *cookie);
+extern void block_sigmask(struct k_sigaction *ka, int signr);
 extern void exit_signals(struct task_struct *tsk);
 
 extern struct kmem_cache *sighand_cachep;
diff --git a/kernel/signal.c b/kernel/signal.c
index bb0efa5705ed..d532f1709fbf 100644
--- a/kernel/signal.c
+++ b/kernel/signal.c
@@ -2318,6 +2318,27 @@ relock:
 	return signr;
 }
 
+/**
+ * block_sigmask - add @ka's signal mask to current->blocked
+ * @ka: action for @signr
+ * @signr: signal that has been successfully delivered
+ *
+ * This function should be called when a signal has succesfully been
+ * delivered. It adds the mask of signals for @ka to current->blocked
+ * so that they are blocked during the execution of the signal
+ * handler. In addition, @signr will be blocked unless %SA_NODEFER is
+ * set in @ka->sa.sa_flags.
+ */
+void block_sigmask(struct k_sigaction *ka, int signr)
+{
+	sigset_t blocked;
+
+	sigorsets(&blocked, &current->blocked, &ka->sa.sa_mask);
+	if (!(ka->sa.sa_flags & SA_NODEFER))
+		sigaddset(&blocked, signr);
+	set_current_blocked(&blocked);
+}
+
 /*
  * It could be that complete_signal() picked us to notify about the
  * group-wide signal. Other threads should be notified now to take
-- 
cgit v1.2.3-71-gd317


From b196be89cdc14a88cc637cdad845a75c5886c82d Mon Sep 17 00:00:00 2001
From: Tejun Heo <tj@kernel.org>
Date: Tue, 10 Jan 2012 15:11:35 -0800
Subject: workqueue: make alloc_workqueue() take printf fmt and args for name

alloc_workqueue() currently expects the passed in @name pointer to remain
accessible.  This is inconvenient and a bit silly given that the whole wq
is being dynamically allocated.  This patch updates alloc_workqueue() and
friends to take printf format string instead of opaque string and matching
varargs at the end.  The name is allocated together with the wq and
formatted.

alloc_ordered_workqueue() is converted to a macro to unify varargs
handling with alloc_workqueue(), and, while at it, add comment to
alloc_workqueue().

None of the current in-kernel users pass in string with '%' as constant
name and this change shouldn't cause any problem.

[akpm@linux-foundation.org: use __printf]
Signed-off-by: Tejun Heo <tj@kernel.org>
Suggested-by: Christoph Hellwig <hch@infradead.org>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 include/linux/workqueue.h | 47 +++++++++++++++++++++++++++++++----------------
 kernel/workqueue.c        | 32 ++++++++++++++++++++++----------
 2 files changed, 53 insertions(+), 26 deletions(-)

(limited to 'kernel')

diff --git a/include/linux/workqueue.h b/include/linux/workqueue.h
index 0d556deb497b..eb8b9f15f2e0 100644
--- a/include/linux/workqueue.h
+++ b/include/linux/workqueue.h
@@ -297,32 +297,50 @@ extern struct workqueue_struct *system_unbound_wq;
 extern struct workqueue_struct *system_freezable_wq;
 
 extern struct workqueue_struct *
-__alloc_workqueue_key(const char *name, unsigned int flags, int max_active,
-		      struct lock_class_key *key, const char *lock_name);
+__alloc_workqueue_key(const char *fmt, unsigned int flags, int max_active,
+	struct lock_class_key *key, const char *lock_name, ...) __printf(1, 6);
 
+/**
+ * alloc_workqueue - allocate a workqueue
+ * @fmt: printf format for the name of the workqueue
+ * @flags: WQ_* flags
+ * @max_active: max in-flight work items, 0 for default
+ * @args: args for @fmt
+ *
+ * Allocate a workqueue with the specified parameters.  For detailed
+ * information on WQ_* flags, please refer to Documentation/workqueue.txt.
+ *
+ * The __lock_name macro dance is to guarantee that single lock_class_key
+ * doesn't end up with different namesm, which isn't allowed by lockdep.
+ *
+ * RETURNS:
+ * Pointer to the allocated workqueue on success, %NULL on failure.
+ */
 #ifdef CONFIG_LOCKDEP
-#define alloc_workqueue(name, flags, max_active)		\
+#define alloc_workqueue(fmt, flags, max_active, args...)	\
 ({								\
 	static struct lock_class_key __key;			\
 	const char *__lock_name;				\
 								\
-	if (__builtin_constant_p(name))				\
-		__lock_name = (name);				\
+	if (__builtin_constant_p(fmt))				\
+		__lock_name = (fmt);				\
 	else							\
-		__lock_name = #name;				\
+		__lock_name = #fmt;				\
 								\
-	__alloc_workqueue_key((name), (flags), (max_active),	\
-			      &__key, __lock_name);		\
+	__alloc_workqueue_key((fmt), (flags), (max_active),	\
+			      &__key, __lock_name, ##args);	\
 })
 #else
-#define alloc_workqueue(name, flags, max_active)		\
-	__alloc_workqueue_key((name), (flags), (max_active), NULL, NULL)
+#define alloc_workqueue(fmt, flags, max_active, args...)	\
+	__alloc_workqueue_key((fmt), (flags), (max_active),	\
+			      NULL, NULL, ##args)
 #endif
 
 /**
  * alloc_ordered_workqueue - allocate an ordered workqueue
- * @name: name of the workqueue
+ * @fmt: printf format for the name of the workqueue
  * @flags: WQ_* flags (only WQ_FREEZABLE and WQ_MEM_RECLAIM are meaningful)
+ * @args: args for @fmt
  *
  * Allocate an ordered workqueue.  An ordered workqueue executes at
  * most one work item at any given time in the queued order.  They are
@@ -331,11 +349,8 @@ __alloc_workqueue_key(const char *name, unsigned int flags, int max_active,
  * RETURNS:
  * Pointer to the allocated workqueue on success, %NULL on failure.
  */
-static inline struct workqueue_struct *
-alloc_ordered_workqueue(const char *name, unsigned int flags)
-{
-	return alloc_workqueue(name, WQ_UNBOUND | flags, 1);
-}
+#define alloc_ordered_workqueue(fmt, flags, args...)		\
+	alloc_workqueue(fmt, WQ_UNBOUND | (flags), 1, ##args)
 
 #define create_workqueue(name)					\
 	alloc_workqueue((name), WQ_MEM_RECLAIM, 1)
diff --git a/kernel/workqueue.c b/kernel/workqueue.c
index 42fa9ad0a810..bec7b5b53e03 100644
--- a/kernel/workqueue.c
+++ b/kernel/workqueue.c
@@ -242,10 +242,10 @@ struct workqueue_struct {
 
 	int			nr_drainers;	/* W: drain in progress */
 	int			saved_max_active; /* W: saved cwq max_active */
-	const char		*name;		/* I: workqueue name */
 #ifdef CONFIG_LOCKDEP
 	struct lockdep_map	lockdep_map;
 #endif
+	char			name[];		/* I: workqueue name */
 };
 
 struct workqueue_struct *system_wq __read_mostly;
@@ -2954,14 +2954,29 @@ static int wq_clamp_max_active(int max_active, unsigned int flags,
 	return clamp_val(max_active, 1, lim);
 }
 
-struct workqueue_struct *__alloc_workqueue_key(const char *name,
+struct workqueue_struct *__alloc_workqueue_key(const char *fmt,
 					       unsigned int flags,
 					       int max_active,
 					       struct lock_class_key *key,
-					       const char *lock_name)
+					       const char *lock_name, ...)
 {
+	va_list args, args1;
 	struct workqueue_struct *wq;
 	unsigned int cpu;
+	size_t namelen;
+
+	/* determine namelen, allocate wq and format name */
+	va_start(args, lock_name);
+	va_copy(args1, args);
+	namelen = vsnprintf(NULL, 0, fmt, args) + 1;
+
+	wq = kzalloc(sizeof(*wq) + namelen, GFP_KERNEL);
+	if (!wq)
+		goto err;
+
+	vsnprintf(wq->name, namelen, fmt, args1);
+	va_end(args);
+	va_end(args1);
 
 	/*
 	 * Workqueues which may be used during memory reclaim should
@@ -2978,12 +2993,9 @@ struct workqueue_struct *__alloc_workqueue_key(const char *name,
 		flags |= WQ_HIGHPRI;
 
 	max_active = max_active ?: WQ_DFL_ACTIVE;
-	max_active = wq_clamp_max_active(max_active, flags, name);
-
-	wq = kzalloc(sizeof(*wq), GFP_KERNEL);
-	if (!wq)
-		goto err;
+	max_active = wq_clamp_max_active(max_active, flags, wq->name);
 
+	/* init wq */
 	wq->flags = flags;
 	wq->saved_max_active = max_active;
 	mutex_init(&wq->flush_mutex);
@@ -2991,7 +3003,6 @@ struct workqueue_struct *__alloc_workqueue_key(const char *name,
 	INIT_LIST_HEAD(&wq->flusher_queue);
 	INIT_LIST_HEAD(&wq->flusher_overflow);
 
-	wq->name = name;
 	lockdep_init_map(&wq->lockdep_map, lock_name, key, 0);
 	INIT_LIST_HEAD(&wq->list);
 
@@ -3020,7 +3031,8 @@ struct workqueue_struct *__alloc_workqueue_key(const char *name,
 		if (!rescuer)
 			goto err;
 
-		rescuer->task = kthread_create(rescuer_thread, wq, "%s", name);
+		rescuer->task = kthread_create(rescuer_thread, wq, "%s",
+					       wq->name);
 		if (IS_ERR(rescuer->task))
 			goto err;
 
-- 
cgit v1.2.3-71-gd317


From 6b550f9495947fc279d12c38feaf98500e8d0646 Mon Sep 17 00:00:00 2001
From: "Serge E. Hallyn" <serge@hallyn.com>
Date: Tue, 10 Jan 2012 15:11:37 -0800
Subject: user namespace: make signal.c respect user namespaces

ipc/mqueue.c: for __SI_MESQ, convert the uid being sent to recipient's
user namespace. (new, thanks Oleg)

__send_signal: convert current's uid to the recipient's user namespace
for any siginfo which is not SI_FROMKERNEL (patch from Oleg, thanks
again :)

do_notify_parent and do_notify_parent_cldstop: map task's uid to parent's
user namespace

ptrace_signal maps parent's uid into current's user namespace before
including in signal to current.  IIUC Oleg has argued that this shouldn't
matter as the debugger will play with it, but it seems like not converting
the value currently being set is misleading.

Changelog:
Sep 20: Inspired by Oleg's suggestion, define map_cred_ns() helper to
	simplify callers and help make clear what we are translating
        (which uid into which namespace).  Passing the target task would
	make callers even easier to read, but we pass in user_ns because
	current_user_ns() != task_cred_xxx(current, user_ns).
Sep 20: As recommended by Oleg, also put task_pid_vnr() under rcu_read_lock
	in ptrace_signal().
Sep 23: In send_signal(), detect when (user) signal is coming from an
	ancestor or unrelated user namespace.  Pass that on to __send_signal,
	which sets si_uid to 0 or overflowuid if needed.
Oct 12: Base on Oleg's fixup_uid() patch.  On top of that, handle all
	SI_FROMKERNEL cases at callers, because we can't assume sender is
	current in those cases.
Nov 10: (mhelsley) rename fixup_uid to more meaningful usern_fixup_signal_uid
Nov 10: (akpm) make the !CONFIG_USER_NS case clearer

Signed-off-by: Serge Hallyn <serge.hallyn@canonical.com>
Cc: Oleg Nesterov <oleg@redhat.com>
Cc: Matt Helsley <matthltc@us.ibm.com>
Cc: "Eric W. Biederman" <ebiederm@xmission.com>
From: Serge Hallyn <serge.hallyn@canonical.com>
Subject: __send_signal: pass q->info, not info, to userns_fixup_signal_uid (v2)

Eric Biederman pointed out that passing info is a bug and could lead to a
NULL pointer deref to boot.

A collection of signal, securebits, filecaps, cap_bounds, and a few other
ltp tests passed with this kernel.

Changelog:
    Nov 18: previous patch missed a leading '&'

Signed-off-by: Serge Hallyn <serge.hallyn@canonical.com>
Cc: "Eric W. Biederman" <ebiederm@xmission.com>
From: Dan Carpenter <dan.carpenter@oracle.com>
Subject: ipc/mqueue: lock() => unlock() typo

There was a double lock typo introduced in b085f4bd6b21 "user namespace:
make signal.c respect user namespaces"

Signed-off-by: Dan Carpenter <dan.carpenter@oracle.com>
Cc: Oleg Nesterov <oleg@redhat.com>
Cc: Matt Helsley <matthltc@us.ibm.com>
Cc: "Eric W. Biederman" <ebiederm@xmission.com>
Acked-by: Serge Hallyn <serge@hallyn.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 ipc/mqueue.c    |  7 ++++++-
 kernel/signal.c | 43 ++++++++++++++++++++++++++++++++++++++++---
 2 files changed, 46 insertions(+), 4 deletions(-)

(limited to 'kernel')

diff --git a/ipc/mqueue.c b/ipc/mqueue.c
index 9a142a290749..9b7c8ab7d75c 100644
--- a/ipc/mqueue.c
+++ b/ipc/mqueue.c
@@ -32,6 +32,7 @@
 #include <linux/nsproxy.h>
 #include <linux/pid.h>
 #include <linux/ipc_namespace.h>
+#include <linux/user_namespace.h>
 #include <linux/slab.h>
 
 #include <net/sock.h>
@@ -542,9 +543,13 @@ static void __do_notify(struct mqueue_inode_info *info)
 			sig_i.si_errno = 0;
 			sig_i.si_code = SI_MESGQ;
 			sig_i.si_value = info->notify.sigev_value;
+			/* map current pid/uid into info->owner's namespaces */
+			rcu_read_lock();
 			sig_i.si_pid = task_tgid_nr_ns(current,
 						ns_of_pid(info->notify_owner));
-			sig_i.si_uid = current_uid();
+			sig_i.si_uid = user_ns_map_uid(info->user->user_ns,
+						current_cred(), current_uid());
+			rcu_read_unlock();
 
 			kill_pid_info(info->notify.sigev_signo,
 				      &sig_i, info->notify_owner);
diff --git a/kernel/signal.c b/kernel/signal.c
index d532f1709fbf..c73c4284160e 100644
--- a/kernel/signal.c
+++ b/kernel/signal.c
@@ -28,6 +28,7 @@
 #include <linux/freezer.h>
 #include <linux/pid_namespace.h>
 #include <linux/nsproxy.h>
+#include <linux/user_namespace.h>
 #define CREATE_TRACE_POINTS
 #include <trace/events/signal.h>
 
@@ -1019,6 +1020,34 @@ static inline int legacy_queue(struct sigpending *signals, int sig)
 	return (sig < SIGRTMIN) && sigismember(&signals->signal, sig);
 }
 
+/*
+ * map the uid in struct cred into user namespace *ns
+ */
+static inline uid_t map_cred_ns(const struct cred *cred,
+				struct user_namespace *ns)
+{
+	return user_ns_map_uid(ns, cred, cred->uid);
+}
+
+#ifdef CONFIG_USER_NS
+static inline void userns_fixup_signal_uid(struct siginfo *info, struct task_struct *t)
+{
+	if (current_user_ns() == task_cred_xxx(t, user_ns))
+		return;
+
+	if (SI_FROMKERNEL(info))
+		return;
+
+	info->si_uid = user_ns_map_uid(task_cred_xxx(t, user_ns),
+					current_cred(), info->si_uid);
+}
+#else
+static inline void userns_fixup_signal_uid(struct siginfo *info, struct task_struct *t)
+{
+	return;
+}
+#endif
+
 static int __send_signal(int sig, struct siginfo *info, struct task_struct *t,
 			int group, int from_ancestor_ns)
 {
@@ -1088,6 +1117,9 @@ static int __send_signal(int sig, struct siginfo *info, struct task_struct *t,
 				q->info.si_pid = 0;
 			break;
 		}
+
+		userns_fixup_signal_uid(&q->info, t);
+
 	} else if (!is_si_special(info)) {
 		if (sig >= SIGRTMIN && info->si_code != SI_USER) {
 			/*
@@ -1626,7 +1658,8 @@ bool do_notify_parent(struct task_struct *tsk, int sig)
 	 */
 	rcu_read_lock();
 	info.si_pid = task_pid_nr_ns(tsk, tsk->parent->nsproxy->pid_ns);
-	info.si_uid = __task_cred(tsk)->uid;
+	info.si_uid = map_cred_ns(__task_cred(tsk),
+			task_cred_xxx(tsk->parent, user_ns));
 	rcu_read_unlock();
 
 	info.si_utime = cputime_to_clock_t(tsk->utime + tsk->signal->utime);
@@ -1709,7 +1742,8 @@ static void do_notify_parent_cldstop(struct task_struct *tsk,
 	 */
 	rcu_read_lock();
 	info.si_pid = task_pid_nr_ns(tsk, parent->nsproxy->pid_ns);
-	info.si_uid = __task_cred(tsk)->uid;
+	info.si_uid = map_cred_ns(__task_cred(tsk),
+			task_cred_xxx(parent, user_ns));
 	rcu_read_unlock();
 
 	info.si_utime = cputime_to_clock_t(tsk->utime);
@@ -2125,8 +2159,11 @@ static int ptrace_signal(int signr, siginfo_t *info,
 		info->si_signo = signr;
 		info->si_errno = 0;
 		info->si_code = SI_USER;
+		rcu_read_lock();
 		info->si_pid = task_pid_vnr(current->parent);
-		info->si_uid = task_uid(current->parent);
+		info->si_uid = map_cred_ns(__task_cred(current->parent),
+				current_user_ns());
+		rcu_read_unlock();
 	}
 
 	/* If the (new) signal is now blocked, requeue it.  */
-- 
cgit v1.2.3-71-gd317


From bced76aeaca03b45e3b4bdb868cada328e497847 Mon Sep 17 00:00:00 2001
From: Peter Zijlstra <a.p.zijlstra@chello.nl>
Date: Wed, 11 Jan 2012 13:11:12 +0100
Subject: sched: Fix lockup by limiting load-balance retries on lock-break

Eric and David reported dead machines and traced it to commit
a195f004 ("sched: Fix load-balance lock-breaking"), it turns out
there's still a scenario where we can end up re-trying forever.

Since there is no strict forward progress guarantee in the
load-balance iteration we can get stuck re-retrying the same
task-set over and over.

Creating a forward progress guarantee with the existing
structure is somewhat non-trivial, for now simply terminate the
retry loop after a few tries.

Reported-by: Eric Dumazet <eric.dumazet@gmail.com>
Tested-by: Eric Dumazet <eric.dumazet@gmail.com>
Reported-by: David Ahern <dsahern@gmail.com>
[ logic cleanup as suggested by Eric ]
Signed-off-by: Peter Zijlstra <a.p.zijlstra@chello.nl>
Cc: Linus Torvalds <torvalds@linux-foundation.org>
Cc: Martin Schwidefsky <schwidefsky@de.ibm.com>
Cc: Frederic Weisbecker <fweisbec@gmail.com>
Cc: Suresh Siddha <suresh.b.siddha@intel.com>
Link: http://lkml.kernel.org/r/1326297936.2442.157.camel@twins
Signed-off-by: Ingo Molnar <mingo@elte.hu>
---
 kernel/sched/fair.c | 10 +++++++---
 1 file changed, 7 insertions(+), 3 deletions(-)

(limited to 'kernel')

diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c
index 8e42de9105f8..84adb2d66cbd 100644
--- a/kernel/sched/fair.c
+++ b/kernel/sched/fair.c
@@ -3130,8 +3130,10 @@ task_hot(struct task_struct *p, u64 now, struct sched_domain *sd)
 }
 
 #define LBF_ALL_PINNED	0x01
-#define LBF_NEED_BREAK	0x02
-#define LBF_ABORT	0x04
+#define LBF_NEED_BREAK	0x02	/* clears into HAD_BREAK */
+#define LBF_HAD_BREAK	0x04
+#define LBF_HAD_BREAKS	0x0C	/* count HAD_BREAKs overflows into ABORT */
+#define LBF_ABORT	0x10
 
 /*
  * can_migrate_task - may task p from runqueue rq be migrated to this_cpu?
@@ -4508,7 +4510,9 @@ redo:
 			goto out_balanced;
 
 		if (lb_flags & LBF_NEED_BREAK) {
-			lb_flags &= ~LBF_NEED_BREAK;
+			lb_flags += LBF_HAD_BREAK - LBF_NEED_BREAK;
+			if (lb_flags & LBF_ABORT)
+				goto out_balanced;
 			goto redo;
 		}
 
-- 
cgit v1.2.3-71-gd317


From 70b1e9161e903a9e1682aca3a832ed29ef876a4d Mon Sep 17 00:00:00 2001
From: Kevin Cernekee <cernekee@gmail.com>
Date: Sat, 12 Nov 2011 19:08:55 -0800
Subject: module: Add comments describing how the "strmap" logic works

Signed-off-by: Kevin Cernekee <cernekee@gmail.com>
Signed-off-by: Rusty Russell <rusty@rustcorp.com.au>
---
 kernel/module.c | 9 +++++++++
 1 file changed, 9 insertions(+)

(limited to 'kernel')

diff --git a/kernel/module.c b/kernel/module.c
index 178333c48d1e..cf9f1b6b3268 100644
--- a/kernel/module.c
+++ b/kernel/module.c
@@ -2193,6 +2193,13 @@ static void layout_symtab(struct module *mod, struct load_info *info)
 
 	src = (void *)info->hdr + symsect->sh_offset;
 	nsrc = symsect->sh_size / sizeof(*src);
+
+	/*
+	 * info->strmap has a '1' bit for each byte of .strtab we want to
+	 * keep resident in mod->core_strtab.  Everything else in .strtab
+	 * is unreferenced by the symbols in mod->core_symtab, and will be
+	 * discarded when add_kallsyms() compacts the string table.
+	 */
 	for (ndst = i = 1; i < nsrc; ++i, ++src)
 		if (is_core_symbol(src, info->sechdrs, info->hdr->e_shnum)) {
 			unsigned int j = src->st_name;
@@ -2215,6 +2222,8 @@ static void layout_symtab(struct module *mod, struct load_info *info)
 
 	/* Append room for core symbols' strings at end of core part. */
 	info->stroffs = mod->core_size;
+
+	/* First strtab byte (and first symtab entry) are zeroes. */
 	__set_bit(0, info->strmap);
 	mod->core_size += bitmap_weight(info->strmap, strsect->sh_size);
 }
-- 
cgit v1.2.3-71-gd317


From 48fd11880b5ef04270be8a87d9a9a9ee2fdae338 Mon Sep 17 00:00:00 2001
From: Kevin Cernekee <cernekee@gmail.com>
Date: Fri, 13 Jan 2012 09:32:14 +1030
Subject: module: Fix performance regression on modules with large symbol
 tables

Looking at /proc/kallsyms, one starts to ponder whether all of the extra
strtab-related complexity in module.c is worth the memory savings.

Instead of making the add_kallsyms() loop even more complex, I tried the
other route of deleting the strmap logic and naively copying each string
into core_strtab with no consideration for consolidating duplicates.

Performance on an "already exists" insmod of nvidia.ko (runs
add_kallsyms() but does not actually initialize the module):

	Original scheme: 1.230s
	With naive copying: 0.058s

Extra space used: 35k (of a 408k module).

Signed-off-by: Kevin Cernekee <cernekee@gmail.com>
Signed-off-by: Rusty Russell <rusty@rustcorp.com.au>
LKML-Reference: <73defb5e4bca04a6431392cc341112b1@localhost>
---
 kernel/module.c | 65 +++++++++++++++++++--------------------------------------
 1 file changed, 21 insertions(+), 44 deletions(-)

(limited to 'kernel')

diff --git a/kernel/module.c b/kernel/module.c
index cf9f1b6b3268..4928cffc3dcc 100644
--- a/kernel/module.c
+++ b/kernel/module.c
@@ -138,7 +138,6 @@ struct load_info {
 	unsigned long len;
 	Elf_Shdr *sechdrs;
 	char *secstrings, *strtab;
-	unsigned long *strmap;
 	unsigned long symoffs, stroffs;
 	struct _ddebug *debug;
 	unsigned int num_debug;
@@ -2178,12 +2177,19 @@ static bool is_core_symbol(const Elf_Sym *src, const Elf_Shdr *sechdrs,
 	return true;
 }
 
+/*
+ * We only allocate and copy the strings needed by the parts of symtab
+ * we keep.  This is simple, but has the effect of making multiple
+ * copies of duplicates.  We could be more sophisticated, see
+ * linux-kernel thread starting with
+ * <73defb5e4bca04a6431392cc341112b1@localhost>.
+ */
 static void layout_symtab(struct module *mod, struct load_info *info)
 {
 	Elf_Shdr *symsect = info->sechdrs + info->index.sym;
 	Elf_Shdr *strsect = info->sechdrs + info->index.str;
 	const Elf_Sym *src;
-	unsigned int i, nsrc, ndst;
+	unsigned int i, nsrc, ndst, strtab_size;
 
 	/* Put symbol section at end of init part of module. */
 	symsect->sh_flags |= SHF_ALLOC;
@@ -2194,38 +2200,23 @@ static void layout_symtab(struct module *mod, struct load_info *info)
 	src = (void *)info->hdr + symsect->sh_offset;
 	nsrc = symsect->sh_size / sizeof(*src);
 
-	/*
-	 * info->strmap has a '1' bit for each byte of .strtab we want to
-	 * keep resident in mod->core_strtab.  Everything else in .strtab
-	 * is unreferenced by the symbols in mod->core_symtab, and will be
-	 * discarded when add_kallsyms() compacts the string table.
-	 */
-	for (ndst = i = 1; i < nsrc; ++i, ++src)
+	/* Compute total space required for the core symbols' strtab. */
+	for (ndst = i = strtab_size = 1; i < nsrc; ++i, ++src)
 		if (is_core_symbol(src, info->sechdrs, info->hdr->e_shnum)) {
-			unsigned int j = src->st_name;
-
-			while (!__test_and_set_bit(j, info->strmap)
-			       && info->strtab[j])
-				++j;
-			++ndst;
+			strtab_size += strlen(&info->strtab[src->st_name]) + 1;
+			ndst++;
 		}
 
 	/* Append room for core symbols at end of core part. */
 	info->symoffs = ALIGN(mod->core_size, symsect->sh_addralign ?: 1);
-	mod->core_size = info->symoffs + ndst * sizeof(Elf_Sym);
+	info->stroffs = mod->core_size = info->symoffs + ndst * sizeof(Elf_Sym);
+	mod->core_size += strtab_size;
 
 	/* Put string table section at end of init part of module. */
 	strsect->sh_flags |= SHF_ALLOC;
 	strsect->sh_entsize = get_offset(mod, &mod->init_size, strsect,
 					 info->index.str) | INIT_OFFSET_MASK;
 	DEBUGP("\t%s\n", info->secstrings + strsect->sh_name);
-
-	/* Append room for core symbols' strings at end of core part. */
-	info->stroffs = mod->core_size;
-
-	/* First strtab byte (and first symtab entry) are zeroes. */
-	__set_bit(0, info->strmap);
-	mod->core_size += bitmap_weight(info->strmap, strsect->sh_size);
 }
 
 static void add_kallsyms(struct module *mod, const struct load_info *info)
@@ -2246,22 +2237,19 @@ static void add_kallsyms(struct module *mod, const struct load_info *info)
 		mod->symtab[i].st_info = elf_type(&mod->symtab[i], info);
 
 	mod->core_symtab = dst = mod->module_core + info->symoffs;
+	mod->core_strtab = s = mod->module_core + info->stroffs;
 	src = mod->symtab;
 	*dst = *src;
+	*s++ = 0;
 	for (ndst = i = 1; i < mod->num_symtab; ++i, ++src) {
 		if (!is_core_symbol(src, info->sechdrs, info->hdr->e_shnum))
 			continue;
+
 		dst[ndst] = *src;
-		dst[ndst].st_name = bitmap_weight(info->strmap,
-						  dst[ndst].st_name);
-		++ndst;
+		dst[ndst++].st_name = s - mod->core_strtab;
+		s += strlcpy(s, &mod->strtab[src->st_name], KSYM_NAME_LEN) + 1;
 	}
 	mod->core_num_syms = ndst;
-
-	mod->core_strtab = s = mod->module_core + info->stroffs;
-	for (*s = 0, i = 1; i < info->sechdrs[info->index.str].sh_size; ++i)
-		if (test_bit(i, info->strmap))
-			*++s = mod->strtab[i];
 }
 #else
 static inline void layout_symtab(struct module *mod, struct load_info *info)
@@ -2751,27 +2739,18 @@ static struct module *layout_and_allocate(struct load_info *info)
 	   this is done generically; there doesn't appear to be any
 	   special cases for the architectures. */
 	layout_sections(mod, info);
-
-	info->strmap = kzalloc(BITS_TO_LONGS(info->sechdrs[info->index.str].sh_size)
-			 * sizeof(long), GFP_KERNEL);
-	if (!info->strmap) {
-		err = -ENOMEM;
-		goto free_percpu;
-	}
 	layout_symtab(mod, info);
 
 	/* Allocate and move to the final place */
 	err = move_module(mod, info);
 	if (err)
-		goto free_strmap;
+		goto free_percpu;
 
 	/* Module has been copied to its final place now: return it. */
 	mod = (void *)info->sechdrs[info->index.mod].sh_addr;
 	kmemleak_load_module(mod, info);
 	return mod;
 
-free_strmap:
-	kfree(info->strmap);
 free_percpu:
 	percpu_modfree(mod);
 out:
@@ -2781,7 +2760,6 @@ out:
 /* mod is no longer valid after this! */
 static void module_deallocate(struct module *mod, struct load_info *info)
 {
-	kfree(info->strmap);
 	percpu_modfree(mod);
 	module_free(mod, mod->module_init);
 	module_free(mod, mod->module_core);
@@ -2911,8 +2889,7 @@ static struct module *load_module(void __user *umod,
 	if (err < 0)
 		goto unlink;
 
-	/* Get rid of temporary copy and strmap. */
-	kfree(info.strmap);
+	/* Get rid of temporary copy. */
 	free_copy(&info);
 
 	/* Done! */
-- 
cgit v1.2.3-71-gd317


From bd77c04772da38fca510c81f78e51f727123b919 Mon Sep 17 00:00:00 2001
From: Eric Dumazet <eric.dumazet@gmail.com>
Date: Fri, 13 Jan 2012 09:32:14 +1030
Subject: module: struct module_ref should contains long fields

module_ref contains two "unsigned int" fields.

Thats now too small, since some machines can open more than 2^32 files.

Check commit 518de9b39e8 (fs: allow for more than 2^31 files) for
reference.

We can add an aligned(2 * sizeof(unsigned long)) attribute to force
alloc_percpu() allocating module_ref areas in single cache lines.

Signed-off-by: Eric Dumazet <eric.dumazet@gmail.com>
CC: Rusty Russell <rusty@rustcorp.com.au>
CC: Tejun Heo <tj@kernel.org>
CC: Robin Holt <holt@sgi.com>
CC: David Miller <davem@davemloft.net>
Signed-off-by: Rusty Russell <rusty@rustcorp.com.au>
---
 include/linux/module.h      | 21 ++++++++++++++++-----
 kernel/debug/kdb/kdb_main.c |  2 +-
 kernel/module.c             |  8 ++++----
 3 files changed, 21 insertions(+), 10 deletions(-)

(limited to 'kernel')

diff --git a/include/linux/module.h b/include/linux/module.h
index 3cb7839a60b9..4598bf03e98b 100644
--- a/include/linux/module.h
+++ b/include/linux/module.h
@@ -205,6 +205,20 @@ enum module_state
 	MODULE_STATE_GOING,
 };
 
+/**
+ * struct module_ref - per cpu module reference counts
+ * @incs: number of module get on this cpu
+ * @decs: number of module put on this cpu
+ *
+ * We force an alignment on 8 or 16 bytes, so that alloc_percpu()
+ * put @incs/@decs in same cache line, with no extra memory cost,
+ * since alloc_percpu() is fine grained.
+ */
+struct module_ref {
+	unsigned long incs;
+	unsigned long decs;
+} __attribute((aligned(2 * sizeof(unsigned long))));
+
 struct module
 {
 	enum module_state state;
@@ -347,10 +361,7 @@ struct module
 	/* Destruction function. */
 	void (*exit)(void);
 
-	struct module_ref {
-		unsigned int incs;
-		unsigned int decs;
-	} __percpu *refptr;
+	struct module_ref __percpu *refptr;
 #endif
 
 #ifdef CONFIG_CONSTRUCTORS
@@ -434,7 +445,7 @@ extern void __module_put_and_exit(struct module *mod, long code)
 #define module_put_and_exit(code) __module_put_and_exit(THIS_MODULE, code);
 
 #ifdef CONFIG_MODULE_UNLOAD
-unsigned int module_refcount(struct module *mod);
+unsigned long module_refcount(struct module *mod);
 void __symbol_put(const char *symbol);
 #define symbol_put(x) __symbol_put(MODULE_SYMBOL_PREFIX #x)
 void symbol_put_addr(void *addr);
diff --git a/kernel/debug/kdb/kdb_main.c b/kernel/debug/kdb/kdb_main.c
index 63786e71a3cd..e2ae7349437f 100644
--- a/kernel/debug/kdb/kdb_main.c
+++ b/kernel/debug/kdb/kdb_main.c
@@ -1982,7 +1982,7 @@ static int kdb_lsmod(int argc, const char **argv)
 		kdb_printf("%-20s%8u  0x%p ", mod->name,
 			   mod->core_size, (void *)mod);
 #ifdef CONFIG_MODULE_UNLOAD
-		kdb_printf("%4d ", module_refcount(mod));
+		kdb_printf("%4ld ", module_refcount(mod));
 #endif
 		if (mod->state == MODULE_STATE_GOING)
 			kdb_printf(" (Unloading)");
diff --git a/kernel/module.c b/kernel/module.c
index 4928cffc3dcc..14b8e82e05d4 100644
--- a/kernel/module.c
+++ b/kernel/module.c
@@ -725,9 +725,9 @@ static int try_stop_module(struct module *mod, int flags, int *forced)
 	}
 }
 
-unsigned int module_refcount(struct module *mod)
+unsigned long module_refcount(struct module *mod)
 {
-	unsigned int incs = 0, decs = 0;
+	unsigned long incs = 0, decs = 0;
 	int cpu;
 
 	for_each_possible_cpu(cpu)
@@ -853,7 +853,7 @@ static inline void print_unload_info(struct seq_file *m, struct module *mod)
 	struct module_use *use;
 	int printed_something = 0;
 
-	seq_printf(m, " %u ", module_refcount(mod));
+	seq_printf(m, " %lu ", module_refcount(mod));
 
 	/* Always include a trailing , so userspace can differentiate
            between this and the old multi-field proc format. */
@@ -903,7 +903,7 @@ EXPORT_SYMBOL_GPL(symbol_put_addr);
 static ssize_t show_refcnt(struct module_attribute *mattr,
 			   struct module_kobject *mk, char *buffer)
 {
-	return sprintf(buffer, "%u\n", module_refcount(mk->mod));
+	return sprintf(buffer, "%lu\n", module_refcount(mk->mod));
 }
 
 static struct module_attribute refcnt = {
-- 
cgit v1.2.3-71-gd317


From 5e12416927975aa3c58394cea15db6c3e488a033 Mon Sep 17 00:00:00 2001
From: Jim Cromie <jim.cromie@gmail.com>
Date: Tue, 6 Dec 2011 12:11:31 -0700
Subject: module: replace DEBUGP with pr_debug

Use more flexible pr_debug.  This allows:

  echo "module module +p" > /dbg/dynamic_debug/control

to turn on debug messages when needed.

Signed-off-by: Jim Cromie <jim.cromie@gmail.com>
Signed-off-by: Rusty Russell <rusty@rustcorp.com.au>
---
 kernel/module.c | 46 ++++++++++++++++++++--------------------------
 1 file changed, 20 insertions(+), 26 deletions(-)

(limited to 'kernel')

diff --git a/kernel/module.c b/kernel/module.c
index 14b8e82e05d4..b02d6335f8a6 100644
--- a/kernel/module.c
+++ b/kernel/module.c
@@ -62,12 +62,6 @@
 #define CREATE_TRACE_POINTS
 #include <trace/events/module.h>
 
-#if 0
-#define DEBUGP printk
-#else
-#define DEBUGP(fmt , a...)
-#endif
-
 #ifndef ARCH_SHF_SMALL
 #define ARCH_SHF_SMALL 0
 #endif
@@ -409,7 +403,7 @@ const struct kernel_symbol *find_symbol(const char *name,
 		return fsa.sym;
 	}
 
-	DEBUGP("Failed to find symbol %s\n", name);
+	pr_debug("Failed to find symbol %s\n", name);
 	return NULL;
 }
 EXPORT_SYMBOL_GPL(find_symbol);
@@ -599,11 +593,11 @@ static int already_uses(struct module *a, struct module *b)
 
 	list_for_each_entry(use, &b->source_list, source_list) {
 		if (use->source == a) {
-			DEBUGP("%s uses %s!\n", a->name, b->name);
+			pr_debug("%s uses %s!\n", a->name, b->name);
 			return 1;
 		}
 	}
-	DEBUGP("%s does not use %s!\n", a->name, b->name);
+	pr_debug("%s does not use %s!\n", a->name, b->name);
 	return 0;
 }
 
@@ -618,7 +612,7 @@ static int add_module_usage(struct module *a, struct module *b)
 {
 	struct module_use *use;
 
-	DEBUGP("Allocating new usage for %s.\n", a->name);
+	pr_debug("Allocating new usage for %s.\n", a->name);
 	use = kmalloc(sizeof(*use), GFP_ATOMIC);
 	if (!use) {
 		printk(KERN_WARNING "%s: out of memory loading\n", a->name);
@@ -662,7 +656,7 @@ static void module_unload_free(struct module *mod)
 	mutex_lock(&module_mutex);
 	list_for_each_entry_safe(use, tmp, &mod->target_list, target_list) {
 		struct module *i = use->target;
-		DEBUGP("%s unusing %s\n", mod->name, i->name);
+		pr_debug("%s unusing %s\n", mod->name, i->name);
 		module_put(i);
 		list_del(&use->source_list);
 		list_del(&use->target_list);
@@ -760,7 +754,7 @@ static void wait_for_zero_refcount(struct module *mod)
 	/* Since we might sleep for some time, release the mutex first */
 	mutex_unlock(&module_mutex);
 	for (;;) {
-		DEBUGP("Looking at refcount...\n");
+		pr_debug("Looking at refcount...\n");
 		set_current_state(TASK_UNINTERRUPTIBLE);
 		if (module_refcount(mod) == 0)
 			break;
@@ -803,7 +797,7 @@ SYSCALL_DEFINE2(delete_module, const char __user *, name_user,
 	if (mod->state != MODULE_STATE_LIVE) {
 		/* FIXME: if (force), slam module count and wake up
                    waiter --RR */
-		DEBUGP("%s already dying\n", mod->name);
+		pr_debug("%s already dying\n", mod->name);
 		ret = -EBUSY;
 		goto out;
 	}
@@ -1056,7 +1050,7 @@ static int check_version(Elf_Shdr *sechdrs,
 
 		if (versions[i].crc == maybe_relocated(*crc, crc_owner))
 			return 1;
-		DEBUGP("Found checksum %lX vs module %lX\n",
+		pr_debug("Found checksum %lX vs module %lX\n",
 		       maybe_relocated(*crc, crc_owner), versions[i].crc);
 		goto bad_version;
 	}
@@ -1833,7 +1827,7 @@ static int simplify_symbols(struct module *mod, const struct load_info *info)
 		case SHN_COMMON:
 			/* We compiled with -fno-common.  These are not
 			   supposed to happen.  */
-			DEBUGP("Common symbol: %s\n", name);
+			pr_debug("Common symbol: %s\n", name);
 			printk("%s: please compile with -fno-common\n",
 			       mod->name);
 			ret = -ENOEXEC;
@@ -1841,7 +1835,7 @@ static int simplify_symbols(struct module *mod, const struct load_info *info)
 
 		case SHN_ABS:
 			/* Don't need to do anything */
-			DEBUGP("Absolute symbol: 0x%08lx\n",
+			pr_debug("Absolute symbol: 0x%08lx\n",
 			       (long)sym[i].st_value);
 			break;
 
@@ -1965,7 +1959,7 @@ static void layout_sections(struct module *mod, struct load_info *info)
 	for (i = 0; i < info->hdr->e_shnum; i++)
 		info->sechdrs[i].sh_entsize = ~0UL;
 
-	DEBUGP("Core section allocation order:\n");
+	pr_debug("Core section allocation order:\n");
 	for (m = 0; m < ARRAY_SIZE(masks); ++m) {
 		for (i = 0; i < info->hdr->e_shnum; ++i) {
 			Elf_Shdr *s = &info->sechdrs[i];
@@ -1977,7 +1971,7 @@ static void layout_sections(struct module *mod, struct load_info *info)
 			    || strstarts(sname, ".init"))
 				continue;
 			s->sh_entsize = get_offset(mod, &mod->core_size, s, i);
-			DEBUGP("\t%s\n", name);
+			pr_debug("\t%s\n", sname);
 		}
 		switch (m) {
 		case 0: /* executable */
@@ -1994,7 +1988,7 @@ static void layout_sections(struct module *mod, struct load_info *info)
 		}
 	}
 
-	DEBUGP("Init section allocation order:\n");
+	pr_debug("Init section allocation order:\n");
 	for (m = 0; m < ARRAY_SIZE(masks); ++m) {
 		for (i = 0; i < info->hdr->e_shnum; ++i) {
 			Elf_Shdr *s = &info->sechdrs[i];
@@ -2007,7 +2001,7 @@ static void layout_sections(struct module *mod, struct load_info *info)
 				continue;
 			s->sh_entsize = (get_offset(mod, &mod->init_size, s, i)
 					 | INIT_OFFSET_MASK);
-			DEBUGP("\t%s\n", sname);
+			pr_debug("\t%s\n", sname);
 		}
 		switch (m) {
 		case 0: /* executable */
@@ -2195,7 +2189,7 @@ static void layout_symtab(struct module *mod, struct load_info *info)
 	symsect->sh_flags |= SHF_ALLOC;
 	symsect->sh_entsize = get_offset(mod, &mod->init_size, symsect,
 					 info->index.sym) | INIT_OFFSET_MASK;
-	DEBUGP("\t%s\n", info->secstrings + symsect->sh_name);
+	pr_debug("\t%s\n", info->secstrings + symsect->sh_name);
 
 	src = (void *)info->hdr + symsect->sh_offset;
 	nsrc = symsect->sh_size / sizeof(*src);
@@ -2216,7 +2210,7 @@ static void layout_symtab(struct module *mod, struct load_info *info)
 	strsect->sh_flags |= SHF_ALLOC;
 	strsect->sh_entsize = get_offset(mod, &mod->init_size, strsect,
 					 info->index.str) | INIT_OFFSET_MASK;
-	DEBUGP("\t%s\n", info->secstrings + strsect->sh_name);
+	pr_debug("\t%s\n", info->secstrings + strsect->sh_name);
 }
 
 static void add_kallsyms(struct module *mod, const struct load_info *info)
@@ -2618,7 +2612,7 @@ static int move_module(struct module *mod, struct load_info *info)
 	mod->module_init = ptr;
 
 	/* Transfer each section which specifies SHF_ALLOC */
-	DEBUGP("final section addresses:\n");
+	pr_debug("final section addresses:\n");
 	for (i = 0; i < info->hdr->e_shnum; i++) {
 		void *dest;
 		Elf_Shdr *shdr = &info->sechdrs[i];
@@ -2636,8 +2630,8 @@ static int move_module(struct module *mod, struct load_info *info)
 			memcpy(dest, (void *)shdr->sh_addr, shdr->sh_size);
 		/* Update sh_addr to point to copy in image. */
 		shdr->sh_addr = (unsigned long)dest;
-		DEBUGP("\t0x%lx %s\n",
-		       shdr->sh_addr, info->secstrings + shdr->sh_name);
+		pr_debug("\t0x%lx %s\n",
+			 (long)shdr->sh_addr, info->secstrings + shdr->sh_name);
 	}
 
 	return 0;
@@ -2798,7 +2792,7 @@ static struct module *load_module(void __user *umod,
 	struct module *mod;
 	long err;
 
-	DEBUGP("load_module: umod=%p, len=%lu, uargs=%p\n",
+	pr_debug("load_module: umod=%p, len=%lu, uargs=%p\n",
 	       umod, len, uargs);
 
 	/* Copy in the blobs from userspace, check they are vaguely sane. */
-- 
cgit v1.2.3-71-gd317


From 8487bfd954928660a52e91384a9b1f1049217e35 Mon Sep 17 00:00:00 2001
From: Jim Cromie <jim.cromie@gmail.com>
Date: Tue, 6 Dec 2011 12:11:31 -0700
Subject: kernel/params: replace DEBUGP with pr_debug

Use more flexible pr_debug.  This allows:

  echo "module params +p" > /dbg/dynamic_debug/control

to turn on debug messages when needed.

Signed-off-by: Jim Cromie <jim.cromie@gmail.com>
Signed-off-by: Rusty Russell <rusty@rustcorp.com.au>
---
 kernel/params.c | 14 ++++----------
 1 file changed, 4 insertions(+), 10 deletions(-)

(limited to 'kernel')

diff --git a/kernel/params.c b/kernel/params.c
index 65aae11eb93f..9240664af110 100644
--- a/kernel/params.c
+++ b/kernel/params.c
@@ -25,12 +25,6 @@
 #include <linux/slab.h>
 #include <linux/ctype.h>
 
-#if 0
-#define DEBUGP printk
-#else
-#define DEBUGP(fmt, a...)
-#endif
-
 /* Protects all parameters, and incidentally kmalloced_param list. */
 static DEFINE_MUTEX(param_lock);
 
@@ -105,7 +99,7 @@ static int parse_one(char *param,
 			/* No one handled NULL, so do it here. */
 			if (!val && params[i].ops->set != param_set_bool)
 				return -EINVAL;
-			DEBUGP("They are equal!  Calling %p\n",
+			pr_debug("They are equal!  Calling %p\n",
 			       params[i].ops->set);
 			mutex_lock(&param_lock);
 			err = params[i].ops->set(val, &params[i]);
@@ -115,11 +109,11 @@ static int parse_one(char *param,
 	}
 
 	if (handle_unknown) {
-		DEBUGP("Unknown argument: calling %p\n", handle_unknown);
+		pr_debug("Unknown argument: calling %p\n", handle_unknown);
 		return handle_unknown(param, val);
 	}
 
-	DEBUGP("Unknown argument `%s'\n", param);
+	pr_debug("Unknown argument `%s'\n", param);
 	return -ENOENT;
 }
 
@@ -184,7 +178,7 @@ int parse_args(const char *name,
 {
 	char *param, *val;
 
-	DEBUGP("Parsing ARGS: %s\n", args);
+	pr_debug("Parsing ARGS: %s\n", args);
 
 	/* Chew leading spaces */
 	args = skip_spaces(args);
-- 
cgit v1.2.3-71-gd317


From cca3e707301862ca9b9327e6a732463982f8cd1b Mon Sep 17 00:00:00 2001
From: Kay Sievers <kay.sievers@vrfy.org>
Date: Fri, 13 Jan 2012 09:32:15 +1030
Subject: modules: sysfs - export: taint, coresize, initsize

Recent tools do not want to use /proc to retrieve module information. A few
values are currently missing from sysfs to replace the information available
in /proc/modules.

This adds /sys/module/*/{coresize,initsize,taint} attributes.

TAINT_PROPRIETARY_MODULE (P) and TAINT_OOT_MODULE (O) flags are both always
shown now, and do no longer exclude each other, also in /proc/modules.

Replace the open-coded sysfs attribute initializers with the __ATTR() macro.

Add the new attributes to Documentation/ABI.

Cc: Lucas De Marchi <lucas.demarchi@profusion.mobi>
Signed-off-by: Kay Sievers <kay.sievers@vrfy.org>
Signed-off-by: Rusty Russell <rusty@rustcorp.com.au>
---
 Documentation/ABI/testing/sysfs-module | 16 ++++++
 kernel/module.c                        | 93 +++++++++++++++++++++++-----------
 2 files changed, 80 insertions(+), 29 deletions(-)

(limited to 'kernel')

diff --git a/Documentation/ABI/testing/sysfs-module b/Documentation/ABI/testing/sysfs-module
index 9489ea8e294c..47064c2b1f79 100644
--- a/Documentation/ABI/testing/sysfs-module
+++ b/Documentation/ABI/testing/sysfs-module
@@ -33,3 +33,19 @@ Description:	Maximum time allowed for periodic transfers per microframe (μs)
 		Beware, non-standard modes are usually not thoroughly tested by
 		hardware designers, and the hardware can malfunction when this
 		setting differ from default 100.
+
+What:		/sys/module/*/{coresize,initsize}
+Date:		Jan 2012
+KernelVersion:»·3.3
+Contact:	Kay Sievers <kay.sievers@vrfy.org>
+Description:	Module size in bytes.
+
+What:		/sys/module/*/taint
+Date:		Jan 2012
+KernelVersion:»·3.3
+Contact:	Kay Sievers <kay.sievers@vrfy.org>
+Description:	Module taint flags:
+			P - proprietary module
+			O - out-of-tree module
+			F - force-loaded module
+			C - staging driver module
diff --git a/kernel/module.c b/kernel/module.c
index b02d6335f8a6..acf6ed3ebe81 100644
--- a/kernel/module.c
+++ b/kernel/module.c
@@ -842,6 +842,26 @@ out:
 	return ret;
 }
 
+static size_t module_flags_taint(struct module *mod, char *buf)
+{
+	size_t l = 0;
+
+	if (mod->taints & (1 << TAINT_PROPRIETARY_MODULE))
+		buf[l++] = 'P';
+	if (mod->taints & (1 << TAINT_OOT_MODULE))
+		buf[l++] = 'O';
+	if (mod->taints & (1 << TAINT_FORCED_MODULE))
+		buf[l++] = 'F';
+	if (mod->taints & (1 << TAINT_CRAP))
+		buf[l++] = 'C';
+	/*
+	 * TAINT_FORCED_RMMOD: could be added.
+	 * TAINT_UNSAFE_SMP, TAINT_MACHINE_CHECK, TAINT_BAD_PAGE don't
+	 * apply to modules.
+	 */
+	return l;
+}
+
 static inline void print_unload_info(struct seq_file *m, struct module *mod)
 {
 	struct module_use *use;
@@ -900,10 +920,8 @@ static ssize_t show_refcnt(struct module_attribute *mattr,
 	return sprintf(buffer, "%lu\n", module_refcount(mk->mod));
 }
 
-static struct module_attribute refcnt = {
-	.attr = { .name = "refcnt", .mode = 0444 },
-	.show = show_refcnt,
-};
+static struct module_attribute modinfo_refcnt =
+	__ATTR(refcnt, 0444, show_refcnt, NULL);
 
 void module_put(struct module *module)
 {
@@ -963,10 +981,8 @@ static ssize_t show_initstate(struct module_attribute *mattr,
 	return sprintf(buffer, "%s\n", state);
 }
 
-static struct module_attribute initstate = {
-	.attr = { .name = "initstate", .mode = 0444 },
-	.show = show_initstate,
-};
+static struct module_attribute modinfo_initstate =
+	__ATTR(initstate, 0444, show_initstate, NULL);
 
 static ssize_t store_uevent(struct module_attribute *mattr,
 			    struct module_kobject *mk,
@@ -979,18 +995,50 @@ static ssize_t store_uevent(struct module_attribute *mattr,
 	return count;
 }
 
-struct module_attribute module_uevent = {
-	.attr = { .name = "uevent", .mode = 0200 },
-	.store = store_uevent,
-};
+struct module_attribute module_uevent =
+	__ATTR(uevent, 0200, NULL, store_uevent);
+
+static ssize_t show_coresize(struct module_attribute *mattr,
+			     struct module_kobject *mk, char *buffer)
+{
+	return sprintf(buffer, "%u\n", mk->mod->core_size);
+}
+
+static struct module_attribute modinfo_coresize =
+	__ATTR(coresize, 0444, show_coresize, NULL);
+
+static ssize_t show_initsize(struct module_attribute *mattr,
+			     struct module_kobject *mk, char *buffer)
+{
+	return sprintf(buffer, "%u\n", mk->mod->init_size);
+}
+
+static struct module_attribute modinfo_initsize =
+	__ATTR(initsize, 0444, show_initsize, NULL);
+
+static ssize_t show_taint(struct module_attribute *mattr,
+			  struct module_kobject *mk, char *buffer)
+{
+	size_t l;
+
+	l = module_flags_taint(mk->mod, buffer);
+	buffer[l++] = '\n';
+	return l;
+}
+
+static struct module_attribute modinfo_taint =
+	__ATTR(taint, 0444, show_taint, NULL);
 
 static struct module_attribute *modinfo_attrs[] = {
+	&module_uevent,
 	&modinfo_version,
 	&modinfo_srcversion,
-	&initstate,
-	&module_uevent,
+	&modinfo_initstate,
+	&modinfo_coresize,
+	&modinfo_initsize,
+	&modinfo_taint,
 #ifdef CONFIG_MODULE_UNLOAD
-	&refcnt,
+	&modinfo_refcnt,
 #endif
 	NULL,
 };
@@ -3236,20 +3284,7 @@ static char *module_flags(struct module *mod, char *buf)
 	    mod->state == MODULE_STATE_GOING ||
 	    mod->state == MODULE_STATE_COMING) {
 		buf[bx++] = '(';
-		if (mod->taints & (1 << TAINT_PROPRIETARY_MODULE))
-			buf[bx++] = 'P';
-		else if (mod->taints & (1 << TAINT_OOT_MODULE))
-			buf[bx++] = 'O';
-		if (mod->taints & (1 << TAINT_FORCED_MODULE))
-			buf[bx++] = 'F';
-		if (mod->taints & (1 << TAINT_CRAP))
-			buf[bx++] = 'C';
-		/*
-		 * TAINT_FORCED_RMMOD: could be added.
-		 * TAINT_UNSAFE_SMP, TAINT_MACHINE_CHECK, TAINT_BAD_PAGE don't
-		 * apply to modules.
-		 */
-
+		bx += module_flags_taint(mod, buf + bx);
 		/* Show a - for module-is-being-unloaded */
 		if (mod->state == MODULE_STATE_GOING)
 			buf[bx++] = '-';
-- 
cgit v1.2.3-71-gd317


From 69116f279a9eaf4c540934269342d9149538fc79 Mon Sep 17 00:00:00 2001
From: Rusty Russell <rusty@rustcorp.com.au>
Date: Fri, 13 Jan 2012 09:32:17 +1030
Subject: module_param: avoid bool abuse, add bint for special cases.

For historical reasons, we allow module_param(bool) to take an int (or
an unsigned int).  That's going away.

A few drivers really want an int: they set it to -1 and a parameter
will set it to 0 or 1.  This sucks: reading them from sysfs will give
'Y' for both -1 and 1, but if we change it to an int, then the users
might be broken (if they did "param" instead of "param=1").

Use a new 'bint' parser for them.

(ntfs has a different problem: it needs an int for debug_msgs because
it's also exposed via sysctl.)

Cc: Steve Glendinning <steve.glendinning@smsc.com>
Cc: Jean Delvare <khali@linux-fr.org>
Cc: Guenter Roeck <guenter.roeck@ericsson.com>
Cc: Hoang-Nam Nguyen <hnguyen@de.ibm.com>
Cc: Christoph Raisch <raisch@de.ibm.com>
Cc: Roland Dreier <roland@kernel.org>
Cc: Sean Hefty <sean.hefty@intel.com>
Cc: Hal Rosenstock <hal.rosenstock@gmail.com>
Cc: linux390@de.ibm.com
Cc: Anton Altaparmakov <anton@tuxera.com>
Cc: Jaroslav Kysela <perex@perex.cz>
Cc: Takashi Iwai <tiwai@suse.de>
Cc: lm-sensors@lm-sensors.org
Cc: linux-rdma@vger.kernel.org
Cc: linux-s390@vger.kernel.org
Cc: linux-ntfs-dev@lists.sourceforge.net
Cc: alsa-devel@alsa-project.org
Acked-by: Takashi Iwai <tiwai@suse.de> (For the sound part)
Acked-by: Guenter Roeck <guenter.roeck@ericsson.com> (For the hwmon driver)
Signed-off-by: Rusty Russell <rusty@rustcorp.com.au>
---
 drivers/hwmon/emc2103.c                |  2 +-
 drivers/infiniband/hw/ehca/ehca_main.c |  2 +-
 drivers/s390/cio/cmf.c                 |  2 +-
 fs/ntfs/super.c                        |  2 +-
 include/linux/moduleparam.h            |  6 ++++++
 kernel/params.c                        | 24 ++++++++++++++++++++++++
 sound/pci/intel8x0.c                   |  4 ++--
 7 files changed, 36 insertions(+), 6 deletions(-)

(limited to 'kernel')

diff --git a/drivers/hwmon/emc2103.c b/drivers/hwmon/emc2103.c
index 848a2b0bc83f..865063914d76 100644
--- a/drivers/hwmon/emc2103.c
+++ b/drivers/hwmon/emc2103.c
@@ -55,7 +55,7 @@ static const u8 REG_TEMP_MAX[4] = { 0x34, 0x30, 0x31, 0x32 };
  * it.  Default is to leave the device in the state it's already in (-1).
  * This parameter allows APD mode to be optionally forced on or off */
 static int apd = -1;
-module_param(apd, bool, 0);
+module_param(apd, bint, 0);
 MODULE_PARM_DESC(init, "Set to zero to disable anti-parallel diode mode");
 
 struct temperature {
diff --git a/drivers/infiniband/hw/ehca/ehca_main.c b/drivers/infiniband/hw/ehca/ehca_main.c
index c240e9972cb0..8af8d4f7bdb1 100644
--- a/drivers/infiniband/hw/ehca/ehca_main.c
+++ b/drivers/infiniband/hw/ehca/ehca_main.c
@@ -82,7 +82,7 @@ module_param_named(port_act_time, ehca_port_act_time, int,  S_IRUGO);
 module_param_named(poll_all_eqs,  ehca_poll_all_eqs,  bool, S_IRUGO);
 module_param_named(static_rate,   ehca_static_rate,   int,  S_IRUGO);
 module_param_named(scaling_code,  ehca_scaling_code,  bool, S_IRUGO);
-module_param_named(lock_hcalls,   ehca_lock_hcalls,   bool, S_IRUGO);
+module_param_named(lock_hcalls,   ehca_lock_hcalls,   bint, S_IRUGO);
 module_param_named(number_of_cqs, ehca_max_cq,        int,  S_IRUGO);
 module_param_named(number_of_qps, ehca_max_qp,        int,  S_IRUGO);
 
diff --git a/drivers/s390/cio/cmf.c b/drivers/s390/cio/cmf.c
index 2985eb439485..204ca728e7fd 100644
--- a/drivers/s390/cio/cmf.c
+++ b/drivers/s390/cio/cmf.c
@@ -98,7 +98,7 @@ enum cmb_format {
  * enum cmb_format.
  */
 static int format = CMF_AUTODETECT;
-module_param(format, bool, 0444);
+module_param(format, bint, 0444);
 
 /**
  * struct cmb_operations - functions to use depending on cmb_format
diff --git a/fs/ntfs/super.c b/fs/ntfs/super.c
index 608be4516091..5a4a8af5c406 100644
--- a/fs/ntfs/super.c
+++ b/fs/ntfs/super.c
@@ -3198,7 +3198,7 @@ MODULE_DESCRIPTION("NTFS 1.2/3.x driver - Copyright (c) 2001-2011 Anton Altaparm
 MODULE_VERSION(NTFS_VERSION);
 MODULE_LICENSE("GPL");
 #ifdef DEBUG
-module_param(debug_msgs, bool, 0);
+module_param(debug_msgs, bint, 0);
 MODULE_PARM_DESC(debug_msgs, "Enable debug messages.");
 #endif
 
diff --git a/include/linux/moduleparam.h b/include/linux/moduleparam.h
index 794d4b0f1215..6bdde0c3bcca 100644
--- a/include/linux/moduleparam.h
+++ b/include/linux/moduleparam.h
@@ -367,6 +367,12 @@ extern int param_set_invbool(const char *val, const struct kernel_param *kp);
 extern int param_get_invbool(char *buffer, const struct kernel_param *kp);
 #define param_check_invbool(name, p) __param_check(name, p, bool)
 
+/* An int, which can only be set like a bool (though it shows as an int). */
+extern struct kernel_param_ops param_ops_bint;
+extern int param_set_bint(const char *val, const struct kernel_param *kp);
+#define param_get_bint param_get_int
+#define param_check_bint param_check_int
+
 /**
  * module_param_array - a parameter which is an array of some type
  * @name: the name of the array variable
diff --git a/kernel/params.c b/kernel/params.c
index 9240664af110..32ee04308285 100644
--- a/kernel/params.c
+++ b/kernel/params.c
@@ -363,6 +363,30 @@ struct kernel_param_ops param_ops_invbool = {
 };
 EXPORT_SYMBOL(param_ops_invbool);
 
+int param_set_bint(const char *val, const struct kernel_param *kp)
+{
+	struct kernel_param boolkp;
+	bool v;
+	int ret;
+
+	/* Match bool exactly, by re-using it. */
+	boolkp = *kp;
+	boolkp.arg = &v;
+	boolkp.flags |= KPARAM_ISBOOL;
+
+	ret = param_set_bool(val, &boolkp);
+	if (ret == 0)
+		*(int *)kp->arg = v;
+	return ret;
+}
+EXPORT_SYMBOL(param_set_bint);
+
+struct kernel_param_ops param_ops_bint = {
+	.set = param_set_bint,
+	.get = param_get_int,
+};
+EXPORT_SYMBOL(param_ops_bint);
+
 /* We break the rule and mangle the string. */
 static int param_array(const char *name,
 		       const char *val,
diff --git a/sound/pci/intel8x0.c b/sound/pci/intel8x0.c
index 40b181bab930..9f3b01bb72c8 100644
--- a/sound/pci/intel8x0.c
+++ b/sound/pci/intel8x0.c
@@ -95,13 +95,13 @@ module_param(ac97_quirk, charp, 0444);
 MODULE_PARM_DESC(ac97_quirk, "AC'97 workaround for strange hardware.");
 module_param(buggy_semaphore, bool, 0444);
 MODULE_PARM_DESC(buggy_semaphore, "Enable workaround for hardwares with problematic codec semaphores.");
-module_param(buggy_irq, bool, 0444);
+module_param(buggy_irq, bint, 0444);
 MODULE_PARM_DESC(buggy_irq, "Enable workaround for buggy interrupts on some motherboards.");
 module_param(xbox, bool, 0444);
 MODULE_PARM_DESC(xbox, "Set to 1 for Xbox, if you have problems with the AC'97 codec detection.");
 module_param(spdif_aclink, int, 0444);
 MODULE_PARM_DESC(spdif_aclink, "S/PDIF over AC-link.");
-module_param(inside_vm, bool, 0444);
+module_param(inside_vm, bint, 0444);
 MODULE_PARM_DESC(inside_vm, "KVM/Parallels optimization.");
 
 /* just for backward compatibility */
-- 
cgit v1.2.3-71-gd317


From 29d4d6df107b9d86982dc759f5b1ddfe2c6b29c0 Mon Sep 17 00:00:00 2001
From: Rusty Russell <rusty@rustcorp.com.au>
Date: Fri, 13 Jan 2012 09:32:17 +1030
Subject: printk: fix unnecessary module_param_name.

You don't need module_param_name if the name is the same!

Cc: Yanmin Zhang <yanmin_zhang@linux.intel.com>
Cc: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Rusty Russell <rusty@rustcorp.com.au>
---
 kernel/printk.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

(limited to 'kernel')

diff --git a/kernel/printk.c b/kernel/printk.c
index 989e4a52da76..63b3bc31fe32 100644
--- a/kernel/printk.c
+++ b/kernel/printk.c
@@ -532,7 +532,7 @@ static int __init ignore_loglevel_setup(char *str)
 }
 
 early_param("ignore_loglevel", ignore_loglevel_setup);
-module_param_named(ignore_loglevel, ignore_loglevel, bool, S_IRUGO | S_IWUSR);
+module_param(ignore_loglevel, bool, S_IRUGO | S_IWUSR);
 MODULE_PARM_DESC(ignore_loglevel, "ignore loglevel setting, to"
 	"print all kernel messages to the console.");
 
-- 
cgit v1.2.3-71-gd317


From 6d6a55ec0877393f467067d44b9a2a8c2e4a82d2 Mon Sep 17 00:00:00 2001
From: Rusty Russell <rusty@rustcorp.com.au>
Date: Fri, 13 Jan 2012 09:32:18 +1030
Subject: kernel/async: remove redundant declaration.

It's in linux/init.h, and I'm about to change it to a bool.

Cc: Arjan van de Ven <arjan@linux.intel.com>
Signed-off-by: Rusty Russell <rusty@rustcorp.com.au>
---
 kernel/async.c | 2 --
 1 file changed, 2 deletions(-)

(limited to 'kernel')

diff --git a/kernel/async.c b/kernel/async.c
index 80b74b88fefe..bd0c168a3bbe 100644
--- a/kernel/async.c
+++ b/kernel/async.c
@@ -78,8 +78,6 @@ static DECLARE_WAIT_QUEUE_HEAD(async_done);
 
 static atomic_t entry_count;
 
-extern int initcall_debug;
-
 
 /*
  * MUST be called with the lock held!
-- 
cgit v1.2.3-71-gd317


From 2329abfa344a9a824bc4c71f2415528777265510 Mon Sep 17 00:00:00 2001
From: Rusty Russell <rusty@rustcorp.com.au>
Date: Fri, 13 Jan 2012 09:32:18 +1030
Subject: module_param: make bool parameters really bool (core code)

module_param(bool) used to counter-intuitively take an int.  In
fddd5201 (mid-2009) we allowed bool or int/unsigned int using a messy
trick.

It's time to remove the int/unsigned int option.  For this version
it'll simply give a warning, but it'll break next kernel version.

Signed-off-by: Rusty Russell <rusty@rustcorp.com.au>
---
 include/linux/init.h   | 3 ++-
 init/main.c            | 2 +-
 kernel/irq/internals.h | 2 +-
 kernel/irq/spurious.c  | 2 +-
 kernel/printk.c        | 8 ++++----
 5 files changed, 9 insertions(+), 8 deletions(-)

(limited to 'kernel')

diff --git a/include/linux/init.h b/include/linux/init.h
index 9146f39cdddf..6b951095a42f 100644
--- a/include/linux/init.h
+++ b/include/linux/init.h
@@ -2,6 +2,7 @@
 #define _LINUX_INIT_H
 
 #include <linux/compiler.h>
+#include <linux/types.h>
 
 /* These macros are used to mark some functions or 
  * initialized data (doesn't apply to uninitialized data)
@@ -156,7 +157,7 @@ void prepare_namespace(void);
 
 extern void (*late_time_init)(void);
 
-extern int initcall_debug;
+extern bool initcall_debug;
 
 #endif
   
diff --git a/init/main.c b/init/main.c
index 415548e808d2..ff49a6dacfbb 100644
--- a/init/main.c
+++ b/init/main.c
@@ -648,7 +648,7 @@ static void __init do_ctors(void)
 #endif
 }
 
-int initcall_debug;
+bool initcall_debug;
 core_param(initcall_debug, initcall_debug, bool, 0644);
 
 static char msgbuf[64];
diff --git a/kernel/irq/internals.h b/kernel/irq/internals.h
index a73dd6c7372d..b7952316016a 100644
--- a/kernel/irq/internals.h
+++ b/kernel/irq/internals.h
@@ -15,7 +15,7 @@
 
 #define istate core_internal_state__do_not_mess_with_it
 
-extern int noirqdebug;
+extern bool noirqdebug;
 
 /*
  * Bits used by threaded handlers:
diff --git a/kernel/irq/spurious.c b/kernel/irq/spurious.c
index dc813a948be2..611cd6003c45 100644
--- a/kernel/irq/spurious.c
+++ b/kernel/irq/spurious.c
@@ -325,7 +325,7 @@ void note_interrupt(unsigned int irq, struct irq_desc *desc,
 	desc->irqs_unhandled = 0;
 }
 
-int noirqdebug __read_mostly;
+bool noirqdebug __read_mostly;
 
 int noirqdebug_setup(char *str)
 {
diff --git a/kernel/printk.c b/kernel/printk.c
index 63b3bc31fe32..13c0a1143f49 100644
--- a/kernel/printk.c
+++ b/kernel/printk.c
@@ -521,7 +521,7 @@ static void __call_console_drivers(unsigned start, unsigned end)
 	}
 }
 
-static int __read_mostly ignore_loglevel;
+static bool __read_mostly ignore_loglevel;
 
 static int __init ignore_loglevel_setup(char *str)
 {
@@ -696,9 +696,9 @@ static void zap_locks(void)
 }
 
 #if defined(CONFIG_PRINTK_TIME)
-static int printk_time = 1;
+static bool printk_time = 1;
 #else
-static int printk_time = 0;
+static bool printk_time = 0;
 #endif
 module_param_named(time, printk_time, bool, S_IRUGO | S_IWUSR);
 
@@ -1098,7 +1098,7 @@ int update_console_cmdline(char *name, int idx, char *name_new, int idx_new, cha
 	return -1;
 }
 
-int console_suspend_enabled = 1;
+bool console_suspend_enabled = 1;
 EXPORT_SYMBOL(console_suspend_enabled);
 
 static int __init console_suspend_disable(char *str)
-- 
cgit v1.2.3-71-gd317


From efeb156e7275c5b6c6e0f96aceb3c6abf98fc392 Mon Sep 17 00:00:00 2001
From: Stephen Boyd <sboyd@codeaurora.org>
Date: Thu, 12 Jan 2012 17:17:11 -0800
Subject: kprobes: silence DEBUG_STRICT_USER_COPY_CHECKS=y warning

Enabling DEBUG_STRICT_USER_COPY_CHECKS causes the following warning:

  In file included from arch/x86/include/asm/uaccess.h:573,
                   from kernel/kprobes.c:55:
  In function 'copy_from_user',
      inlined from 'write_enabled_file_bool' at
      kernel/kprobes.c:2191:
  arch/x86/include/asm/uaccess_64.h:65:
  warning: call to 'copy_from_user_overflow' declared with attribute warning: copy_from_user() buffer size is not provably correct

presumably due to buf_size being signed causing GCC to fail to see that
buf_size can't become negative.

Signed-off-by: Stephen Boyd <sboyd@codeaurora.org>
Cc: Ananth N Mavinakayanahalli <ananth@in.ibm.com>
Cc: Anil S Keshavamurthy <anil.s.keshavamurthy@intel.com>
Cc: David S. Miller <davem@davemloft.net>
Acked-by: Masami Hiramatsu <masami.hiramatsu.pt@hitachi.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 kernel/kprobes.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

(limited to 'kernel')

diff --git a/kernel/kprobes.c b/kernel/kprobes.c
index e5d84644823b..95dd7212e610 100644
--- a/kernel/kprobes.c
+++ b/kernel/kprobes.c
@@ -2198,7 +2198,7 @@ static ssize_t write_enabled_file_bool(struct file *file,
 	       const char __user *user_buf, size_t count, loff_t *ppos)
 {
 	char buf[32];
-	int buf_size;
+	size_t buf_size;
 
 	buf_size = min(count, (sizeof(buf)-1));
 	if (copy_from_user(buf, user_buf, buf_size))
-- 
cgit v1.2.3-71-gd317


From 9402c95f34a66e81eba473a2f7267bbae5a1dee2 Mon Sep 17 00:00:00 2001
From: Joe Perches <joe@perches.com>
Date: Thu, 12 Jan 2012 17:17:17 -0800
Subject: treewide: remove useless NORET_TYPE macro and uses

It's a very old and now unused prototype marking so just delete it.

Neaten panic pointer argument style to keep checkpatch quiet.

Signed-off-by: Joe Perches <joe@perches.com>
Cc: Ingo Molnar <mingo@elte.hu>
Cc: Peter Zijlstra <peterz@infradead.org>
Cc: Haavard Skinnemoen <hskinnemoen@gmail.com>
Cc: Hans-Christian Egtvedt <egtvedt@samfundet.no>
Cc: Tony Luck <tony.luck@intel.com>
Cc: Fenghua Yu <fenghua.yu@intel.com>
Acked-by: Geert Uytterhoeven <geert@linux-m68k.org>
Acked-by: Ralf Baechle <ralf@linux-mips.org>
Cc: Benjamin Herrenschmidt <benh@kernel.crashing.org>
Cc: Paul Mackerras <paulus@samba.org>
Cc: Martin Schwidefsky <schwidefsky@de.ibm.com>
Cc: Heiko Carstens <heiko.carstens@de.ibm.com>
Cc: Chris Metcalf <cmetcalf@tilera.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 arch/avr32/include/asm/system.h        | 2 +-
 arch/avr32/kernel/traps.c              | 2 +-
 arch/ia64/kernel/machine_kexec.c       | 2 +-
 arch/m68k/amiga/config.c               | 2 +-
 arch/mips/include/asm/ptrace.h         | 2 +-
 arch/mips/kernel/traps.c               | 2 +-
 arch/powerpc/kernel/machine_kexec_32.c | 2 +-
 arch/powerpc/kernel/machine_kexec_64.c | 6 +++---
 arch/s390/kernel/nmi.c                 | 2 +-
 arch/tile/kernel/machine_kexec.c       | 4 ++--
 include/linux/kernel.h                 | 6 +++---
 include/linux/linkage.h                | 1 -
 include/linux/sched.h                  | 2 +-
 kernel/exit.c                          | 6 +++---
 kernel/panic.c                         | 2 +-
 15 files changed, 21 insertions(+), 22 deletions(-)

(limited to 'kernel')

diff --git a/arch/avr32/include/asm/system.h b/arch/avr32/include/asm/system.h
index 9702c2213e1e..62d9ded01635 100644
--- a/arch/avr32/include/asm/system.h
+++ b/arch/avr32/include/asm/system.h
@@ -169,7 +169,7 @@ static inline unsigned long __cmpxchg_local(volatile void *ptr,
 #define cmpxchg64_local(ptr, o, n) __cmpxchg64_local_generic((ptr), (o), (n))
 
 struct pt_regs;
-void NORET_TYPE die(const char *str, struct pt_regs *regs, long err);
+void die(const char *str, struct pt_regs *regs, long err);
 void _exception(long signr, struct pt_regs *regs, int code,
 		unsigned long addr);
 
diff --git a/arch/avr32/kernel/traps.c b/arch/avr32/kernel/traps.c
index 7aa25756412f..3d760c06f024 100644
--- a/arch/avr32/kernel/traps.c
+++ b/arch/avr32/kernel/traps.c
@@ -24,7 +24,7 @@
 
 static DEFINE_SPINLOCK(die_lock);
 
-void NORET_TYPE die(const char *str, struct pt_regs *regs, long err)
+void die(const char *str, struct pt_regs *regs, long err)
 {
 	static int die_counter;
 
diff --git a/arch/ia64/kernel/machine_kexec.c b/arch/ia64/kernel/machine_kexec.c
index 3d3aeef46947..581a16d5e85b 100644
--- a/arch/ia64/kernel/machine_kexec.c
+++ b/arch/ia64/kernel/machine_kexec.c
@@ -27,7 +27,7 @@
 #include <asm/sal.h>
 #include <asm/mca.h>
 
-typedef NORET_TYPE void (*relocate_new_kernel_t)(
+typedef void (*relocate_new_kernel_t)(
 					unsigned long indirection_page,
 					unsigned long start_address,
 					struct ia64_boot_param *boot_param,
diff --git a/arch/m68k/amiga/config.c b/arch/m68k/amiga/config.c
index 82a4bb51d5d8..a3b0558328b6 100644
--- a/arch/m68k/amiga/config.c
+++ b/arch/m68k/amiga/config.c
@@ -511,7 +511,7 @@ static unsigned long amiga_gettimeoffset(void)
 	return ticks + offset;
 }
 
-static NORET_TYPE void amiga_reset(void)
+static void amiga_reset(void)
     ATTRIB_NORET;
 
 static void amiga_reset(void)
diff --git a/arch/mips/include/asm/ptrace.h b/arch/mips/include/asm/ptrace.h
index de39b1f343ea..3d913259e507 100644
--- a/arch/mips/include/asm/ptrace.h
+++ b/arch/mips/include/asm/ptrace.h
@@ -144,7 +144,7 @@ extern int ptrace_set_watch_regs(struct task_struct *child,
 extern asmlinkage void syscall_trace_enter(struct pt_regs *regs);
 extern asmlinkage void syscall_trace_leave(struct pt_regs *regs);
 
-extern NORET_TYPE void die(const char *, struct pt_regs *) ATTRIB_NORET;
+extern void die(const char *, struct pt_regs *) ATTRIB_NORET;
 
 static inline void die_if_kernel(const char *str, struct pt_regs *regs)
 {
diff --git a/arch/mips/kernel/traps.c b/arch/mips/kernel/traps.c
index 5c8a49d55054..725e9a5ca966 100644
--- a/arch/mips/kernel/traps.c
+++ b/arch/mips/kernel/traps.c
@@ -1340,7 +1340,7 @@ void ejtag_exception_handler(struct pt_regs *regs)
 /*
  * NMI exception handler.
  */
-NORET_TYPE void ATTRIB_NORET nmi_exception_handler(struct pt_regs *regs)
+void ATTRIB_NORET nmi_exception_handler(struct pt_regs *regs)
 {
 	bust_spinlocks(1);
 	printk("NMI taken!!!!\n");
diff --git a/arch/powerpc/kernel/machine_kexec_32.c b/arch/powerpc/kernel/machine_kexec_32.c
index e63f2e7d2efb..026e7f153949 100644
--- a/arch/powerpc/kernel/machine_kexec_32.c
+++ b/arch/powerpc/kernel/machine_kexec_32.c
@@ -16,7 +16,7 @@
 #include <asm/hw_irq.h>
 #include <asm/io.h>
 
-typedef NORET_TYPE void (*relocate_new_kernel_t)(
+typedef void (*relocate_new_kernel_t)(
 				unsigned long indirection_page,
 				unsigned long reboot_code_buffer,
 				unsigned long start_address) ATTRIB_NORET;
diff --git a/arch/powerpc/kernel/machine_kexec_64.c b/arch/powerpc/kernel/machine_kexec_64.c
index 26ccbf77dd41..5fbbf814923a 100644
--- a/arch/powerpc/kernel/machine_kexec_64.c
+++ b/arch/powerpc/kernel/machine_kexec_64.c
@@ -307,9 +307,9 @@ static union thread_union kexec_stack __init_task_data =
 struct paca_struct kexec_paca;
 
 /* Our assembly helper, in kexec_stub.S */
-extern NORET_TYPE void kexec_sequence(void *newstack, unsigned long start,
-					void *image, void *control,
-					void (*clear_all)(void)) ATTRIB_NORET;
+extern void kexec_sequence(void *newstack, unsigned long start,
+			   void *image, void *control,
+			   void (*clear_all)(void)) ATTRIB_NORET;
 
 /* too late to fail here */
 void default_machine_kexec(struct kimage *image)
diff --git a/arch/s390/kernel/nmi.c b/arch/s390/kernel/nmi.c
index fab88431a06f..0fd2e863e114 100644
--- a/arch/s390/kernel/nmi.c
+++ b/arch/s390/kernel/nmi.c
@@ -30,7 +30,7 @@ struct mcck_struct {
 
 static DEFINE_PER_CPU(struct mcck_struct, cpu_mcck);
 
-static NORET_TYPE void s390_handle_damage(char *msg)
+static void s390_handle_damage(char *msg)
 {
 	smp_send_stop();
 	disabled_wait((unsigned long) __builtin_return_address(0));
diff --git a/arch/tile/kernel/machine_kexec.c b/arch/tile/kernel/machine_kexec.c
index e00d7179989e..b0c907059067 100644
--- a/arch/tile/kernel/machine_kexec.c
+++ b/arch/tile/kernel/machine_kexec.c
@@ -248,10 +248,10 @@ static void setup_quasi_va_is_pa(void)
 }
 
 
-NORET_TYPE void machine_kexec(struct kimage *image)
+void machine_kexec(struct kimage *image)
 {
 	void *reboot_code_buffer;
-	NORET_TYPE void (*rnk)(unsigned long, void *, unsigned long)
+	void (*rnk)(unsigned long, void *, unsigned long)
 		ATTRIB_NORET;
 
 	/* Mask all interrupts before starting to reboot. */
diff --git a/include/linux/kernel.h b/include/linux/kernel.h
index 60934395e36c..aaf1753dd2b3 100644
--- a/include/linux/kernel.h
+++ b/include/linux/kernel.h
@@ -185,16 +185,16 @@ static inline void might_fault(void)
 
 extern struct atomic_notifier_head panic_notifier_list;
 extern long (*panic_blink)(int state);
-NORET_TYPE __printf(1, 2)
+__printf(1, 2)
 void panic(const char *fmt, ...)
 	ATTRIB_NORET __cold;
 extern void oops_enter(void);
 extern void oops_exit(void);
 void print_oops_end_marker(void);
 extern int oops_may_print(void);
-NORET_TYPE void do_exit(long error_code)
+void do_exit(long error_code)
 	ATTRIB_NORET;
-NORET_TYPE void complete_and_exit(struct completion *, long)
+void complete_and_exit(struct completion *, long)
 	ATTRIB_NORET;
 
 /* Internal, do not use. */
diff --git a/include/linux/linkage.h b/include/linux/linkage.h
index c75074cb8ad4..6a8f252e49ee 100644
--- a/include/linux/linkage.h
+++ b/include/linux/linkage.h
@@ -88,7 +88,6 @@
 
 #endif
 
-#define NORET_TYPE    /**/
 #define ATTRIB_NORET  __attribute__((noreturn))
 
 #endif
diff --git a/include/linux/sched.h b/include/linux/sched.h
index 21cd0303af51..4032ec1cf836 100644
--- a/include/linux/sched.h
+++ b/include/linux/sched.h
@@ -2275,7 +2275,7 @@ extern void __cleanup_sighand(struct sighand_struct *);
 extern void exit_itimers(struct signal_struct *);
 extern void flush_itimer_signals(void);
 
-extern NORET_TYPE void do_group_exit(int);
+extern void do_group_exit(int);
 
 extern void daemonize(const char *, ...);
 extern int allow_signal(int);
diff --git a/kernel/exit.c b/kernel/exit.c
index 94ed6e20bb53..c44738267be7 100644
--- a/kernel/exit.c
+++ b/kernel/exit.c
@@ -887,7 +887,7 @@ static void check_stack_usage(void)
 static inline void check_stack_usage(void) {}
 #endif
 
-NORET_TYPE void do_exit(long code)
+void do_exit(long code)
 {
 	struct task_struct *tsk = current;
 	int group_dead;
@@ -1051,7 +1051,7 @@ NORET_TYPE void do_exit(long code)
 
 EXPORT_SYMBOL_GPL(do_exit);
 
-NORET_TYPE void complete_and_exit(struct completion *comp, long code)
+void complete_and_exit(struct completion *comp, long code)
 {
 	if (comp)
 		complete(comp);
@@ -1070,7 +1070,7 @@ SYSCALL_DEFINE1(exit, int, error_code)
  * Take down every thread in the group.  This is called by fatal signals
  * as well as by sys_exit_group (below).
  */
-NORET_TYPE void
+void
 do_group_exit(int exit_code)
 {
 	struct signal_struct *sig = current->signal;
diff --git a/kernel/panic.c b/kernel/panic.c
index 3458469eb7c3..6fd09ed6fd90 100644
--- a/kernel/panic.c
+++ b/kernel/panic.c
@@ -57,7 +57,7 @@ EXPORT_SYMBOL(panic_blink);
  *
  *	This function never returns.
  */
-NORET_TYPE void panic(const char * fmt, ...)
+void panic(const char *fmt, ...)
 {
 	static char buf[1024];
 	va_list args;
-- 
cgit v1.2.3-71-gd317


From a3dd3323058d281abd584b15ad4c5b65064d7a61 Mon Sep 17 00:00:00 2001
From: WANG Cong <xiyou.wangcong@gmail.com>
Date: Thu, 12 Jan 2012 17:20:11 -0800
Subject: kexec: remove KMSG_DUMP_KEXEC

KMSG_DUMP_KEXEC is useless because we already save kernel messages inside
/proc/vmcore, and it is unsafe to allow modules to do other stuffs in a
crash dump scenario.

[akpm@linux-foundation.org: fix powerpc build]
Signed-off-by: WANG Cong <xiyou.wangcong@gmail.com>
Reported-by: Vivek Goyal <vgoyal@redhat.com>
Acked-by: Vivek Goyal <vgoyal@redhat.com>
Acked-by: Jarod Wilson <jarod@redhat.com>
Cc: "Eric W. Biederman" <ebiederm@xmission.com>
Cc: KOSAKI Motohiro <kosaki.motohiro@jp.fujitsu.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 arch/powerpc/platforms/pseries/nvram.c | 1 -
 drivers/char/ramoops.c                 | 3 +--
 drivers/mtd/mtdoops.c                  | 3 +--
 include/linux/kmsg_dump.h              | 1 -
 kernel/kexec.c                         | 3 ---
 5 files changed, 2 insertions(+), 9 deletions(-)

(limited to 'kernel')

diff --git a/arch/powerpc/platforms/pseries/nvram.c b/arch/powerpc/platforms/pseries/nvram.c
index 330a57b7c17c..36f957f31842 100644
--- a/arch/powerpc/platforms/pseries/nvram.c
+++ b/arch/powerpc/platforms/pseries/nvram.c
@@ -638,7 +638,6 @@ static void oops_to_nvram(struct kmsg_dumper *dumper,
 		/* These are almost always orderly shutdowns. */
 		return;
 	case KMSG_DUMP_OOPS:
-	case KMSG_DUMP_KEXEC:
 		break;
 	case KMSG_DUMP_PANIC:
 		panicking = true;
diff --git a/drivers/char/ramoops.c b/drivers/char/ramoops.c
index 7c7f42a1f880..feda90cdac9f 100644
--- a/drivers/char/ramoops.c
+++ b/drivers/char/ramoops.c
@@ -83,8 +83,7 @@ static void ramoops_do_dump(struct kmsg_dumper *dumper,
 	struct timeval timestamp;
 
 	if (reason != KMSG_DUMP_OOPS &&
-	    reason != KMSG_DUMP_PANIC &&
-	    reason != KMSG_DUMP_KEXEC)
+	    reason != KMSG_DUMP_PANIC)
 		return;
 
 	/* Only dump oopses if dump_oops is set */
diff --git a/drivers/mtd/mtdoops.c b/drivers/mtd/mtdoops.c
index db8e8272d69b..3ce99e00a49e 100644
--- a/drivers/mtd/mtdoops.c
+++ b/drivers/mtd/mtdoops.c
@@ -315,8 +315,7 @@ static void mtdoops_do_dump(struct kmsg_dumper *dumper,
 	char *dst;
 
 	if (reason != KMSG_DUMP_OOPS &&
-	    reason != KMSG_DUMP_PANIC &&
-	    reason != KMSG_DUMP_KEXEC)
+	    reason != KMSG_DUMP_PANIC)
 		return;
 
 	/* Only dump oopses if dump_oops is set */
diff --git a/include/linux/kmsg_dump.h b/include/linux/kmsg_dump.h
index ee0c952188de..fee66317e071 100644
--- a/include/linux/kmsg_dump.h
+++ b/include/linux/kmsg_dump.h
@@ -18,7 +18,6 @@
 enum kmsg_dump_reason {
 	KMSG_DUMP_OOPS,
 	KMSG_DUMP_PANIC,
-	KMSG_DUMP_KEXEC,
 	KMSG_DUMP_RESTART,
 	KMSG_DUMP_HALT,
 	KMSG_DUMP_POWEROFF,
diff --git a/kernel/kexec.c b/kernel/kexec.c
index 090ee10d9604..20ed47ae252f 100644
--- a/kernel/kexec.c
+++ b/kernel/kexec.c
@@ -32,7 +32,6 @@
 #include <linux/console.h>
 #include <linux/vmalloc.h>
 #include <linux/swap.h>
-#include <linux/kmsg_dump.h>
 #include <linux/syscore_ops.h>
 
 #include <asm/page.h>
@@ -1094,8 +1093,6 @@ void crash_kexec(struct pt_regs *regs)
 		if (kexec_crash_image) {
 			struct pt_regs fixed_regs;
 
-			kmsg_dump(KMSG_DUMP_KEXEC);
-
 			crash_setup_regs(&fixed_regs, regs);
 			crash_save_vmcoreinfo();
 			machine_crash_shutdown(&fixed_regs);
-- 
cgit v1.2.3-71-gd317


From 6480e5a0923756b500634d9777ec4189492fbbfe Mon Sep 17 00:00:00 2001
From: Michael Holzheu <holzheu@linux.vnet.ibm.com>
Date: Thu, 12 Jan 2012 17:20:14 -0800
Subject: kdump: add missing RAM resource in crash_shrink_memory()

When shrinking crashkernel memory using /sys/kernel/kexec_crash_size for
the newly added memory no RAM resource is created at the moment.

Example:

  $ cat /proc/iomem
  00000000-bfffffff : System RAM
    00000000-005b7ac3 : Kernel code
    005b7ac4-009743bf : Kernel data
    009bb000-00a85c33 : Kernel bss
  c0000000-cfffffff : Crash kernel
  d0000000-ffffffff : System RAM

  $ echo 0 > /sys/kernel/kexec_crash_size
  $ cat /proc/iomem
  00000000-bfffffff : System RAM
    00000000-005b7ac3 : Kernel code
    005b7ac4-009743bf : Kernel data
    009bb000-00a85c33 : Kernel bss
                                   <<-- here is System RAM missing
  d0000000-ffffffff : System RAM

One result of this bug is that the memory chunk can never be set offline
using memory hotplug.  With this patch I insert a new "System RAM"
resource for the released memory.  Then the upper example looks like the
following:

  $ echo 0 > /sys/kernel/kexec_crash_size
  $ cat /proc/iomem
  00000000-bfffffff : System RAM
    00000000-005b7ac3 : Kernel code
    005b7ac4-009743bf : Kernel data
    009bb000-00a85c33 : Kernel bss
  c0000000-cfffffff : System RAM   <<-- new rescoure
  d0000000-ffffffff : System RAM

And now I can set chunk c0000000-cfffffff offline.

Signed-off-by: Michael Holzheu <holzheu@linux.vnet.ibm.com>
Cc: Vivek Goyal <vgoyal@redhat.com>
Cc: "Eric W. Biederman" <ebiederm@xmission.com>
Cc: Heiko Carstens <heiko.carstens@de.ibm.com>
Cc: Martin Schwidefsky <schwidefsky@de.ibm.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 kernel/kexec.c | 15 +++++++++++++++
 1 file changed, 15 insertions(+)

(limited to 'kernel')

diff --git a/kernel/kexec.c b/kernel/kexec.c
index 20ed47ae252f..60bf181b3eae 100644
--- a/kernel/kexec.c
+++ b/kernel/kexec.c
@@ -1129,6 +1129,7 @@ int crash_shrink_memory(unsigned long new_size)
 {
 	int ret = 0;
 	unsigned long start, end;
+	struct resource *ram_res;
 
 	mutex_lock(&kexec_mutex);
 
@@ -1146,6 +1147,12 @@ int crash_shrink_memory(unsigned long new_size)
 		goto unlock;
 	}
 
+	ram_res = kzalloc(sizeof(*ram_res), GFP_KERNEL);
+	if (!ram_res) {
+		ret = -ENOMEM;
+		goto unlock;
+	}
+
 	start = roundup(start, KEXEC_CRASH_MEM_ALIGN);
 	end = roundup(start + new_size, KEXEC_CRASH_MEM_ALIGN);
 
@@ -1154,7 +1161,15 @@ int crash_shrink_memory(unsigned long new_size)
 
 	if ((start == end) && (crashk_res.parent != NULL))
 		release_resource(&crashk_res);
+
+	ram_res->start = end;
+	ram_res->end = crashk_res.end;
+	ram_res->flags = IORESOURCE_BUSY | IORESOURCE_MEM;
+	ram_res->name = "System RAM";
+
 	crashk_res.end = end - 1;
+
+	insert_resource(&iomem_resource, ram_res);
 	crash_unmap_reserved_pages();
 
 unlock:
-- 
cgit v1.2.3-71-gd317


From bec013c40bc89671d8d457944fdf7d2b8e79d651 Mon Sep 17 00:00:00 2001
From: Michael Holzheu <holzheu@linux.vnet.ibm.com>
Date: Thu, 12 Jan 2012 17:20:15 -0800
Subject: kdump: crashk_res init check for /sys/kernel/kexec_crash_size

Currently it is possible to set the crash_size via the sysfs
/sys/kernel/kexec_crash_size even if no crash kernel memory has been
defined with the "crashkernel" parameter.  In this case "crashk_res" is
not initialized and crashk_res.start = crashk_res.end = 0.  Unfortunately
resource_size(&crashk_res) returns 1 in this case.  This breaks the s390
implementation of crash_(un)map_reserved_pages().

To fix the problem the correct "old_size" is now calculated in
crash_shrink_memory().  "old_size is set to "0" if crashk_res is not
initialized.  With this change crash_shrink_memory() will do nothing, when
"crashk_res" is not initialized.  It will return "0" for "echo 0 >
/sys/kernel/kexec_crash_size" and -EINVAL for "echo [not zero] >
/sys/kernel/kexec_crash_size".

In addition to that this patch also simplifies the "ret = -EINVAL" vs.
"ret = 0" logic as suggested by Simon Horman.

Signed-off-by: Michael Holzheu <holzheu@linux.vnet.ibm.com>
Reviewed-by: Dave Young <dyoung@redhat.com>
Reviewed-by: WANG Cong <xiyou.wangcong@gmail.com>
Reviewed-by: Simon Horman <horms@verge.net.au>
Cc: Vivek Goyal <vgoyal@redhat.com>
Cc: "Eric W. Biederman" <ebiederm@xmission.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 kernel/kexec.c | 9 ++++-----
 1 file changed, 4 insertions(+), 5 deletions(-)

(limited to 'kernel')

diff --git a/kernel/kexec.c b/kernel/kexec.c
index 60bf181b3eae..7b0886786701 100644
--- a/kernel/kexec.c
+++ b/kernel/kexec.c
@@ -1129,6 +1129,7 @@ int crash_shrink_memory(unsigned long new_size)
 {
 	int ret = 0;
 	unsigned long start, end;
+	unsigned long old_size;
 	struct resource *ram_res;
 
 	mutex_lock(&kexec_mutex);
@@ -1139,11 +1140,9 @@ int crash_shrink_memory(unsigned long new_size)
 	}
 	start = crashk_res.start;
 	end = crashk_res.end;
-
-	if (new_size >= end - start + 1) {
-		ret = -EINVAL;
-		if (new_size == end - start + 1)
-			ret = 0;
+	old_size = (end == 0) ? 0 : end - start + 1;
+	if (new_size >= old_size) {
+		ret = (new_size == old_size) ? 0 : -EINVAL;
 		goto unlock;
 	}
 
-- 
cgit v1.2.3-71-gd317


From 93e13a360ba331915220f82f6e9543df961ffa1f Mon Sep 17 00:00:00 2001
From: Michael Holzheu <holzheu@linux.vnet.ibm.com>
Date: Thu, 12 Jan 2012 17:20:18 -0800
Subject: kdump: fix crash_kexec()/smp_send_stop() race in panic()

When two CPUs call panic at the same time there is a possible race
condition that can stop kdump.  The first CPU calls crash_kexec() and the
second CPU calls smp_send_stop() in panic() before crash_kexec() finished
on the first CPU.  So the second CPU stops the first CPU and therefore
kdump fails:

1st CPU:
  panic()->crash_kexec()->mutex_trylock(&kexec_mutex)-> do kdump

2nd CPU:
  panic()->crash_kexec()->kexec_mutex already held by 1st CPU
       ->smp_send_stop()-> stop 1st CPU (stop kdump)

This patch fixes the problem by introducing a spinlock in panic that
allows only one CPU to process crash_kexec() and the subsequent panic
code.

All other CPUs call the weak function panic_smp_self_stop() that stops the
CPU itself.  This function can be overloaded by architecture code.  For
example "tile" can use their lower-power "nap" instruction for that.

Signed-off-by: Michael Holzheu <holzheu@linux.vnet.ibm.com>
Acked-by: Chris Metcalf <cmetcalf@tilera.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 kernel/panic.c | 18 +++++++++++++++++-
 1 file changed, 17 insertions(+), 1 deletion(-)

(limited to 'kernel')

diff --git a/kernel/panic.c b/kernel/panic.c
index 6fd09ed6fd90..5dce5404eeef 100644
--- a/kernel/panic.c
+++ b/kernel/panic.c
@@ -49,6 +49,15 @@ static long no_blink(int state)
 long (*panic_blink)(int state);
 EXPORT_SYMBOL(panic_blink);
 
+/*
+ * Stop ourself in panic -- architecture code may override this
+ */
+void __weak panic_smp_self_stop(void)
+{
+	while (1)
+		cpu_relax();
+}
+
 /**
  *	panic - halt the system
  *	@fmt: The text string to print
@@ -59,6 +68,7 @@ EXPORT_SYMBOL(panic_blink);
  */
 void panic(const char *fmt, ...)
 {
+	static DEFINE_SPINLOCK(panic_lock);
 	static char buf[1024];
 	va_list args;
 	long i, i_next = 0;
@@ -68,8 +78,14 @@ void panic(const char *fmt, ...)
 	 * It's possible to come here directly from a panic-assertion and
 	 * not have preempt disabled. Some functions called from here want
 	 * preempt to be disabled. No point enabling it later though...
+	 *
+	 * Only one CPU is allowed to execute the panic code from here. For
+	 * multiple parallel invocations of panic, all other CPUs either
+	 * stop themself or will wait until they are stopped by the 1st CPU
+	 * with smp_send_stop().
 	 */
-	preempt_disable();
+	if (!spin_trylock(&panic_lock))
+		panic_smp_self_stop();
 
 	console_verbose();
 	bust_spinlocks(1);
-- 
cgit v1.2.3-71-gd317


From b8f566b04d3cddd192cfd2418ae6d54ac6353792 Mon Sep 17 00:00:00 2001
From: Pavel Emelyanov <xemul@parallels.com>
Date: Thu, 12 Jan 2012 17:20:27 -0800
Subject: sysctl: add the kernel.ns_last_pid control

The sysctl works on the current task's pid namespace, getting and setting
its last_pid field.

Writing is allowed for CAP_SYS_ADMIN-capable tasks thus making it possible
to create a task with desired pid value.  This ability is required badly
for the checkpoint/restore in userspace.

This approach suits all the parties for now.

Signed-off-by: Pavel Emelyanov <xemul@parallels.com>
Acked-by: Tejun Heo <tj@kernel.org>
Cc: Oleg Nesterov <oleg@redhat.com>
Cc: Cyrill Gorcunov <gorcunov@openvz.org>
Cc: "Eric W. Biederman" <ebiederm@xmission.com>
Cc: Serge Hallyn <serue@us.ibm.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 Documentation/sysctl/kernel.txt |  8 ++++++++
 kernel/pid.c                    |  4 +++-
 kernel/pid_namespace.c          | 31 +++++++++++++++++++++++++++++++
 3 files changed, 42 insertions(+), 1 deletion(-)

(limited to 'kernel')

diff --git a/Documentation/sysctl/kernel.txt b/Documentation/sysctl/kernel.txt
index 6d8cd8b2c30d..8c20fbd8b42d 100644
--- a/Documentation/sysctl/kernel.txt
+++ b/Documentation/sysctl/kernel.txt
@@ -415,6 +415,14 @@ PIDs of value pid_max or larger are not allocated.
 
 ==============================================================
 
+ns_last_pid:
+
+The last pid allocated in the current (the one task using this sysctl
+lives in) pid namespace. When selecting a pid for a next task on fork
+kernel tries to allocate a number starting from this one.
+
+==============================================================
+
 powersave-nap: (PPC only)
 
 If set, Linux-PPC will use the 'nap' mode of powersaving,
diff --git a/kernel/pid.c b/kernel/pid.c
index fa5f72227e5f..ce8e00deaccb 100644
--- a/kernel/pid.c
+++ b/kernel/pid.c
@@ -137,7 +137,9 @@ static int pid_before(int base, int a, int b)
 }
 
 /*
- * We might be racing with someone else trying to set pid_ns->last_pid.
+ * We might be racing with someone else trying to set pid_ns->last_pid
+ * at the pid allocation time (there's also a sysctl for this, but racing
+ * with this one is OK, see comment in kernel/pid_namespace.c about it).
  * We want the winner to have the "later" value, because if the
  * "earlier" value prevails, then a pid may get reused immediately.
  *
diff --git a/kernel/pid_namespace.c b/kernel/pid_namespace.c
index e9c9adc84ca6..a8968396046d 100644
--- a/kernel/pid_namespace.c
+++ b/kernel/pid_namespace.c
@@ -191,9 +191,40 @@ void zap_pid_ns_processes(struct pid_namespace *pid_ns)
 	return;
 }
 
+static int pid_ns_ctl_handler(struct ctl_table *table, int write,
+		void __user *buffer, size_t *lenp, loff_t *ppos)
+{
+	struct ctl_table tmp = *table;
+
+	if (write && !capable(CAP_SYS_ADMIN))
+		return -EPERM;
+
+	/*
+	 * Writing directly to ns' last_pid field is OK, since this field
+	 * is volatile in a living namespace anyway and a code writing to
+	 * it should synchronize its usage with external means.
+	 */
+
+	tmp.data = &current->nsproxy->pid_ns->last_pid;
+	return proc_dointvec(&tmp, write, buffer, lenp, ppos);
+}
+
+static struct ctl_table pid_ns_ctl_table[] = {
+	{
+		.procname = "ns_last_pid",
+		.maxlen = sizeof(int),
+		.mode = 0666, /* permissions are checked in the handler */
+		.proc_handler = pid_ns_ctl_handler,
+	},
+	{ }
+};
+
+static struct ctl_path kern_path[] = { { .procname = "kernel", }, { } };
+
 static __init int pid_namespaces_init(void)
 {
 	pid_ns_cachep = KMEM_CACHE(pid_namespace, SLAB_PANIC);
+	register_sysctl_paths(kern_path, pid_ns_ctl_table);
 	return 0;
 }
 
-- 
cgit v1.2.3-71-gd317


From 6e6f0a1f0fa6bba1493c296eb30d1e176e1f8530 Mon Sep 17 00:00:00 2001
From: Andi Kleen <ak@linux.intel.com>
Date: Thu, 12 Jan 2012 17:20:30 -0800
Subject: panic: don't print redundant backtraces on oops

When an oops causes a panic and panic prints another backtrace it's pretty
common to have the original oops data be scrolled away on a 80x50 screen.

The second backtrace is quite redundant and not needed anyways.

So don't print the panic backtrace when oops_in_progress is true.

[akpm@linux-foundation.org: add comment]
Signed-off-by: Andi Kleen <ak@linux.intel.com>
Cc: Michael Holzheu <holzheu@linux.vnet.ibm.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 kernel/panic.c | 6 +++++-
 1 file changed, 5 insertions(+), 1 deletion(-)

(limited to 'kernel')

diff --git a/kernel/panic.c b/kernel/panic.c
index 5dce5404eeef..80aed44e345a 100644
--- a/kernel/panic.c
+++ b/kernel/panic.c
@@ -94,7 +94,11 @@ void panic(const char *fmt, ...)
 	va_end(args);
 	printk(KERN_EMERG "Kernel panic - not syncing: %s\n",buf);
 #ifdef CONFIG_DEBUG_BUGVERBOSE
-	dump_stack();
+	/*
+	 * Avoid nested stack-dumping if a panic occurs during oops processing
+	 */
+	if (!oops_in_progress)
+		dump_stack();
 #endif
 
 	/*
-- 
cgit v1.2.3-71-gd317


From 028ee4be34a09a6d48bdf30ab991ae933a7bc036 Mon Sep 17 00:00:00 2001
From: Cyrill Gorcunov <gorcunov@openvz.org>
Date: Thu, 12 Jan 2012 17:20:55 -0800
Subject: c/r: prctl: add PR_SET_MM codes to set up mm_struct entries

When we restore a task we need to set up text, data and data heap sizes
from userspace to the values a task had at checkpoint time.  This patch
adds auxilary prctl codes for that.

While most of them have a statistical nature (their values are involved
into calculation of /proc/<pid>/statm output) the start_brk and brk values
are used to compute an allowed size of program data segment expansion.
Which means an arbitrary changes of this values might be dangerous
operation.  So to restrict access the following requirements applied to
prctl calls:

 - The process has to have CAP_SYS_ADMIN capability granted.
 - For all opcodes except start_brk/brk members an appropriate
   VMA area must exist and should fit certain VMA flags,
   such as:
   - code segment must be executable but not writable;
   - data segment must not be executable.

start_brk/brk values must not intersect with data segment and must not
exceed RLIMIT_DATA resource limit.

Still the main guard is CAP_SYS_ADMIN capability check.

Note the kernel should be compiled with CONFIG_CHECKPOINT_RESTORE support
otherwise these prctl calls will return -EINVAL.

[akpm@linux-foundation.org: cache current->mm in a local, saving 200 bytes text]
Signed-off-by: Cyrill Gorcunov <gorcunov@openvz.org>
Reviewed-by: Kees Cook <keescook@chromium.org>
Cc: Tejun Heo <tj@kernel.org>
Cc: Andrew Vagin <avagin@openvz.org>
Cc: Serge Hallyn <serge.hallyn@canonical.com>
Cc: Pavel Emelyanov <xemul@parallels.com>
Cc: Vasiliy Kulikov <segoon@openwall.com>
Cc: KAMEZAWA Hiroyuki <kamezawa.hiroyu@jp.fujitsu.com>
Cc: Michael Kerrisk <mtk.manpages@gmail.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 include/linux/prctl.h |  12 +++++
 kernel/sys.c          | 121 ++++++++++++++++++++++++++++++++++++++++++++++++++
 2 files changed, 133 insertions(+)

(limited to 'kernel')

diff --git a/include/linux/prctl.h b/include/linux/prctl.h
index a3baeb2c2161..7ddc7f1b480f 100644
--- a/include/linux/prctl.h
+++ b/include/linux/prctl.h
@@ -102,4 +102,16 @@
 
 #define PR_MCE_KILL_GET 34
 
+/*
+ * Tune up process memory map specifics.
+ */
+#define PR_SET_MM		35
+# define PR_SET_MM_START_CODE		1
+# define PR_SET_MM_END_CODE		2
+# define PR_SET_MM_START_DATA		3
+# define PR_SET_MM_END_DATA		4
+# define PR_SET_MM_START_STACK		5
+# define PR_SET_MM_START_BRK		6
+# define PR_SET_MM_BRK			7
+
 #endif /* _LINUX_PRCTL_H */
diff --git a/kernel/sys.c b/kernel/sys.c
index ddf8155bf3f8..40701538fbd1 100644
--- a/kernel/sys.c
+++ b/kernel/sys.c
@@ -1692,6 +1692,124 @@ SYSCALL_DEFINE1(umask, int, mask)
 	return mask;
 }
 
+#ifdef CONFIG_CHECKPOINT_RESTORE
+static int prctl_set_mm(int opt, unsigned long addr,
+			unsigned long arg4, unsigned long arg5)
+{
+	unsigned long rlim = rlimit(RLIMIT_DATA);
+	unsigned long vm_req_flags;
+	unsigned long vm_bad_flags;
+	struct vm_area_struct *vma;
+	int error = 0;
+	struct mm_struct *mm = current->mm;
+
+	if (arg4 | arg5)
+		return -EINVAL;
+
+	if (!capable(CAP_SYS_ADMIN))
+		return -EPERM;
+
+	if (addr >= TASK_SIZE)
+		return -EINVAL;
+
+	down_read(&mm->mmap_sem);
+	vma = find_vma(mm, addr);
+
+	if (opt != PR_SET_MM_START_BRK && opt != PR_SET_MM_BRK) {
+		/* It must be existing VMA */
+		if (!vma || vma->vm_start > addr)
+			goto out;
+	}
+
+	error = -EINVAL;
+	switch (opt) {
+	case PR_SET_MM_START_CODE:
+	case PR_SET_MM_END_CODE:
+		vm_req_flags = VM_READ | VM_EXEC;
+		vm_bad_flags = VM_WRITE | VM_MAYSHARE;
+
+		if ((vma->vm_flags & vm_req_flags) != vm_req_flags ||
+		    (vma->vm_flags & vm_bad_flags))
+			goto out;
+
+		if (opt == PR_SET_MM_START_CODE)
+			mm->start_code = addr;
+		else
+			mm->end_code = addr;
+		break;
+
+	case PR_SET_MM_START_DATA:
+	case PR_SET_MM_END_DATA:
+		vm_req_flags = VM_READ | VM_WRITE;
+		vm_bad_flags = VM_EXEC | VM_MAYSHARE;
+
+		if ((vma->vm_flags & vm_req_flags) != vm_req_flags ||
+		    (vma->vm_flags & vm_bad_flags))
+			goto out;
+
+		if (opt == PR_SET_MM_START_DATA)
+			mm->start_data = addr;
+		else
+			mm->end_data = addr;
+		break;
+
+	case PR_SET_MM_START_STACK:
+
+#ifdef CONFIG_STACK_GROWSUP
+		vm_req_flags = VM_READ | VM_WRITE | VM_GROWSUP;
+#else
+		vm_req_flags = VM_READ | VM_WRITE | VM_GROWSDOWN;
+#endif
+		if ((vma->vm_flags & vm_req_flags) != vm_req_flags)
+			goto out;
+
+		mm->start_stack = addr;
+		break;
+
+	case PR_SET_MM_START_BRK:
+		if (addr <= mm->end_data)
+			goto out;
+
+		if (rlim < RLIM_INFINITY &&
+		    (mm->brk - addr) +
+		    (mm->end_data - mm->start_data) > rlim)
+			goto out;
+
+		mm->start_brk = addr;
+		break;
+
+	case PR_SET_MM_BRK:
+		if (addr <= mm->end_data)
+			goto out;
+
+		if (rlim < RLIM_INFINITY &&
+		    (addr - mm->start_brk) +
+		    (mm->end_data - mm->start_data) > rlim)
+			goto out;
+
+		mm->brk = addr;
+		break;
+
+	default:
+		error = -EINVAL;
+		goto out;
+	}
+
+	error = 0;
+
+out:
+	up_read(&mm->mmap_sem);
+
+	return error;
+}
+#else /* CONFIG_CHECKPOINT_RESTORE */
+static int prctl_set_mm(int opt, unsigned long addr,
+			unsigned long arg4, unsigned long arg5)
+{
+	return -EINVAL;
+}
+#endif
+
 SYSCALL_DEFINE5(prctl, int, option, unsigned long, arg2, unsigned long, arg3,
 		unsigned long, arg4, unsigned long, arg5)
 {
@@ -1841,6 +1959,9 @@ SYSCALL_DEFINE5(prctl, int, option, unsigned long, arg2, unsigned long, arg3,
 			else
 				error = PR_MCE_KILL_DEFAULT;
 			break;
+		case PR_SET_MM:
+			error = prctl_set_mm(arg2, arg3, arg4, arg5);
+			break;
 		default:
 			error = -EINVAL;
 			break;
-- 
cgit v1.2.3-71-gd317


From dae5cbc2440b1d21a15715d0f1fb20f632dd38ee Mon Sep 17 00:00:00 2001
From: "Rafael J. Wysocki" <rjw@sisk.pl>
Date: Sat, 14 Jan 2012 00:33:03 +0100
Subject: PM: Make sysrq-o be available for CONFIG_PM unset

After commit 1eb208aea3179dd2fc0cdeea45ef869d75b4fe70, "PM: Make
CONFIG_PM depend on (CONFIG_PM_SLEEP || CONFIG_PM_RUNTIME)", the
files under kernel/power are not built unless CONFIG_PM_SLEEP or
CONFIG_PM_RUNTIME is set.  In particular, this causes
kernel/power/poweroff.c to be omitted, even though it should be
compiled, because CONFIG_MAGIC_SYSRQ is set.

Fix the problem by causing kernel/power/Makefile to be processed
for CONFIG_PM unset too.

Reported-and-tested-by: Phil Oester <kernel@linuxace.com>
Signed-off-by: Rafael J. Wysocki <rjw@sisk.pl>
---
 kernel/Makefile | 3 +--
 1 file changed, 1 insertion(+), 2 deletions(-)

(limited to 'kernel')

diff --git a/kernel/Makefile b/kernel/Makefile
index f70396e5a24b..2d9de86b7e76 100644
--- a/kernel/Makefile
+++ b/kernel/Makefile
@@ -23,6 +23,7 @@ CFLAGS_REMOVE_irq_work.o = -pg
 endif
 
 obj-y += sched/
+obj-y += power/
 
 obj-$(CONFIG_FREEZER) += freezer.o
 obj-$(CONFIG_PROFILING) += profile.o
@@ -52,8 +53,6 @@ obj-$(CONFIG_PROVE_LOCKING) += spinlock.o
 obj-$(CONFIG_UID16) += uid16.o
 obj-$(CONFIG_MODULES) += module.o
 obj-$(CONFIG_KALLSYMS) += kallsyms.o
-obj-$(CONFIG_PM) += power/
-obj-$(CONFIG_FREEZER) += power/
 obj-$(CONFIG_BSD_PROCESS_ACCT) += acct.o
 obj-$(CONFIG_KEXEC) += kexec.o
 obj-$(CONFIG_BACKTRACE_SELF_TEST) += backtracetest.o
-- 
cgit v1.2.3-71-gd317


From ee34a37049114303011e154478c63b977bcff24c Mon Sep 17 00:00:00 2001
From: Barry Song <Baohua.Song@csr.com>
Date: Mon, 9 Jan 2012 12:56:23 +0800
Subject: PM / Hibernate: Drop the check of swap space size for compressed
 image

For compressed image, the space required is not known until
we finish compressing and writing all pages.
This patch drops the check, and if swap space is not enough
finally, system can still restore to normal after writing
swap fails for compressed images.

Signed-off-by: Barry Song <Baohua.Song@csr.com>
Acked-by: Pavel Machek <pavel@ucw.cz>
Signed-off-by: Rafael J. Wysocki <rjw@sisk.pl>
---
 kernel/power/swap.c | 13 +++++++------
 1 file changed, 7 insertions(+), 6 deletions(-)

(limited to 'kernel')

diff --git a/kernel/power/swap.c b/kernel/power/swap.c
index 3739ecced085..8742fd013a94 100644
--- a/kernel/power/swap.c
+++ b/kernel/power/swap.c
@@ -773,8 +773,7 @@ static int enough_swap(unsigned int nr_pages, unsigned int flags)
 
 	pr_debug("PM: Free swap pages: %u\n", free_swap);
 
-	required = PAGES_FOR_IO + ((flags & SF_NOCOMPRESS_MODE) ?
-		nr_pages : (nr_pages * LZO_CMP_PAGES) / LZO_UNC_PAGES + 1);
+	required = PAGES_FOR_IO + nr_pages;
 	return free_swap > required;
 }
 
@@ -802,10 +801,12 @@ int swsusp_write(unsigned int flags)
 		printk(KERN_ERR "PM: Cannot get swap writer\n");
 		return error;
 	}
-	if (!enough_swap(pages, flags)) {
-		printk(KERN_ERR "PM: Not enough free swap\n");
-		error = -ENOSPC;
-		goto out_finish;
+	if (flags & SF_NOCOMPRESS_MODE) {
+		if (!enough_swap(pages, flags)) {
+			printk(KERN_ERR "PM: Not enough free swap\n");
+			error = -ENOSPC;
+			goto out_finish;
+		}
 	}
 	memset(&snapshot, 0, sizeof(struct snapshot_handle));
 	error = snapshot_read_next(&snapshot);
-- 
cgit v1.2.3-71-gd317


From 53999bf34d55981328f8ba9def558d3e104d6e36 Mon Sep 17 00:00:00 2001
From: Kevin Winchester <kjwinchester@gmail.com>
Date: Sun, 15 Jan 2012 19:32:55 -0400
Subject: error: implicit declaration of function 'module_flags_taint'
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

Recent changes to kernel/module.c caused the following compile
error:

  kernel/module.c: In function ‘show_taint’:
  kernel/module.c:1024:2: error: implicit declaration of function ‘module_flags_taint’ [-Werror=implicit-function-declaration]
  cc1: some warnings being treated as errors

Correct this error by moving the definition of module_flags_taint
outside of the #ifdef CONFIG_MODULE_UNLOAD section.

Signed-off-by: Kevin Winchester <kjwinchester@gmail.com>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 kernel/module.c | 40 ++++++++++++++++++++--------------------
 1 file changed, 20 insertions(+), 20 deletions(-)

(limited to 'kernel')

diff --git a/kernel/module.c b/kernel/module.c
index acf6ed3ebe81..2c932760fd33 100644
--- a/kernel/module.c
+++ b/kernel/module.c
@@ -842,26 +842,6 @@ out:
 	return ret;
 }
 
-static size_t module_flags_taint(struct module *mod, char *buf)
-{
-	size_t l = 0;
-
-	if (mod->taints & (1 << TAINT_PROPRIETARY_MODULE))
-		buf[l++] = 'P';
-	if (mod->taints & (1 << TAINT_OOT_MODULE))
-		buf[l++] = 'O';
-	if (mod->taints & (1 << TAINT_FORCED_MODULE))
-		buf[l++] = 'F';
-	if (mod->taints & (1 << TAINT_CRAP))
-		buf[l++] = 'C';
-	/*
-	 * TAINT_FORCED_RMMOD: could be added.
-	 * TAINT_UNSAFE_SMP, TAINT_MACHINE_CHECK, TAINT_BAD_PAGE don't
-	 * apply to modules.
-	 */
-	return l;
-}
-
 static inline void print_unload_info(struct seq_file *m, struct module *mod)
 {
 	struct module_use *use;
@@ -962,6 +942,26 @@ static inline int module_unload_init(struct module *mod)
 }
 #endif /* CONFIG_MODULE_UNLOAD */
 
+static size_t module_flags_taint(struct module *mod, char *buf)
+{
+	size_t l = 0;
+
+	if (mod->taints & (1 << TAINT_PROPRIETARY_MODULE))
+		buf[l++] = 'P';
+	if (mod->taints & (1 << TAINT_OOT_MODULE))
+		buf[l++] = 'O';
+	if (mod->taints & (1 << TAINT_FORCED_MODULE))
+		buf[l++] = 'F';
+	if (mod->taints & (1 << TAINT_CRAP))
+		buf[l++] = 'C';
+	/*
+	 * TAINT_FORCED_RMMOD: could be added.
+	 * TAINT_UNSAFE_SMP, TAINT_MACHINE_CHECK, TAINT_BAD_PAGE don't
+	 * apply to modules.
+	 */
+	return l;
+}
+
 static ssize_t show_initstate(struct module_attribute *mattr,
 			      struct module_kobject *mk, char *buffer)
 {
-- 
cgit v1.2.3-71-gd317


From 951880e634a79884236a575b896abf55c39ae0bf Mon Sep 17 00:00:00 2001
From: Linus Torvalds <torvalds@linux-foundation.org>
Date: Tue, 17 Jan 2012 10:19:41 -0800
Subject: Revert "capabitlies: ns_capable can use the cap helpers rather than
 lsm call"

This reverts commit d2a7009f0bb03fa22ad08dd25472efa0568126b9.

J. R. Okajima explains:

 "After this commit, I am afraid access(2) on NFS may not work
  correctly.  The scenario based upon my guess.
   - access(2) overrides the credentials.
   - calls inode_permission() -- ... -- generic_permission() --
      ns_capable().
   - while the old ns_capable() calls security_capable(current_cred()),
     the new ns_capable() calls has_ns_capability(current) --
     security_capable(__task_cred(t)).

  current_cred() returns current->cred which is effective (overridden)
  credentials, but __task_cred(current) returns current->real_cred (the
  NFSD's credential).  And the overridden credentials by access(2) lost."

Requested-by: J. R. Okajima <hooanon05@yahoo.co.jp>
Acked-by: Eric Paris <eparis@redhat.com>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 kernel/capability.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

(limited to 'kernel')

diff --git a/kernel/capability.c b/kernel/capability.c
index 0fcf1c14a297..3f1adb6c6470 100644
--- a/kernel/capability.c
+++ b/kernel/capability.c
@@ -384,7 +384,7 @@ bool ns_capable(struct user_namespace *ns, int cap)
 		BUG();
 	}
 
-	if (has_ns_capability(current, ns, cap)) {
+	if (security_capable(current_cred(), ns, cap) == 0) {
 		current->flags |= PF_SUPERPRIV;
 		return true;
 	}
-- 
cgit v1.2.3-71-gd317


From 5ef30ee53b187786e64bdc1f8109e39d17f2ce58 Mon Sep 17 00:00:00 2001
From: Eric Paris <eparis@redhat.com>
Date: Tue, 3 Jan 2012 14:23:05 -0500
Subject: audit: make filetype matching consistent with other filters

Every other filter that matches part of the inodes list collected by audit
will match against any of the inodes on that list.  The filetype matching
however had a strange way of doing things.  It allowed userspace to
indicated if it should match on the first of the second name collected by
the kernel.  Name collection ordering seems like a kernel internal and
making userspace rules get that right just seems like a bad idea.  As it
turns out the userspace audit writers had no idea it was doing this and
thus never overloaded the value field.  The kernel always checked the first
name collected which for the tested rules was always correct.

This patch just makes the filetype matching like the major, minor, inode,
and LSM rules in that it will match against any of the names collected.  It
also changes the rule validation to reject the old unused rule types.

Noone knew it was there.  Noone used it.  Why keep around the extra code?

Signed-off-by: Eric Paris <eparis@redhat.com>
---
 kernel/auditfilter.c |  4 ++--
 kernel/auditsc.c     | 19 +++++++++----------
 2 files changed, 11 insertions(+), 12 deletions(-)

(limited to 'kernel')

diff --git a/kernel/auditfilter.c b/kernel/auditfilter.c
index f8277c80d678..d94dde82c3c8 100644
--- a/kernel/auditfilter.c
+++ b/kernel/auditfilter.c
@@ -385,7 +385,7 @@ static struct audit_entry *audit_rule_to_entry(struct audit_rule *rule)
 				goto exit_free;
 			break;
 		case AUDIT_FILETYPE:
-			if ((f->val & ~S_IFMT) > S_IFMT)
+			if (f->val & ~S_IFMT)
 				goto exit_free;
 			break;
 		case AUDIT_INODE:
@@ -536,7 +536,7 @@ static struct audit_entry *audit_data_to_entry(struct audit_rule_data *data,
 				goto exit_free;
 			break;
 		case AUDIT_FILETYPE:
-			if ((f->val & ~S_IFMT) > S_IFMT)
+			if (f->val & ~S_IFMT)
 				goto exit_free;
 			break;
 		default:
diff --git a/kernel/auditsc.c b/kernel/auditsc.c
index e7fe2b0d29b3..a09c50317059 100644
--- a/kernel/auditsc.c
+++ b/kernel/auditsc.c
@@ -305,21 +305,20 @@ static int audit_match_perm(struct audit_context *ctx, int mask)
 	}
 }
 
-static int audit_match_filetype(struct audit_context *ctx, int which)
+static int audit_match_filetype(struct audit_context *ctx, int val)
 {
-	unsigned index = which & ~S_IFMT;
-	umode_t mode = which & S_IFMT;
+	int index;
+	umode_t mode = (umode_t)val;
 
 	if (unlikely(!ctx))
 		return 0;
 
-	if (index >= ctx->name_count)
-		return 0;
-	if (ctx->names[index].ino == -1)
-		return 0;
-	if ((ctx->names[index].mode ^ mode) & S_IFMT)
-		return 0;
-	return 1;
+	for (index = 0; index < ctx->name_count; index++) {
+		if ((ctx->names[index].ino != -1) &&
+		    ((ctx->names[index].mode & S_IFMT) == mode))
+			return 1;
+	}
+	return 0;
 }
 
 /*
-- 
cgit v1.2.3-71-gd317


From 5195d8e217a78697152d64fc09a16e063a022465 Mon Sep 17 00:00:00 2001
From: Eric Paris <eparis@redhat.com>
Date: Tue, 3 Jan 2012 14:23:05 -0500
Subject: audit: dynamically allocate audit_names when not enough space is in
 the names array

This patch does 2 things.  First it reduces the number of audit_names
allocated in every audit context from 20 to 5.  5 should be enough for all
'normal' syscalls (rename being the worst).  Some syscalls can still touch
more the 5 inodes such as mount.  When rpc filesystem is mounted it will
create inodes and those can exceed 5.  To handle that problem this patch will
dynamically allocate audit_names if it needs more than 5.  This should
decrease the typicall memory usage while still supporting all the possible
kernel operations.

Signed-off-by: Eric Paris <eparis@redhat.com>
---
 kernel/auditsc.c | 403 +++++++++++++++++++++++++++++--------------------------
 1 file changed, 215 insertions(+), 188 deletions(-)

(limited to 'kernel')

diff --git a/kernel/auditsc.c b/kernel/auditsc.c
index a09c50317059..1a92d61ddd27 100644
--- a/kernel/auditsc.c
+++ b/kernel/auditsc.c
@@ -71,8 +71,9 @@
 #include "audit.h"
 
 /* AUDIT_NAMES is the number of slots we reserve in the audit_context
- * for saving names from getname(). */
-#define AUDIT_NAMES    20
+ * for saving names from getname().  If we get more names we will allocate
+ * a name dynamically and also add those to the list anchored by names_list. */
+#define AUDIT_NAMES	5
 
 /* Indicates that audit should log the full pathname. */
 #define AUDIT_NAME_FULL -1
@@ -101,9 +102,8 @@ struct audit_cap_data {
  *
  * Further, in fs/namei.c:path_lookup() we store the inode and device. */
 struct audit_names {
+	struct list_head list;		/* audit_context->names_list */
 	const char	*name;
-	int		name_len;	/* number of name's characters to log */
-	unsigned	name_put;	/* call __putname() for this name */
 	unsigned long	ino;
 	dev_t		dev;
 	umode_t		mode;
@@ -113,6 +113,14 @@ struct audit_names {
 	u32		osid;
 	struct audit_cap_data fcap;
 	unsigned int	fcap_ver;
+	int		name_len;	/* number of name's characters to log */
+	bool		name_put;	/* call __putname() for this name */
+	/*
+	 * This was an allocated audit_names and not from the array of
+	 * names allocated in the task audit context.  Thus this name
+	 * should be freed on syscall exit
+	 */
+	bool		should_free;
 };
 
 struct audit_aux_data {
@@ -174,8 +182,17 @@ struct audit_context {
 	long		    return_code;/* syscall return code */
 	u64		    prio;
 	int		    return_valid; /* return code is valid */
-	int		    name_count;
-	struct audit_names  names[AUDIT_NAMES];
+	/*
+	 * The names_list is the list of all audit_names collected during this
+	 * syscall.  The first AUDIT_NAMES entries in the names_list will
+	 * actually be from the preallocated_names array for performance
+	 * reasons.  Except during allocation they should never be referenced
+	 * through the preallocated_names array and should only be found/used
+	 * by running the names_list.
+	 */
+	struct audit_names  preallocated_names[AUDIT_NAMES];
+	int		    name_count; /* total records in names_list */
+	struct list_head    names_list;	/* anchor for struct audit_names->list */
 	char *		    filterkey;	/* key for rule that triggered record */
 	struct path	    pwd;
 	struct audit_context *previous; /* For nested syscalls */
@@ -307,17 +324,18 @@ static int audit_match_perm(struct audit_context *ctx, int mask)
 
 static int audit_match_filetype(struct audit_context *ctx, int val)
 {
-	int index;
+	struct audit_names *n;
 	umode_t mode = (umode_t)val;
 
 	if (unlikely(!ctx))
 		return 0;
 
-	for (index = 0; index < ctx->name_count; index++) {
-		if ((ctx->names[index].ino != -1) &&
-		    ((ctx->names[index].mode & S_IFMT) == mode))
+	list_for_each_entry(n, &ctx->names_list, list) {
+		if ((n->ino != -1) &&
+		    ((n->mode & S_IFMT) == mode))
 			return 1;
 	}
+
 	return 0;
 }
 
@@ -456,13 +474,14 @@ static int audit_filter_rules(struct task_struct *tsk,
 			      bool task_creation)
 {
 	const struct cred *cred;
-	int i, j, need_sid = 1;
+	int i, need_sid = 1;
 	u32 sid;
 
 	cred = rcu_dereference_check(tsk->cred, tsk == current || task_creation);
 
 	for (i = 0; i < rule->field_count; i++) {
 		struct audit_field *f = &rule->fields[i];
+		struct audit_names *n;
 		int result = 0;
 
 		switch (f->type) {
@@ -525,8 +544,8 @@ static int audit_filter_rules(struct task_struct *tsk,
 				result = audit_comparator(MAJOR(name->dev),
 							  f->op, f->val);
 			else if (ctx) {
-				for (j = 0; j < ctx->name_count; j++) {
-					if (audit_comparator(MAJOR(ctx->names[j].dev),	f->op, f->val)) {
+				list_for_each_entry(n, &ctx->names_list, list) {
+					if (audit_comparator(MAJOR(n->dev), f->op, f->val)) {
 						++result;
 						break;
 					}
@@ -538,8 +557,8 @@ static int audit_filter_rules(struct task_struct *tsk,
 				result = audit_comparator(MINOR(name->dev),
 							  f->op, f->val);
 			else if (ctx) {
-				for (j = 0; j < ctx->name_count; j++) {
-					if (audit_comparator(MINOR(ctx->names[j].dev), f->op, f->val)) {
+				list_for_each_entry(n, &ctx->names_list, list) {
+					if (audit_comparator(MINOR(n->dev), f->op, f->val)) {
 						++result;
 						break;
 					}
@@ -550,8 +569,8 @@ static int audit_filter_rules(struct task_struct *tsk,
 			if (name)
 				result = (name->ino == f->val);
 			else if (ctx) {
-				for (j = 0; j < ctx->name_count; j++) {
-					if (audit_comparator(ctx->names[j].ino, f->op, f->val)) {
+				list_for_each_entry(n, &ctx->names_list, list) {
+					if (audit_comparator(n->ino, f->op, f->val)) {
 						++result;
 						break;
 					}
@@ -606,11 +625,10 @@ static int audit_filter_rules(struct task_struct *tsk,
 					           name->osid, f->type, f->op,
 					           f->lsm_rule, ctx);
 				} else if (ctx) {
-					for (j = 0; j < ctx->name_count; j++) {
-						if (security_audit_rule_match(
-						      ctx->names[j].osid,
-						      f->type, f->op,
-						      f->lsm_rule, ctx)) {
+					list_for_each_entry(n, &ctx->names_list, list) {
+						if (security_audit_rule_match(n->osid, f->type,
+									      f->op, f->lsm_rule,
+									      ctx)) {
 							++result;
 							break;
 						}
@@ -721,40 +739,53 @@ static enum audit_state audit_filter_syscall(struct task_struct *tsk,
 	return AUDIT_BUILD_CONTEXT;
 }
 
-/* At syscall exit time, this filter is called if any audit_names[] have been
+/*
+ * Given an audit_name check the inode hash table to see if they match.
+ * Called holding the rcu read lock to protect the use of audit_inode_hash
+ */
+static int audit_filter_inode_name(struct task_struct *tsk,
+				   struct audit_names *n,
+				   struct audit_context *ctx) {
+	int word, bit;
+	int h = audit_hash_ino((u32)n->ino);
+	struct list_head *list = &audit_inode_hash[h];
+	struct audit_entry *e;
+	enum audit_state state;
+
+	word = AUDIT_WORD(ctx->major);
+	bit  = AUDIT_BIT(ctx->major);
+
+	if (list_empty(list))
+		return 0;
+
+	list_for_each_entry_rcu(e, list, list) {
+		if ((e->rule.mask[word] & bit) == bit &&
+		    audit_filter_rules(tsk, &e->rule, ctx, n, &state, false)) {
+			ctx->current_state = state;
+			return 1;
+		}
+	}
+
+	return 0;
+}
+
+/* At syscall exit time, this filter is called if any audit_names have been
  * collected during syscall processing.  We only check rules in sublists at hash
- * buckets applicable to the inode numbers in audit_names[].
+ * buckets applicable to the inode numbers in audit_names.
  * Regarding audit_state, same rules apply as for audit_filter_syscall().
  */
 void audit_filter_inodes(struct task_struct *tsk, struct audit_context *ctx)
 {
-	int i;
-	struct audit_entry *e;
-	enum audit_state state;
+	struct audit_names *n;
 
 	if (audit_pid && tsk->tgid == audit_pid)
 		return;
 
 	rcu_read_lock();
-	for (i = 0; i < ctx->name_count; i++) {
-		int word = AUDIT_WORD(ctx->major);
-		int bit  = AUDIT_BIT(ctx->major);
-		struct audit_names *n = &ctx->names[i];
-		int h = audit_hash_ino((u32)n->ino);
-		struct list_head *list = &audit_inode_hash[h];
 
-		if (list_empty(list))
-			continue;
-
-		list_for_each_entry_rcu(e, list, list) {
-			if ((e->rule.mask[word] & bit) == bit &&
-			    audit_filter_rules(tsk, &e->rule, ctx, n,
-				    	       &state, false)) {
-				rcu_read_unlock();
-				ctx->current_state = state;
-				return;
-			}
-		}
+	list_for_each_entry(n, &ctx->names_list, list) {
+		if (audit_filter_inode_name(tsk, n, ctx))
+			break;
 	}
 	rcu_read_unlock();
 }
@@ -798,7 +829,7 @@ static inline struct audit_context *audit_get_context(struct task_struct *tsk,
 
 static inline void audit_free_names(struct audit_context *context)
 {
-	int i;
+	struct audit_names *n, *next;
 
 #if AUDIT_DEBUG == 2
 	if (context->put_count + context->ino_count != context->name_count) {
@@ -809,10 +840,9 @@ static inline void audit_free_names(struct audit_context *context)
 		       context->serial, context->major, context->in_syscall,
 		       context->name_count, context->put_count,
 		       context->ino_count);
-		for (i = 0; i < context->name_count; i++) {
+		list_for_each_entry(n, &context->names_list, list) {
 			printk(KERN_ERR "names[%d] = %p = %s\n", i,
-			       context->names[i].name,
-			       context->names[i].name ?: "(null)");
+			       n->name, n->name ?: "(null)");
 		}
 		dump_stack();
 		return;
@@ -823,9 +853,12 @@ static inline void audit_free_names(struct audit_context *context)
 	context->ino_count  = 0;
 #endif
 
-	for (i = 0; i < context->name_count; i++) {
-		if (context->names[i].name && context->names[i].name_put)
-			__putname(context->names[i].name);
+	list_for_each_entry_safe(n, next, &context->names_list, list) {
+		list_del(&n->list);
+		if (n->name && n->name_put)
+			__putname(n->name);
+		if (n->should_free)
+			kfree(n);
 	}
 	context->name_count = 0;
 	path_put(&context->pwd);
@@ -863,6 +896,7 @@ static inline struct audit_context *audit_alloc_context(enum audit_state state)
 		return NULL;
 	audit_zero_context(context, state);
 	INIT_LIST_HEAD(&context->killed_trees);
+	INIT_LIST_HEAD(&context->names_list);
 	return context;
 }
 
@@ -1323,6 +1357,68 @@ static void show_special(struct audit_context *context, int *call_panic)
 	audit_log_end(ab);
 }
 
+static void audit_log_name(struct audit_context *context, struct audit_names *n,
+			   int record_num, int *call_panic)
+{
+	struct audit_buffer *ab;
+	ab = audit_log_start(context, GFP_KERNEL, AUDIT_PATH);
+	if (!ab)
+		return; /* audit_panic has been called */
+
+	audit_log_format(ab, "item=%d", record_num);
+
+	if (n->name) {
+		switch (n->name_len) {
+		case AUDIT_NAME_FULL:
+			/* log the full path */
+			audit_log_format(ab, " name=");
+			audit_log_untrustedstring(ab, n->name);
+			break;
+		case 0:
+			/* name was specified as a relative path and the
+			 * directory component is the cwd */
+			audit_log_d_path(ab, "name=", &context->pwd);
+			break;
+		default:
+			/* log the name's directory component */
+			audit_log_format(ab, " name=");
+			audit_log_n_untrustedstring(ab, n->name,
+						    n->name_len);
+		}
+	} else
+		audit_log_format(ab, " name=(null)");
+
+	if (n->ino != (unsigned long)-1) {
+		audit_log_format(ab, " inode=%lu"
+				 " dev=%02x:%02x mode=%#ho"
+				 " ouid=%u ogid=%u rdev=%02x:%02x",
+				 n->ino,
+				 MAJOR(n->dev),
+				 MINOR(n->dev),
+				 n->mode,
+				 n->uid,
+				 n->gid,
+				 MAJOR(n->rdev),
+				 MINOR(n->rdev));
+	}
+	if (n->osid != 0) {
+		char *ctx = NULL;
+		u32 len;
+		if (security_secid_to_secctx(
+			n->osid, &ctx, &len)) {
+			audit_log_format(ab, " osid=%u", n->osid);
+			*call_panic = 2;
+		} else {
+			audit_log_format(ab, " obj=%s", ctx);
+			security_release_secctx(ctx, len);
+		}
+	}
+
+	audit_log_fcaps(ab, n);
+
+	audit_log_end(ab);
+}
+
 static void audit_log_exit(struct audit_context *context, struct task_struct *tsk)
 {
 	const struct cred *cred;
@@ -1330,6 +1426,7 @@ static void audit_log_exit(struct audit_context *context, struct task_struct *ts
 	struct audit_buffer *ab;
 	struct audit_aux_data *aux;
 	const char *tty;
+	struct audit_names *n;
 
 	/* tsk == current */
 	context->pid = tsk->pid;
@@ -1469,66 +1566,10 @@ static void audit_log_exit(struct audit_context *context, struct task_struct *ts
 			audit_log_end(ab);
 		}
 	}
-	for (i = 0; i < context->name_count; i++) {
-		struct audit_names *n = &context->names[i];
-
-		ab = audit_log_start(context, GFP_KERNEL, AUDIT_PATH);
-		if (!ab)
-			continue; /* audit_panic has been called */
-
-		audit_log_format(ab, "item=%d", i);
-
-		if (n->name) {
-			switch(n->name_len) {
-			case AUDIT_NAME_FULL:
-				/* log the full path */
-				audit_log_format(ab, " name=");
-				audit_log_untrustedstring(ab, n->name);
-				break;
-			case 0:
-				/* name was specified as a relative path and the
-				 * directory component is the cwd */
-				audit_log_d_path(ab, "name=", &context->pwd);
-				break;
-			default:
-				/* log the name's directory component */
-				audit_log_format(ab, " name=");
-				audit_log_n_untrustedstring(ab, n->name,
-							    n->name_len);
-			}
-		} else
-			audit_log_format(ab, " name=(null)");
-
-		if (n->ino != (unsigned long)-1) {
-			audit_log_format(ab, " inode=%lu"
-					 " dev=%02x:%02x mode=%#ho"
-					 " ouid=%u ogid=%u rdev=%02x:%02x",
-					 n->ino,
-					 MAJOR(n->dev),
-					 MINOR(n->dev),
-					 n->mode,
-					 n->uid,
-					 n->gid,
-					 MAJOR(n->rdev),
-					 MINOR(n->rdev));
-		}
-		if (n->osid != 0) {
-			char *ctx = NULL;
-			u32 len;
-			if (security_secid_to_secctx(
-				n->osid, &ctx, &len)) {
-				audit_log_format(ab, " osid=%u", n->osid);
-				call_panic = 2;
-			} else {
-				audit_log_format(ab, " obj=%s", ctx);
-				security_release_secctx(ctx, len);
-			}
-		}
-
-		audit_log_fcaps(ab, n);
 
-		audit_log_end(ab);
-	}
+	i = 0;
+	list_for_each_entry(n, &context->names_list, list)
+		audit_log_name(context, n, i++, &call_panic);
 
 	/* Send end of event record to help user space know we are finished */
 	ab = audit_log_start(context, GFP_KERNEL, AUDIT_EOE);
@@ -1820,6 +1861,30 @@ retry:
 #endif
 }
 
+static struct audit_names *audit_alloc_name(struct audit_context *context)
+{
+	struct audit_names *aname;
+
+	if (context->name_count < AUDIT_NAMES) {
+		aname = &context->preallocated_names[context->name_count];
+		memset(aname, 0, sizeof(*aname));
+	} else {
+		aname = kzalloc(sizeof(*aname), GFP_NOFS);
+		if (!aname)
+			return NULL;
+		aname->should_free = true;
+	}
+
+	aname->ino = (unsigned long)-1;
+	list_add_tail(&aname->list, &context->names_list);
+
+	context->name_count++;
+#if AUDIT_DEBUG
+	context->ino_count++;
+#endif
+	return aname;
+}
+
 /**
  * audit_getname - add a name to the list
  * @name: name to add
@@ -1830,6 +1895,7 @@ retry:
 void __audit_getname(const char *name)
 {
 	struct audit_context *context = current->audit_context;
+	struct audit_names *n;
 
 	if (IS_ERR(name) || !name)
 		return;
@@ -1842,13 +1908,15 @@ void __audit_getname(const char *name)
 #endif
 		return;
 	}
-	BUG_ON(context->name_count >= AUDIT_NAMES);
-	context->names[context->name_count].name = name;
-	context->names[context->name_count].name_len = AUDIT_NAME_FULL;
-	context->names[context->name_count].name_put = 1;
-	context->names[context->name_count].ino  = (unsigned long)-1;
-	context->names[context->name_count].osid = 0;
-	++context->name_count;
+
+	n = audit_alloc_name(context);
+	if (!n)
+		return;
+
+	n->name = name;
+	n->name_len = AUDIT_NAME_FULL;
+	n->name_put = true;
+
 	if (!context->pwd.dentry)
 		get_fs_pwd(current->fs, &context->pwd);
 }
@@ -1870,12 +1938,13 @@ void audit_putname(const char *name)
 		printk(KERN_ERR "%s:%d(:%d): __putname(%p)\n",
 		       __FILE__, __LINE__, context->serial, name);
 		if (context->name_count) {
+			struct audit_names *n;
 			int i;
-			for (i = 0; i < context->name_count; i++)
+
+			list_for_each_entry(n, &context->names_list, list)
 				printk(KERN_ERR "name[%d] = %p = %s\n", i,
-				       context->names[i].name,
-				       context->names[i].name ?: "(null)");
-		}
+				       n->name, n->name ?: "(null)");
+			}
 #endif
 		__putname(name);
 	}
@@ -1896,39 +1965,11 @@ void audit_putname(const char *name)
 #endif
 }
 
-static int audit_inc_name_count(struct audit_context *context,
-				const struct inode *inode)
-{
-	if (context->name_count >= AUDIT_NAMES) {
-		if (inode)
-			printk(KERN_DEBUG "audit: name_count maxed, losing inode data: "
-			       "dev=%02x:%02x, inode=%lu\n",
-			       MAJOR(inode->i_sb->s_dev),
-			       MINOR(inode->i_sb->s_dev),
-			       inode->i_ino);
-
-		else
-			printk(KERN_DEBUG "name_count maxed, losing inode data\n");
-		return 1;
-	}
-	context->name_count++;
-#if AUDIT_DEBUG
-	context->ino_count++;
-#endif
-	return 0;
-}
-
-
 static inline int audit_copy_fcaps(struct audit_names *name, const struct dentry *dentry)
 {
 	struct cpu_vfs_cap_data caps;
 	int rc;
 
-	memset(&name->fcap.permitted, 0, sizeof(kernel_cap_t));
-	memset(&name->fcap.inheritable, 0, sizeof(kernel_cap_t));
-	name->fcap.fE = 0;
-	name->fcap_ver = 0;
-
 	if (!dentry)
 		return 0;
 
@@ -1968,30 +2009,25 @@ static void audit_copy_inode(struct audit_names *name, const struct dentry *dent
  */
 void __audit_inode(const char *name, const struct dentry *dentry)
 {
-	int idx;
 	struct audit_context *context = current->audit_context;
 	const struct inode *inode = dentry->d_inode;
+	struct audit_names *n;
 
 	if (!context->in_syscall)
 		return;
-	if (context->name_count
-	    && context->names[context->name_count-1].name
-	    && context->names[context->name_count-1].name == name)
-		idx = context->name_count - 1;
-	else if (context->name_count > 1
-		 && context->names[context->name_count-2].name
-		 && context->names[context->name_count-2].name == name)
-		idx = context->name_count - 2;
-	else {
-		/* FIXME: how much do we care about inodes that have no
-		 * associated name? */
-		if (audit_inc_name_count(context, inode))
-			return;
-		idx = context->name_count - 1;
-		context->names[idx].name = NULL;
+
+	list_for_each_entry_reverse(n, &context->names_list, list) {
+		if (n->name && (n->name == name))
+			goto out;
 	}
+
+	/* unable to find the name from a previous getname() */
+	n = audit_alloc_name(context);
+	if (!n)
+		return;
+out:
 	handle_path(dentry);
-	audit_copy_inode(&context->names[idx], dentry, inode);
+	audit_copy_inode(n, dentry, inode);
 }
 
 /**
@@ -2010,11 +2046,11 @@ void __audit_inode(const char *name, const struct dentry *dentry)
 void __audit_inode_child(const struct dentry *dentry,
 			 const struct inode *parent)
 {
-	int idx;
 	struct audit_context *context = current->audit_context;
 	const char *found_parent = NULL, *found_child = NULL;
 	const struct inode *inode = dentry->d_inode;
 	const char *dname = dentry->d_name.name;
+	struct audit_names *n;
 	int dirlen = 0;
 
 	if (!context->in_syscall)
@@ -2024,9 +2060,7 @@ void __audit_inode_child(const struct dentry *dentry,
 		handle_one(inode);
 
 	/* parent is more likely, look for it first */
-	for (idx = 0; idx < context->name_count; idx++) {
-		struct audit_names *n = &context->names[idx];
-
+	list_for_each_entry(n, &context->names_list, list) {
 		if (!n->name)
 			continue;
 
@@ -2039,9 +2073,7 @@ void __audit_inode_child(const struct dentry *dentry,
 	}
 
 	/* no matching parent, look for matching child */
-	for (idx = 0; idx < context->name_count; idx++) {
-		struct audit_names *n = &context->names[idx];
-
+	list_for_each_entry(n, &context->names_list, list) {
 		if (!n->name)
 			continue;
 
@@ -2059,34 +2091,29 @@ void __audit_inode_child(const struct dentry *dentry,
 
 add_names:
 	if (!found_parent) {
-		if (audit_inc_name_count(context, parent))
+		n = audit_alloc_name(context);
+		if (!n)
 			return;
-		idx = context->name_count - 1;
-		context->names[idx].name = NULL;
-		audit_copy_inode(&context->names[idx], NULL, parent);
+		audit_copy_inode(n, NULL, parent);
 	}
 
 	if (!found_child) {
-		if (audit_inc_name_count(context, inode))
+		n = audit_alloc_name(context);
+		if (!n)
 			return;
-		idx = context->name_count - 1;
 
 		/* Re-use the name belonging to the slot for a matching parent
 		 * directory. All names for this context are relinquished in
 		 * audit_free_names() */
 		if (found_parent) {
-			context->names[idx].name = found_parent;
-			context->names[idx].name_len = AUDIT_NAME_FULL;
+			n->name = found_parent;
+			n->name_len = AUDIT_NAME_FULL;
 			/* don't call __putname() */
-			context->names[idx].name_put = 0;
-		} else {
-			context->names[idx].name = NULL;
+			n->name_put = false;
 		}
 
 		if (inode)
-			audit_copy_inode(&context->names[idx], NULL, inode);
-		else
-			context->names[idx].ino = (unsigned long)-1;
+			audit_copy_inode(n, NULL, inode);
 	}
 }
 EXPORT_SYMBOL_GPL(__audit_inode_child);
-- 
cgit v1.2.3-71-gd317


From 3035c51e8ac0512686ceb9f2bd1d13bdc6e4fb29 Mon Sep 17 00:00:00 2001
From: Eric Paris <eparis@redhat.com>
Date: Tue, 3 Jan 2012 14:23:05 -0500
Subject: audit: drop the meaningless and format breaking word 'user'

userspace audit messages look like so:

type=USER msg=audit(1271170549.415:24710): user pid=14722 uid=0 auid=500 ses=1 subj=unconfined_u:unconfined_r:auditctl_t:s0-s0:c0.c1023 msg=''

That third field just says 'user'.  That's useless and doesn't follow the
key=value pair we are trying to enforce.  We already know it came from the
user based on the record type.  Kill that word.  Die.

Signed-off-by: Eric Paris <eparis@redhat.com>
---
 kernel/audit.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

(limited to 'kernel')

diff --git a/kernel/audit.c b/kernel/audit.c
index 2c1d6ab7106e..00efe4758c86 100644
--- a/kernel/audit.c
+++ b/kernel/audit.c
@@ -631,7 +631,7 @@ static int audit_log_common_recv_msg(struct audit_buffer **ab, u16 msg_type,
 	}
 
 	*ab = audit_log_start(NULL, GFP_KERNEL, msg_type);
-	audit_log_format(*ab, "user pid=%d uid=%u auid=%u ses=%u",
+	audit_log_format(*ab, "pid=%d uid=%u auid=%u ses=%u",
 			 pid, uid, auid, ses);
 	if (sid) {
 		rc = security_secid_to_secctx(sid, &ctx, &len);
-- 
cgit v1.2.3-71-gd317


From 16c174bd95cb07c9d0ad3fcd8c70f9cea7214c9d Mon Sep 17 00:00:00 2001
From: Eric Paris <eparis@redhat.com>
Date: Tue, 3 Jan 2012 14:23:05 -0500
Subject: audit: check current inode and containing object when filtering on
 major and minor

The audit system has the ability to filter on the major and minor number of
the device containing the inode being operated upon.  Lets say that
/dev/sda1 has major,minor 8,1 and that we mount /dev/sda1 on /boot.  Now lets
say we add a watch with a filter on 8,1.  If we proceed to open an inode
inside /boot, such as /vboot/vmlinuz, we will match the major,minor filter.

Lets instead assume that one were to use a tool like debugfs and were to
open /dev/sda1 directly and to modify it's contents.  We might hope that
this would also be logged, but it isn't.  The rules will check the
major,minor of the device containing /dev/sda1.  In other words the rule
would match on the major/minor of the tmpfs mounted at /dev.

I believe these rules should trigger on either device.  The man page is
devoid of useful information about the intended semantics.  It only seems
logical that if you want to know everything that happened on a major,minor
that would include things that happened to the device itself...

Signed-off-by: Eric Paris <eparis@redhat.com>
---
 kernel/auditsc.c | 24 ++++++++++++++----------
 1 file changed, 14 insertions(+), 10 deletions(-)

(limited to 'kernel')

diff --git a/kernel/auditsc.c b/kernel/auditsc.c
index 1a92d61ddd27..7c495147c3d9 100644
--- a/kernel/auditsc.c
+++ b/kernel/auditsc.c
@@ -540,12 +540,14 @@ static int audit_filter_rules(struct task_struct *tsk,
 			}
 			break;
 		case AUDIT_DEVMAJOR:
-			if (name)
-				result = audit_comparator(MAJOR(name->dev),
-							  f->op, f->val);
-			else if (ctx) {
+			if (name) {
+				if (audit_comparator(MAJOR(name->dev), f->op, f->val) ||
+				    audit_comparator(MAJOR(name->rdev), f->op, f->val))
+					++result;
+			} else if (ctx) {
 				list_for_each_entry(n, &ctx->names_list, list) {
-					if (audit_comparator(MAJOR(n->dev), f->op, f->val)) {
+					if (audit_comparator(MAJOR(n->dev), f->op, f->val) ||
+					    audit_comparator(MAJOR(n->rdev), f->op, f->val)) {
 						++result;
 						break;
 					}
@@ -553,12 +555,14 @@ static int audit_filter_rules(struct task_struct *tsk,
 			}
 			break;
 		case AUDIT_DEVMINOR:
-			if (name)
-				result = audit_comparator(MINOR(name->dev),
-							  f->op, f->val);
-			else if (ctx) {
+			if (name) {
+				if (audit_comparator(MINOR(name->dev), f->op, f->val) ||
+				    audit_comparator(MINOR(name->rdev), f->op, f->val))
+					++result;
+			} else if (ctx) {
 				list_for_each_entry(n, &ctx->names_list, list) {
-					if (audit_comparator(MINOR(n->dev), f->op, f->val)) {
+					if (audit_comparator(MINOR(n->dev), f->op, f->val) ||
+					    audit_comparator(MINOR(n->rdev), f->op, f->val)) {
 						++result;
 						break;
 					}
-- 
cgit v1.2.3-71-gd317


From 85e7bac33b8d5edafc4e219c7dfdb3d48e0b4e31 Mon Sep 17 00:00:00 2001
From: Eric Paris <eparis@redhat.com>
Date: Tue, 3 Jan 2012 14:23:05 -0500
Subject: seccomp: audit abnormal end to a process due to seccomp

The audit system likes to collect information about processes that end
abnormally (SIGSEGV) as this may me useful intrusion detection information.
This patch adds audit support to collect information when seccomp forces a
task to exit because of misbehavior in a similar way.

Signed-off-by: Eric Paris <eparis@redhat.com>
---
 include/linux/audit.h |  8 ++++++++
 kernel/auditsc.c      | 50 +++++++++++++++++++++++++++++---------------------
 kernel/seccomp.c      |  2 ++
 3 files changed, 39 insertions(+), 21 deletions(-)

(limited to 'kernel')

diff --git a/include/linux/audit.h b/include/linux/audit.h
index 426ab9f4dd85..6e1c533f9b46 100644
--- a/include/linux/audit.h
+++ b/include/linux/audit.h
@@ -430,6 +430,7 @@ extern void audit_putname(const char *name);
 extern void __audit_inode(const char *name, const struct dentry *dentry);
 extern void __audit_inode_child(const struct dentry *dentry,
 				const struct inode *parent);
+extern void __audit_seccomp(unsigned long syscall);
 extern void __audit_ptrace(struct task_struct *t);
 
 static inline int audit_dummy_context(void)
@@ -453,6 +454,12 @@ static inline void audit_inode_child(const struct dentry *dentry,
 }
 void audit_core_dumps(long signr);
 
+static inline void audit_seccomp(unsigned long syscall)
+{
+	if (unlikely(!audit_dummy_context()))
+		__audit_seccomp(syscall);
+}
+
 static inline void audit_ptrace(struct task_struct *t)
 {
 	if (unlikely(!audit_dummy_context()))
@@ -558,6 +565,7 @@ extern int audit_signals;
 #define audit_inode(n,d) do { (void)(d); } while (0)
 #define audit_inode_child(i,p) do { ; } while (0)
 #define audit_core_dumps(i) do { ; } while (0)
+#define audit_seccomp(i) do { ; } while (0)
 #define auditsc_get_stamp(c,t,s) (0)
 #define audit_get_loginuid(t) (-1)
 #define audit_get_sessionid(t) (-1)
diff --git a/kernel/auditsc.c b/kernel/auditsc.c
index 7c495147c3d9..e9bcb93800d8 100644
--- a/kernel/auditsc.c
+++ b/kernel/auditsc.c
@@ -2529,6 +2529,25 @@ void __audit_mmap_fd(int fd, int flags)
 	context->type = AUDIT_MMAP;
 }
 
+static void audit_log_abend(struct audit_buffer *ab, char *reason, long signr)
+{
+	uid_t auid, uid;
+	gid_t gid;
+	unsigned int sessionid;
+
+	auid = audit_get_loginuid(current);
+	sessionid = audit_get_sessionid(current);
+	current_uid_gid(&uid, &gid);
+
+	audit_log_format(ab, "auid=%u uid=%u gid=%u ses=%u",
+			 auid, uid, gid, sessionid);
+	audit_log_task_context(ab);
+	audit_log_format(ab, " pid=%d comm=", current->pid);
+	audit_log_untrustedstring(ab, current->comm);
+	audit_log_format(ab, " reason=");
+	audit_log_string(ab, reason);
+	audit_log_format(ab, " sig=%ld", signr);
+}
 /**
  * audit_core_dumps - record information about processes that end abnormally
  * @signr: signal value
@@ -2539,10 +2558,6 @@ void __audit_mmap_fd(int fd, int flags)
 void audit_core_dumps(long signr)
 {
 	struct audit_buffer *ab;
-	u32 sid;
-	uid_t auid = audit_get_loginuid(current), uid;
-	gid_t gid;
-	unsigned int sessionid = audit_get_sessionid(current);
 
 	if (!audit_enabled)
 		return;
@@ -2551,24 +2566,17 @@ void audit_core_dumps(long signr)
 		return;
 
 	ab = audit_log_start(NULL, GFP_KERNEL, AUDIT_ANOM_ABEND);
-	current_uid_gid(&uid, &gid);
-	audit_log_format(ab, "auid=%u uid=%u gid=%u ses=%u",
-			 auid, uid, gid, sessionid);
-	security_task_getsecid(current, &sid);
-	if (sid) {
-		char *ctx = NULL;
-		u32 len;
+	audit_log_abend(ab, "memory violation", signr);
+	audit_log_end(ab);
+}
 
-		if (security_secid_to_secctx(sid, &ctx, &len))
-			audit_log_format(ab, " ssid=%u", sid);
-		else {
-			audit_log_format(ab, " subj=%s", ctx);
-			security_release_secctx(ctx, len);
-		}
-	}
-	audit_log_format(ab, " pid=%d comm=", current->pid);
-	audit_log_untrustedstring(ab, current->comm);
-	audit_log_format(ab, " sig=%ld", signr);
+void __audit_seccomp(unsigned long syscall)
+{
+	struct audit_buffer *ab;
+
+	ab = audit_log_start(NULL, GFP_KERNEL, AUDIT_ANOM_ABEND);
+	audit_log_abend(ab, "seccomp", SIGKILL);
+	audit_log_format(ab, " syscall=%ld", syscall);
 	audit_log_end(ab);
 }
 
diff --git a/kernel/seccomp.c b/kernel/seccomp.c
index 57d4b13b631d..e8d76c5895ea 100644
--- a/kernel/seccomp.c
+++ b/kernel/seccomp.c
@@ -6,6 +6,7 @@
  * This defines a simple but solid secure-computing mode.
  */
 
+#include <linux/audit.h>
 #include <linux/seccomp.h>
 #include <linux/sched.h>
 #include <linux/compat.h>
@@ -54,6 +55,7 @@ void __secure_computing(int this_syscall)
 #ifdef SECCOMP_DEBUG
 	dump_stack();
 #endif
+	audit_seccomp(this_syscall);
 	do_exit(SIGKILL);
 }
 
-- 
cgit v1.2.3-71-gd317


From d7e7528bcd456f5c36ad4a202ccfb43c5aa98bc4 Mon Sep 17 00:00:00 2001
From: Eric Paris <eparis@redhat.com>
Date: Tue, 3 Jan 2012 14:23:06 -0500
Subject: Audit: push audit success and retcode into arch ptrace.h

The audit system previously expected arches calling to audit_syscall_exit to
supply as arguments if the syscall was a success and what the return code was.
Audit also provides a helper AUDITSC_RESULT which was supposed to simplify things
by converting from negative retcodes to an audit internal magic value stating
success or failure.  This helper was wrong and could indicate that a valid
pointer returned to userspace was a failed syscall.  The fix is to fix the
layering foolishness.  We now pass audit_syscall_exit a struct pt_reg and it
in turns calls back into arch code to collect the return value and to
determine if the syscall was a success or failure.  We also define a generic
is_syscall_success() macro which determines success/failure based on if the
value is < -MAX_ERRNO.  This works for arches like x86 which do not use a
separate mechanism to indicate syscall failure.

We make both the is_syscall_success() and regs_return_value() static inlines
instead of macros.  The reason is because the audit function must take a void*
for the regs.  (uml calls theirs struct uml_pt_regs instead of just struct
pt_regs so audit_syscall_exit can't take a struct pt_regs).  Since the audit
function takes a void* we need to use static inlines to cast it back to the
arch correct structure to dereference it.

The other major change is that on some arches, like ia64, MIPS and ppc, we
change regs_return_value() to give us the negative value on syscall failure.
THE only other user of this macro, kretprobe_example.c, won't notice and it
makes the value signed consistently for the audit functions across all archs.

In arch/sh/kernel/ptrace_64.c I see that we were using regs[9] in the old
audit code as the return value.  But the ptrace_64.h code defined the macro
regs_return_value() as regs[3].  I have no idea which one is correct, but this
patch now uses the regs_return_value() function, so it now uses regs[3].

For powerpc we previously used regs->result but now use the
regs_return_value() function which uses regs->gprs[3].  regs->gprs[3] is
always positive so the regs_return_value(), much like ia64 makes it negative
before calling the audit code when appropriate.

Signed-off-by: Eric Paris <eparis@redhat.com>
Acked-by: H. Peter Anvin <hpa@zytor.com> [for x86 portion]
Acked-by: Tony Luck <tony.luck@intel.com> [for ia64]
Acked-by: Richard Weinberger <richard@nod.at> [for uml]
Acked-by: David S. Miller <davem@davemloft.net> [for sparc]
Acked-by: Ralf Baechle <ralf@linux-mips.org> [for mips]
Acked-by: Benjamin Herrenschmidt <benh@kernel.crashing.org> [for ppc]
---
 arch/ia64/include/asm/ptrace.h       | 13 ++++++++++++-
 arch/ia64/kernel/ptrace.c            |  9 +--------
 arch/microblaze/include/asm/ptrace.h |  5 +++++
 arch/microblaze/kernel/ptrace.c      |  3 +--
 arch/mips/include/asm/ptrace.h       | 14 +++++++++++++-
 arch/mips/kernel/ptrace.c            |  4 +---
 arch/powerpc/include/asm/ptrace.h    | 13 ++++++++++++-
 arch/powerpc/kernel/ptrace.c         |  4 +---
 arch/s390/include/asm/ptrace.h       |  6 +++++-
 arch/s390/kernel/ptrace.c            |  4 +---
 arch/sh/include/asm/ptrace_32.h      |  5 ++++-
 arch/sh/include/asm/ptrace_64.h      |  5 ++++-
 arch/sh/kernel/ptrace_32.c           |  4 +---
 arch/sh/kernel/ptrace_64.c           |  4 +---
 arch/sparc/include/asm/ptrace.h      | 10 +++++++++-
 arch/sparc/kernel/ptrace_64.c        | 11 +----------
 arch/um/kernel/ptrace.c              |  4 ++--
 arch/x86/ia32/ia32entry.S            | 10 +++++-----
 arch/x86/kernel/entry_32.S           |  8 ++++----
 arch/x86/kernel/entry_64.S           | 10 +++++-----
 arch/x86/kernel/ptrace.c             |  3 +--
 arch/x86/kernel/vm86_32.c            |  4 ++--
 arch/x86/um/shared/sysdep/ptrace.h   |  5 +++++
 include/linux/audit.h                | 22 ++++++++++++++--------
 include/linux/ptrace.h               | 10 ++++++++++
 kernel/auditsc.c                     | 16 ++++++++++++----
 26 files changed, 132 insertions(+), 74 deletions(-)

(limited to 'kernel')

diff --git a/arch/ia64/include/asm/ptrace.h b/arch/ia64/include/asm/ptrace.h
index f5cb27614e35..68c98f5b3ca6 100644
--- a/arch/ia64/include/asm/ptrace.h
+++ b/arch/ia64/include/asm/ptrace.h
@@ -246,7 +246,18 @@ static inline unsigned long user_stack_pointer(struct pt_regs *regs)
 	return regs->ar_bspstore;
 }
 
-#define regs_return_value(regs) ((regs)->r8)
+static inline int is_syscall_success(struct pt_regs *regs)
+{
+	return regs->r10 != -1;
+}
+
+static inline long regs_return_value(struct pt_regs *regs)
+{
+	if (is_syscall_success(regs))
+		return regs->r8;
+	else
+		return -regs->r8;
+}
 
 /* Conserve space in histogram by encoding slot bits in address
  * bits 2 and 3 rather than bits 0 and 1.
diff --git a/arch/ia64/kernel/ptrace.c b/arch/ia64/kernel/ptrace.c
index 8848f43d819e..2c154088cce7 100644
--- a/arch/ia64/kernel/ptrace.c
+++ b/arch/ia64/kernel/ptrace.c
@@ -1268,14 +1268,7 @@ syscall_trace_leave (long arg0, long arg1, long arg2, long arg3,
 {
 	int step;
 
-	if (unlikely(current->audit_context)) {
-		int success = AUDITSC_RESULT(regs.r10);
-		long result = regs.r8;
-
-		if (success != AUDITSC_SUCCESS)
-			result = -result;
-		audit_syscall_exit(success, result);
-	}
+	audit_syscall_exit(&regs);
 
 	step = test_thread_flag(TIF_SINGLESTEP);
 	if (step || test_thread_flag(TIF_SYSCALL_TRACE))
diff --git a/arch/microblaze/include/asm/ptrace.h b/arch/microblaze/include/asm/ptrace.h
index 816bee64b196..94e92c805859 100644
--- a/arch/microblaze/include/asm/ptrace.h
+++ b/arch/microblaze/include/asm/ptrace.h
@@ -61,6 +61,11 @@ struct pt_regs {
 #define instruction_pointer(regs)	((regs)->pc)
 #define profile_pc(regs)		instruction_pointer(regs)
 
+static inline long regs_return_value(struct pt_regs *regs)
+{
+	return regs->r3;
+}
+
 #else /* __KERNEL__ */
 
 /* pt_regs offsets used by gdbserver etc in ptrace syscalls */
diff --git a/arch/microblaze/kernel/ptrace.c b/arch/microblaze/kernel/ptrace.c
index 043cb58f9c44..f564b1bfd386 100644
--- a/arch/microblaze/kernel/ptrace.c
+++ b/arch/microblaze/kernel/ptrace.c
@@ -159,8 +159,7 @@ asmlinkage void do_syscall_trace_leave(struct pt_regs *regs)
 {
 	int step;
 
-	if (unlikely(current->audit_context))
-		audit_syscall_exit(AUDITSC_RESULT(regs->r3), regs->r3);
+	audit_syscall_exit(regs);
 
 	step = test_thread_flag(TIF_SINGLESTEP);
 	if (step || test_thread_flag(TIF_SYSCALL_TRACE))
diff --git a/arch/mips/include/asm/ptrace.h b/arch/mips/include/asm/ptrace.h
index de39b1f343ea..7d409505df2d 100644
--- a/arch/mips/include/asm/ptrace.h
+++ b/arch/mips/include/asm/ptrace.h
@@ -137,7 +137,19 @@ extern int ptrace_set_watch_regs(struct task_struct *child,
  */
 #define user_mode(regs) (((regs)->cp0_status & KU_MASK) == KU_USER)
 
-#define regs_return_value(_regs) ((_regs)->regs[2])
+static inline int is_syscall_success(struct pt_regs *regs)
+{
+	return !regs->regs[7];
+}
+
+static inline long regs_return_value(struct pt_regs *regs)
+{
+	if (is_syscall_success(regs))
+		return regs->regs[2];
+	else
+		return -regs->regs[2];
+}
+
 #define instruction_pointer(regs) ((regs)->cp0_epc)
 #define profile_pc(regs) instruction_pointer(regs)
 
diff --git a/arch/mips/kernel/ptrace.c b/arch/mips/kernel/ptrace.c
index 4e6ea1ffad46..ab0f1963a7bd 100644
--- a/arch/mips/kernel/ptrace.c
+++ b/arch/mips/kernel/ptrace.c
@@ -572,9 +572,7 @@ out:
  */
 asmlinkage void syscall_trace_leave(struct pt_regs *regs)
 {
-	if (unlikely(current->audit_context))
-		audit_syscall_exit(AUDITSC_RESULT(regs->regs[7]),
-		                   -regs->regs[2]);
+	audit_syscall_exit(regs);
 
 	if (!(current->ptrace & PT_PTRACED))
 		return;
diff --git a/arch/powerpc/include/asm/ptrace.h b/arch/powerpc/include/asm/ptrace.h
index 48223f9b8728..78a205162fd7 100644
--- a/arch/powerpc/include/asm/ptrace.h
+++ b/arch/powerpc/include/asm/ptrace.h
@@ -86,7 +86,18 @@ struct pt_regs {
 #define instruction_pointer(regs) ((regs)->nip)
 #define user_stack_pointer(regs) ((regs)->gpr[1])
 #define kernel_stack_pointer(regs) ((regs)->gpr[1])
-#define regs_return_value(regs) ((regs)->gpr[3])
+static inline int is_syscall_success(struct pt_regs *regs)
+{
+	return !(regs->ccr & 0x10000000);
+}
+
+static inline long regs_return_value(struct pt_regs *regs)
+{
+	if (is_syscall_success(regs))
+		return regs->gpr[3];
+	else
+		return -regs->gpr[3];
+}
 
 #ifdef CONFIG_SMP
 extern unsigned long profile_pc(struct pt_regs *regs);
diff --git a/arch/powerpc/kernel/ptrace.c b/arch/powerpc/kernel/ptrace.c
index 5de73dbd15c7..09d31c12a5e3 100644
--- a/arch/powerpc/kernel/ptrace.c
+++ b/arch/powerpc/kernel/ptrace.c
@@ -1748,9 +1748,7 @@ void do_syscall_trace_leave(struct pt_regs *regs)
 {
 	int step;
 
-	if (unlikely(current->audit_context))
-		audit_syscall_exit((regs->ccr&0x10000000)?AUDITSC_FAILURE:AUDITSC_SUCCESS,
-				   regs->result);
+	audit_syscall_exit(regs);
 
 	if (unlikely(test_thread_flag(TIF_SYSCALL_TRACEPOINT)))
 		trace_sys_exit(regs, regs->result);
diff --git a/arch/s390/include/asm/ptrace.h b/arch/s390/include/asm/ptrace.h
index 56da355678f4..aeb77f017985 100644
--- a/arch/s390/include/asm/ptrace.h
+++ b/arch/s390/include/asm/ptrace.h
@@ -541,9 +541,13 @@ struct user_regs_struct
 #define user_mode(regs) (((regs)->psw.mask & PSW_MASK_PSTATE) != 0)
 #define instruction_pointer(regs) ((regs)->psw.addr & PSW_ADDR_INSN)
 #define user_stack_pointer(regs)((regs)->gprs[15])
-#define regs_return_value(regs)((regs)->gprs[2])
 #define profile_pc(regs) instruction_pointer(regs)
 
+static inline long regs_return_value(struct pt_regs *regs)
+{
+	return regs->gprs[2];
+}
+
 int regs_query_register_offset(const char *name);
 const char *regs_query_register_name(unsigned int offset);
 unsigned long regs_get_register(struct pt_regs *regs, unsigned int offset);
diff --git a/arch/s390/kernel/ptrace.c b/arch/s390/kernel/ptrace.c
index 573bc29551ef..f52758600980 100644
--- a/arch/s390/kernel/ptrace.c
+++ b/arch/s390/kernel/ptrace.c
@@ -751,9 +751,7 @@ asmlinkage long do_syscall_trace_enter(struct pt_regs *regs)
 
 asmlinkage void do_syscall_trace_exit(struct pt_regs *regs)
 {
-	if (unlikely(current->audit_context))
-		audit_syscall_exit(AUDITSC_RESULT(regs->gprs[2]),
-				   regs->gprs[2]);
+	audit_syscall_exit(regs);
 
 	if (unlikely(test_thread_flag(TIF_SYSCALL_TRACEPOINT)))
 		trace_sys_exit(regs, regs->gprs[2]);
diff --git a/arch/sh/include/asm/ptrace_32.h b/arch/sh/include/asm/ptrace_32.h
index 6c2239cca1a2..2d3e906aa722 100644
--- a/arch/sh/include/asm/ptrace_32.h
+++ b/arch/sh/include/asm/ptrace_32.h
@@ -76,7 +76,10 @@ struct pt_dspregs {
 #ifdef __KERNEL__
 
 #define MAX_REG_OFFSET		offsetof(struct pt_regs, tra)
-#define regs_return_value(_regs)	((_regs)->regs[0])
+static inline long regs_return_value(struct pt_regs *regs)
+{
+	return regs->regs[0];
+}
 
 #endif /* __KERNEL__ */
 
diff --git a/arch/sh/include/asm/ptrace_64.h b/arch/sh/include/asm/ptrace_64.h
index bf9be7764d69..eb3fcceaf64b 100644
--- a/arch/sh/include/asm/ptrace_64.h
+++ b/arch/sh/include/asm/ptrace_64.h
@@ -13,7 +13,10 @@ struct pt_regs {
 #ifdef __KERNEL__
 
 #define MAX_REG_OFFSET		offsetof(struct pt_regs, tregs[7])
-#define regs_return_value(_regs)	((_regs)->regs[3])
+static inline long regs_return_value(struct pt_regs *regs)
+{
+	return regs->regs[3];
+}
 
 #endif /* __KERNEL__ */
 
diff --git a/arch/sh/kernel/ptrace_32.c b/arch/sh/kernel/ptrace_32.c
index 92b3c276339a..c0b5c179d27b 100644
--- a/arch/sh/kernel/ptrace_32.c
+++ b/arch/sh/kernel/ptrace_32.c
@@ -530,9 +530,7 @@ asmlinkage void do_syscall_trace_leave(struct pt_regs *regs)
 {
 	int step;
 
-	if (unlikely(current->audit_context))
-		audit_syscall_exit(AUDITSC_RESULT(regs->regs[0]),
-				   regs->regs[0]);
+	audit_syscall_exit(regs);
 
 	if (unlikely(test_thread_flag(TIF_SYSCALL_TRACEPOINT)))
 		trace_sys_exit(regs, regs->regs[0]);
diff --git a/arch/sh/kernel/ptrace_64.c b/arch/sh/kernel/ptrace_64.c
index c8f97649f354..ba720d686435 100644
--- a/arch/sh/kernel/ptrace_64.c
+++ b/arch/sh/kernel/ptrace_64.c
@@ -548,9 +548,7 @@ asmlinkage void do_syscall_trace_leave(struct pt_regs *regs)
 {
 	int step;
 
-	if (unlikely(current->audit_context))
-		audit_syscall_exit(AUDITSC_RESULT(regs->regs[9]),
-				   regs->regs[9]);
+	audit_syscall_exit(regs);
 
 	if (unlikely(test_thread_flag(TIF_SYSCALL_TRACEPOINT)))
 		trace_sys_exit(regs, regs->regs[9]);
diff --git a/arch/sparc/include/asm/ptrace.h b/arch/sparc/include/asm/ptrace.h
index a0e1bcf843a1..c00c3b5c2806 100644
--- a/arch/sparc/include/asm/ptrace.h
+++ b/arch/sparc/include/asm/ptrace.h
@@ -207,7 +207,15 @@ do {	current_thread_info()->syscall_noerror = 1; \
 #define instruction_pointer(regs) ((regs)->tpc)
 #define instruction_pointer_set(regs, val) ((regs)->tpc = (val))
 #define user_stack_pointer(regs) ((regs)->u_regs[UREG_FP])
-#define regs_return_value(regs) ((regs)->u_regs[UREG_I0])
+static inline int is_syscall_success(struct pt_regs *regs)
+{
+	return !(regs->tstate & (TSTATE_XCARRY | TSTATE_ICARRY));
+}
+
+static inline long regs_return_value(struct pt_regs *regs)
+{
+	return regs->u_regs[UREG_I0];
+}
 #ifdef CONFIG_SMP
 extern unsigned long profile_pc(struct pt_regs *);
 #else
diff --git a/arch/sparc/kernel/ptrace_64.c b/arch/sparc/kernel/ptrace_64.c
index 96ee50a80661..c73c8c50f117 100644
--- a/arch/sparc/kernel/ptrace_64.c
+++ b/arch/sparc/kernel/ptrace_64.c
@@ -1086,17 +1086,8 @@ asmlinkage int syscall_trace_enter(struct pt_regs *regs)
 
 asmlinkage void syscall_trace_leave(struct pt_regs *regs)
 {
-#ifdef CONFIG_AUDITSYSCALL
-	if (unlikely(current->audit_context)) {
-		unsigned long tstate = regs->tstate;
-		int result = AUDITSC_SUCCESS;
+	audit_syscall_exit(regs);
 
-		if (unlikely(tstate & (TSTATE_XCARRY | TSTATE_ICARRY)))
-			result = AUDITSC_FAILURE;
-
-		audit_syscall_exit(result, regs->u_regs[UREG_I0]);
-	}
-#endif
 	if (unlikely(test_thread_flag(TIF_SYSCALL_TRACEPOINT)))
 		trace_sys_exit(regs, regs->u_regs[UREG_G1]);
 
diff --git a/arch/um/kernel/ptrace.c b/arch/um/kernel/ptrace.c
index c9da32b0c707..2ccf25c42feb 100644
--- a/arch/um/kernel/ptrace.c
+++ b/arch/um/kernel/ptrace.c
@@ -175,8 +175,8 @@ void syscall_trace(struct uml_pt_regs *regs, int entryexit)
 					    UPT_SYSCALL_ARG2(regs),
 					    UPT_SYSCALL_ARG3(regs),
 					    UPT_SYSCALL_ARG4(regs));
-		else audit_syscall_exit(AUDITSC_RESULT(UPT_SYSCALL_RET(regs)),
-					UPT_SYSCALL_RET(regs));
+		else
+			audit_syscall_exit(regs);
 	}
 
 	/* Fake a debug trap */
diff --git a/arch/x86/ia32/ia32entry.S b/arch/x86/ia32/ia32entry.S
index 3e274564f6bf..64ced0b8f8fd 100644
--- a/arch/x86/ia32/ia32entry.S
+++ b/arch/x86/ia32/ia32entry.S
@@ -14,6 +14,7 @@
 #include <asm/segment.h>
 #include <asm/irqflags.h>
 #include <linux/linkage.h>
+#include <linux/err.h>
 
 /* Avoid __ASSEMBLER__'ifying <linux/audit.h> just for this.  */
 #include <linux/elf-em.h>
@@ -208,12 +209,11 @@ sysexit_from_sys_call:
 	TRACE_IRQS_ON
 	sti
 	movl %eax,%esi		/* second arg, syscall return value */
-	cmpl $0,%eax		/* is it < 0? */
-	setl %al		/* 1 if so, 0 if not */
+	cmpl $-MAX_ERRNO,%eax	/* is it an error ? */
+	setbe %al		/* 1 if so, 0 if not */
 	movzbl %al,%edi		/* zero-extend that into %edi */
-	inc %edi /* first arg, 0->1(AUDITSC_SUCCESS), 1->2(AUDITSC_FAILURE) */
-	call audit_syscall_exit
-	movl RAX-ARGOFFSET(%rsp),%eax	/* reload syscall return value */
+	call __audit_syscall_exit
+	movq RAX-ARGOFFSET(%rsp),%rax	/* reload syscall return value */
 	movl $(_TIF_ALLWORK_MASK & ~_TIF_SYSCALL_AUDIT),%edi
 	cli
 	TRACE_IRQS_OFF
diff --git a/arch/x86/kernel/entry_32.S b/arch/x86/kernel/entry_32.S
index 22d0e21b4dd7..a22facf06f0e 100644
--- a/arch/x86/kernel/entry_32.S
+++ b/arch/x86/kernel/entry_32.S
@@ -42,6 +42,7 @@
  */
 
 #include <linux/linkage.h>
+#include <linux/err.h>
 #include <asm/thread_info.h>
 #include <asm/irqflags.h>
 #include <asm/errno.h>
@@ -466,11 +467,10 @@ sysexit_audit:
 	TRACE_IRQS_ON
 	ENABLE_INTERRUPTS(CLBR_ANY)
 	movl %eax,%edx		/* second arg, syscall return value */
-	cmpl $0,%eax		/* is it < 0? */
-	setl %al		/* 1 if so, 0 if not */
+	cmpl $-MAX_ERRNO,%eax	/* is it an error ? */
+	setbe %al		/* 1 if so, 0 if not */
 	movzbl %al,%eax		/* zero-extend that */
-	inc %eax /* first arg, 0->1(AUDITSC_SUCCESS), 1->2(AUDITSC_FAILURE) */
-	call audit_syscall_exit
+	call __audit_syscall_exit
 	DISABLE_INTERRUPTS(CLBR_ANY)
 	TRACE_IRQS_OFF
 	movl TI_flags(%ebp), %ecx
diff --git a/arch/x86/kernel/entry_64.S b/arch/x86/kernel/entry_64.S
index a20e1cb9dc87..e51393dd93a3 100644
--- a/arch/x86/kernel/entry_64.S
+++ b/arch/x86/kernel/entry_64.S
@@ -55,6 +55,7 @@
 #include <asm/paravirt.h>
 #include <asm/ftrace.h>
 #include <asm/percpu.h>
+#include <linux/err.h>
 
 /* Avoid __ASSEMBLER__'ifying <linux/audit.h> just for this.  */
 #include <linux/elf-em.h>
@@ -563,17 +564,16 @@ auditsys:
 	jmp system_call_fastpath
 
 	/*
-	 * Return fast path for syscall audit.  Call audit_syscall_exit()
+	 * Return fast path for syscall audit.  Call __audit_syscall_exit()
 	 * directly and then jump back to the fast path with TIF_SYSCALL_AUDIT
 	 * masked off.
 	 */
 sysret_audit:
 	movq RAX-ARGOFFSET(%rsp),%rsi	/* second arg, syscall return value */
-	cmpq $0,%rsi		/* is it < 0? */
-	setl %al		/* 1 if so, 0 if not */
+	cmpq $-MAX_ERRNO,%rsi	/* is it < -MAX_ERRNO? */
+	setbe %al		/* 1 if so, 0 if not */
 	movzbl %al,%edi		/* zero-extend that into %edi */
-	inc %edi /* first arg, 0->1(AUDITSC_SUCCESS), 1->2(AUDITSC_FAILURE) */
-	call audit_syscall_exit
+	call __audit_syscall_exit
 	movl $(_TIF_ALLWORK_MASK & ~_TIF_SYSCALL_AUDIT),%edi
 	jmp sysret_check
 #endif	/* CONFIG_AUDITSYSCALL */
diff --git a/arch/x86/kernel/ptrace.c b/arch/x86/kernel/ptrace.c
index 89a04c7b5bb6..8b0218758775 100644
--- a/arch/x86/kernel/ptrace.c
+++ b/arch/x86/kernel/ptrace.c
@@ -1414,8 +1414,7 @@ void syscall_trace_leave(struct pt_regs *regs)
 {
 	bool step;
 
-	if (unlikely(current->audit_context))
-		audit_syscall_exit(AUDITSC_RESULT(regs->ax), regs->ax);
+	audit_syscall_exit(regs);
 
 	if (unlikely(test_thread_flag(TIF_SYSCALL_TRACEPOINT)))
 		trace_sys_exit(regs, regs->ax);
diff --git a/arch/x86/kernel/vm86_32.c b/arch/x86/kernel/vm86_32.c
index 863f8753ab0a..af17e1c966dc 100644
--- a/arch/x86/kernel/vm86_32.c
+++ b/arch/x86/kernel/vm86_32.c
@@ -335,9 +335,9 @@ static void do_sys_vm86(struct kernel_vm86_struct *info, struct task_struct *tsk
 	if (info->flags & VM86_SCREEN_BITMAP)
 		mark_screen_rdonly(tsk->mm);
 
-	/*call audit_syscall_exit since we do not exit via the normal paths */
+	/*call __audit_syscall_exit since we do not exit via the normal paths */
 	if (unlikely(current->audit_context))
-		audit_syscall_exit(AUDITSC_RESULT(0), 0);
+		__audit_syscall_exit(1, 0);
 
 	__asm__ __volatile__(
 		"movl %0,%%esp\n\t"
diff --git a/arch/x86/um/shared/sysdep/ptrace.h b/arch/x86/um/shared/sysdep/ptrace.h
index 711b1621747f..5ef9344a8b24 100644
--- a/arch/x86/um/shared/sysdep/ptrace.h
+++ b/arch/x86/um/shared/sysdep/ptrace.h
@@ -3,3 +3,8 @@
 #else
 #include "ptrace_64.h"
 #endif
+
+static inline long regs_return_value(struct uml_pt_regs *regs)
+{
+	return UPT_SYSCALL_RET(regs);
+}
diff --git a/include/linux/audit.h b/include/linux/audit.h
index 6e1c533f9b46..3d65e4b3ba06 100644
--- a/include/linux/audit.h
+++ b/include/linux/audit.h
@@ -26,6 +26,7 @@
 
 #include <linux/types.h>
 #include <linux/elf-em.h>
+#include <linux/ptrace.h>
 
 /* The netlink messages for the audit system is divided into blocks:
  * 1000 - 1099 are for commanding the audit system
@@ -408,10 +409,6 @@ struct audit_field {
 	void				*lsm_rule;
 };
 
-#define AUDITSC_INVALID 0
-#define AUDITSC_SUCCESS 1
-#define AUDITSC_FAILURE 2
-#define AUDITSC_RESULT(x) ( ((long)(x))<0?AUDITSC_FAILURE:AUDITSC_SUCCESS )
 extern int __init audit_register_class(int class, unsigned *list);
 extern int audit_classify_syscall(int abi, unsigned syscall);
 extern int audit_classify_arch(int arch);
@@ -424,7 +421,7 @@ extern void audit_free(struct task_struct *task);
 extern void audit_syscall_entry(int arch,
 				int major, unsigned long a0, unsigned long a1,
 				unsigned long a2, unsigned long a3);
-extern void audit_syscall_exit(int failed, long return_code);
+extern void __audit_syscall_exit(int ret_success, long ret_value);
 extern void __audit_getname(const char *name);
 extern void audit_putname(const char *name);
 extern void __audit_inode(const char *name, const struct dentry *dentry);
@@ -438,6 +435,15 @@ static inline int audit_dummy_context(void)
 	void *p = current->audit_context;
 	return !p || *(int *)p;
 }
+static inline void audit_syscall_exit(void *pt_regs)
+{
+	if (unlikely(current->audit_context)) {
+		int success = is_syscall_success(pt_regs);
+		int return_code = regs_return_value(pt_regs);
+
+		__audit_syscall_exit(success, return_code);
+	}
+}
 static inline void audit_getname(const char *name)
 {
 	if (unlikely(!audit_dummy_context()))
@@ -551,12 +557,12 @@ static inline void audit_mmap_fd(int fd, int flags)
 
 extern int audit_n_rules;
 extern int audit_signals;
-#else
+#else /* CONFIG_AUDITSYSCALL */
 #define audit_finish_fork(t)
 #define audit_alloc(t) ({ 0; })
 #define audit_free(t) do { ; } while (0)
 #define audit_syscall_entry(ta,a,b,c,d,e) do { ; } while (0)
-#define audit_syscall_exit(f,r) do { ; } while (0)
+#define audit_syscall_exit(r) do { ; } while (0)
 #define audit_dummy_context() 1
 #define audit_getname(n) do { ; } while (0)
 #define audit_putname(n) do { ; } while (0)
@@ -587,7 +593,7 @@ extern int audit_signals;
 #define audit_ptrace(t) ((void)0)
 #define audit_n_rules 0
 #define audit_signals 0
-#endif
+#endif /* CONFIG_AUDITSYSCALL */
 
 #ifdef CONFIG_AUDIT
 /* These are defined in audit.c */
diff --git a/include/linux/ptrace.h b/include/linux/ptrace.h
index 800f113bea66..dd4cefa6519d 100644
--- a/include/linux/ptrace.h
+++ b/include/linux/ptrace.h
@@ -112,6 +112,7 @@
 
 #include <linux/compiler.h>		/* For unlikely.  */
 #include <linux/sched.h>		/* For struct task_struct.  */
+#include <linux/err.h>			/* for IS_ERR_VALUE */
 
 
 extern long arch_ptrace(struct task_struct *child, long request,
@@ -265,6 +266,15 @@ static inline void ptrace_release_task(struct task_struct *task)
 #define force_successful_syscall_return() do { } while (0)
 #endif
 
+#ifndef is_syscall_success
+/*
+ * On most systems we can tell if a syscall is a success based on if the retval
+ * is an error value.  On some systems like ia64 and powerpc they have different
+ * indicators of success/failure and must define their own.
+ */
+#define is_syscall_success(regs) (!IS_ERR_VALUE((unsigned long)(regs_return_value(regs))))
+#endif
+
 /*
  * <asm/ptrace.h> should define the following things inside #ifdef __KERNEL__.
  *
diff --git a/kernel/auditsc.c b/kernel/auditsc.c
index e9bcb93800d8..3d2853808185 100644
--- a/kernel/auditsc.c
+++ b/kernel/auditsc.c
@@ -70,6 +70,11 @@
 
 #include "audit.h"
 
+/* flags stating the success for a syscall */
+#define AUDITSC_INVALID 0
+#define AUDITSC_SUCCESS 1
+#define AUDITSC_FAILURE 2
+
 /* AUDIT_NAMES is the number of slots we reserve in the audit_context
  * for saving names from getname().  If we get more names we will allocate
  * a name dynamically and also add those to the list anchored by names_list. */
@@ -1724,8 +1729,7 @@ void audit_finish_fork(struct task_struct *child)
 
 /**
  * audit_syscall_exit - deallocate audit context after a system call
- * @valid: success/failure flag
- * @return_code: syscall return value
+ * @pt_regs: syscall registers
  *
  * Tear down after system call.  If the audit context has been marked as
  * auditable (either because of the AUDIT_RECORD_CONTEXT state from
@@ -1733,13 +1737,17 @@ void audit_finish_fork(struct task_struct *child)
  * message), then write out the syscall information.  In call cases,
  * free the names stored from getname().
  */
-void audit_syscall_exit(int valid, long return_code)
+void __audit_syscall_exit(int success, long return_code)
 {
 	struct task_struct *tsk = current;
 	struct audit_context *context;
 
-	context = audit_get_context(tsk, valid, return_code);
+	if (success)
+		success = AUDITSC_SUCCESS;
+	else
+		success = AUDITSC_FAILURE;
 
+	context = audit_get_context(tsk, success, return_code);
 	if (likely(!context))
 		return;
 
-- 
cgit v1.2.3-71-gd317


From b05d8447e7821695bc2fa3359431f7a664232743 Mon Sep 17 00:00:00 2001
From: Eric Paris <eparis@redhat.com>
Date: Tue, 3 Jan 2012 14:23:06 -0500
Subject: audit: inline audit_syscall_entry to reduce burden on archs

Every arch calls:

if (unlikely(current->audit_context))
	audit_syscall_entry()

which requires knowledge about audit (the existance of audit_context) in
the arch code.  Just do it all in static inline in audit.h so that arch's
can remain blissfully ignorant.

Signed-off-by: Eric Paris <eparis@redhat.com>
---
 arch/ia64/kernel/ptrace.c       |  9 +--------
 arch/microblaze/kernel/ptrace.c |  6 ++----
 arch/mips/kernel/ptrace.c       |  7 +++----
 arch/powerpc/kernel/ptrace.c    | 26 ++++++++++++--------------
 arch/s390/kernel/ptrace.c       | 11 +++++------
 arch/sh/kernel/ptrace_32.c      |  7 +++----
 arch/sh/kernel/ptrace_64.c      |  7 +++----
 arch/sparc/kernel/ptrace_64.c   | 17 ++++++++---------
 arch/um/kernel/ptrace.c         | 20 +++++++++-----------
 arch/x86/ia32/ia32entry.S       |  2 +-
 arch/x86/kernel/entry_32.S      |  2 +-
 arch/x86/kernel/entry_64.S      |  4 ++--
 arch/x86/kernel/ptrace.c        | 22 ++++++++++------------
 arch/xtensa/kernel/ptrace.c     |  3 +--
 include/linux/audit.h           | 13 ++++++++++---
 kernel/auditsc.c                |  2 +-
 16 files changed, 72 insertions(+), 86 deletions(-)

(limited to 'kernel')

diff --git a/arch/ia64/kernel/ptrace.c b/arch/ia64/kernel/ptrace.c
index 2c154088cce7..dad91661ddf9 100644
--- a/arch/ia64/kernel/ptrace.c
+++ b/arch/ia64/kernel/ptrace.c
@@ -1246,15 +1246,8 @@ syscall_trace_enter (long arg0, long arg1, long arg2, long arg3,
 	if (test_thread_flag(TIF_RESTORE_RSE))
 		ia64_sync_krbs();
 
-	if (unlikely(current->audit_context)) {
-		long syscall;
-		int arch;
 
-		syscall = regs.r15;
-		arch = AUDIT_ARCH_IA64;
-
-		audit_syscall_entry(arch, syscall, arg0, arg1, arg2, arg3);
-	}
+	audit_syscall_entry(AUDIT_ARCH_IA64, regs.r15, arg0, arg1, arg2, arg3);
 
 	return 0;
 }
diff --git a/arch/microblaze/kernel/ptrace.c b/arch/microblaze/kernel/ptrace.c
index f564b1bfd386..6eb2aa927d89 100644
--- a/arch/microblaze/kernel/ptrace.c
+++ b/arch/microblaze/kernel/ptrace.c
@@ -147,10 +147,8 @@ asmlinkage long do_syscall_trace_enter(struct pt_regs *regs)
 		 */
 		ret = -1L;
 
-	if (unlikely(current->audit_context))
-		audit_syscall_entry(EM_MICROBLAZE, regs->r12,
-				    regs->r5, regs->r6,
-				    regs->r7, regs->r8);
+	audit_syscall_entry(EM_MICROBLAZE, regs->r12, regs->r5, regs->r6,
+			    regs->r7, regs->r8);
 
 	return ret ?: regs->r12;
 }
diff --git a/arch/mips/kernel/ptrace.c b/arch/mips/kernel/ptrace.c
index ab0f1963a7bd..7786b608d932 100644
--- a/arch/mips/kernel/ptrace.c
+++ b/arch/mips/kernel/ptrace.c
@@ -560,10 +560,9 @@ asmlinkage void syscall_trace_enter(struct pt_regs *regs)
 	}
 
 out:
-	if (unlikely(current->audit_context))
-		audit_syscall_entry(audit_arch(), regs->regs[2],
-				    regs->regs[4], regs->regs[5],
-				    regs->regs[6], regs->regs[7]);
+	audit_syscall_entry(audit_arch(), regs->regs[2],
+			    regs->regs[4], regs->regs[5],
+			    regs->regs[6], regs->regs[7]);
 }
 
 /*
diff --git a/arch/powerpc/kernel/ptrace.c b/arch/powerpc/kernel/ptrace.c
index 09d31c12a5e3..5b43325402bc 100644
--- a/arch/powerpc/kernel/ptrace.c
+++ b/arch/powerpc/kernel/ptrace.c
@@ -1724,22 +1724,20 @@ long do_syscall_trace_enter(struct pt_regs *regs)
 	if (unlikely(test_thread_flag(TIF_SYSCALL_TRACEPOINT)))
 		trace_sys_enter(regs, regs->gpr[0]);
 
-	if (unlikely(current->audit_context)) {
 #ifdef CONFIG_PPC64
-		if (!is_32bit_task())
-			audit_syscall_entry(AUDIT_ARCH_PPC64,
-					    regs->gpr[0],
-					    regs->gpr[3], regs->gpr[4],
-					    regs->gpr[5], regs->gpr[6]);
-		else
+	if (!is_32bit_task())
+		audit_syscall_entry(AUDIT_ARCH_PPC64,
+				    regs->gpr[0],
+				    regs->gpr[3], regs->gpr[4],
+				    regs->gpr[5], regs->gpr[6]);
+	else
 #endif
-			audit_syscall_entry(AUDIT_ARCH_PPC,
-					    regs->gpr[0],
-					    regs->gpr[3] & 0xffffffff,
-					    regs->gpr[4] & 0xffffffff,
-					    regs->gpr[5] & 0xffffffff,
-					    regs->gpr[6] & 0xffffffff);
-	}
+		audit_syscall_entry(AUDIT_ARCH_PPC,
+				    regs->gpr[0],
+				    regs->gpr[3] & 0xffffffff,
+				    regs->gpr[4] & 0xffffffff,
+				    regs->gpr[5] & 0xffffffff,
+				    regs->gpr[6] & 0xffffffff);
 
 	return ret ?: regs->gpr[0];
 }
diff --git a/arch/s390/kernel/ptrace.c b/arch/s390/kernel/ptrace.c
index f52758600980..9d82ed4bcb27 100644
--- a/arch/s390/kernel/ptrace.c
+++ b/arch/s390/kernel/ptrace.c
@@ -740,12 +740,11 @@ asmlinkage long do_syscall_trace_enter(struct pt_regs *regs)
 	if (unlikely(test_thread_flag(TIF_SYSCALL_TRACEPOINT)))
 		trace_sys_enter(regs, regs->gprs[2]);
 
-	if (unlikely(current->audit_context))
-		audit_syscall_entry(is_compat_task() ?
-					AUDIT_ARCH_S390 : AUDIT_ARCH_S390X,
-				    regs->gprs[2], regs->orig_gpr2,
-				    regs->gprs[3], regs->gprs[4],
-				    regs->gprs[5]);
+	audit_syscall_entry(is_compat_task() ?
+				AUDIT_ARCH_S390 : AUDIT_ARCH_S390X,
+			    regs->gprs[2], regs->orig_gpr2,
+			    regs->gprs[3], regs->gprs[4],
+			    regs->gprs[5]);
 	return ret ?: regs->gprs[2];
 }
 
diff --git a/arch/sh/kernel/ptrace_32.c b/arch/sh/kernel/ptrace_32.c
index c0b5c179d27b..a3e651563763 100644
--- a/arch/sh/kernel/ptrace_32.c
+++ b/arch/sh/kernel/ptrace_32.c
@@ -518,10 +518,9 @@ asmlinkage long do_syscall_trace_enter(struct pt_regs *regs)
 	if (unlikely(test_thread_flag(TIF_SYSCALL_TRACEPOINT)))
 		trace_sys_enter(regs, regs->regs[0]);
 
-	if (unlikely(current->audit_context))
-		audit_syscall_entry(audit_arch(), regs->regs[3],
-				    regs->regs[4], regs->regs[5],
-				    regs->regs[6], regs->regs[7]);
+	audit_syscall_entry(audit_arch(), regs->regs[3],
+			    regs->regs[4], regs->regs[5],
+			    regs->regs[6], regs->regs[7]);
 
 	return ret ?: regs->regs[0];
 }
diff --git a/arch/sh/kernel/ptrace_64.c b/arch/sh/kernel/ptrace_64.c
index ba720d686435..3d0080b5c976 100644
--- a/arch/sh/kernel/ptrace_64.c
+++ b/arch/sh/kernel/ptrace_64.c
@@ -536,10 +536,9 @@ asmlinkage long long do_syscall_trace_enter(struct pt_regs *regs)
 	if (unlikely(test_thread_flag(TIF_SYSCALL_TRACEPOINT)))
 		trace_sys_enter(regs, regs->regs[9]);
 
-	if (unlikely(current->audit_context))
-		audit_syscall_entry(audit_arch(), regs->regs[1],
-				    regs->regs[2], regs->regs[3],
-				    regs->regs[4], regs->regs[5]);
+	audit_syscall_entry(audit_arch(), regs->regs[1],
+			    regs->regs[2], regs->regs[3],
+			    regs->regs[4], regs->regs[5]);
 
 	return ret ?: regs->regs[9];
 }
diff --git a/arch/sparc/kernel/ptrace_64.c b/arch/sparc/kernel/ptrace_64.c
index c73c8c50f117..9388844cd88c 100644
--- a/arch/sparc/kernel/ptrace_64.c
+++ b/arch/sparc/kernel/ptrace_64.c
@@ -1071,15 +1071,14 @@ asmlinkage int syscall_trace_enter(struct pt_regs *regs)
 	if (unlikely(test_thread_flag(TIF_SYSCALL_TRACEPOINT)))
 		trace_sys_enter(regs, regs->u_regs[UREG_G1]);
 
-	if (unlikely(current->audit_context) && !ret)
-		audit_syscall_entry((test_thread_flag(TIF_32BIT) ?
-				     AUDIT_ARCH_SPARC :
-				     AUDIT_ARCH_SPARC64),
-				    regs->u_regs[UREG_G1],
-				    regs->u_regs[UREG_I0],
-				    regs->u_regs[UREG_I1],
-				    regs->u_regs[UREG_I2],
-				    regs->u_regs[UREG_I3]);
+	audit_syscall_entry((test_thread_flag(TIF_32BIT) ?
+			     AUDIT_ARCH_SPARC :
+			     AUDIT_ARCH_SPARC64),
+			    regs->u_regs[UREG_G1],
+			    regs->u_regs[UREG_I0],
+			    regs->u_regs[UREG_I1],
+			    regs->u_regs[UREG_I2],
+			    regs->u_regs[UREG_I3]);
 
 	return ret;
 }
diff --git a/arch/um/kernel/ptrace.c b/arch/um/kernel/ptrace.c
index 2ccf25c42feb..06b190390505 100644
--- a/arch/um/kernel/ptrace.c
+++ b/arch/um/kernel/ptrace.c
@@ -167,17 +167,15 @@ void syscall_trace(struct uml_pt_regs *regs, int entryexit)
 	int is_singlestep = (current->ptrace & PT_DTRACE) && entryexit;
 	int tracesysgood;
 
-	if (unlikely(current->audit_context)) {
-		if (!entryexit)
-			audit_syscall_entry(HOST_AUDIT_ARCH,
-					    UPT_SYSCALL_NR(regs),
-					    UPT_SYSCALL_ARG1(regs),
-					    UPT_SYSCALL_ARG2(regs),
-					    UPT_SYSCALL_ARG3(regs),
-					    UPT_SYSCALL_ARG4(regs));
-		else
-			audit_syscall_exit(regs);
-	}
+	if (!entryexit)
+		audit_syscall_entry(HOST_AUDIT_ARCH,
+				    UPT_SYSCALL_NR(regs),
+				    UPT_SYSCALL_ARG1(regs),
+				    UPT_SYSCALL_ARG2(regs),
+				    UPT_SYSCALL_ARG3(regs),
+				    UPT_SYSCALL_ARG4(regs));
+	else
+		audit_syscall_exit(regs);
 
 	/* Fake a debug trap */
 	if (is_singlestep)
diff --git a/arch/x86/ia32/ia32entry.S b/arch/x86/ia32/ia32entry.S
index 025f0f01d254..cecfd9a8f734 100644
--- a/arch/x86/ia32/ia32entry.S
+++ b/arch/x86/ia32/ia32entry.S
@@ -192,7 +192,7 @@ sysexit_from_sys_call:
 	movl %ebx,%edx			/* 3rd arg: 1st syscall arg */
 	movl %eax,%esi			/* 2nd arg: syscall number */
 	movl $AUDIT_ARCH_I386,%edi	/* 1st arg: audit arch */
-	call audit_syscall_entry
+	call __audit_syscall_entry
 	movl RAX-ARGOFFSET(%rsp),%eax	/* reload syscall number */
 	cmpq $(IA32_NR_syscalls-1),%rax
 	ja ia32_badsys
diff --git a/arch/x86/kernel/entry_32.S b/arch/x86/kernel/entry_32.S
index a22facf06f0e..1ccd742eba1b 100644
--- a/arch/x86/kernel/entry_32.S
+++ b/arch/x86/kernel/entry_32.S
@@ -456,7 +456,7 @@ sysenter_audit:
 	movl %ebx,%ecx			/* 3rd arg: 1st syscall arg */
 	movl %eax,%edx			/* 2nd arg: syscall number */
 	movl $AUDIT_ARCH_I386,%eax	/* 1st arg: audit arch */
-	call audit_syscall_entry
+	call __audit_syscall_entry
 	pushl_cfi %ebx
 	movl PT_EAX(%esp),%eax		/* reload syscall number */
 	jmp sysenter_do_call
diff --git a/arch/x86/kernel/entry_64.S b/arch/x86/kernel/entry_64.S
index e51393dd93a3..1ca66b650123 100644
--- a/arch/x86/kernel/entry_64.S
+++ b/arch/x86/kernel/entry_64.S
@@ -549,7 +549,7 @@ badsys:
 #ifdef CONFIG_AUDITSYSCALL
 	/*
 	 * Fast path for syscall audit without full syscall trace.
-	 * We just call audit_syscall_entry() directly, and then
+	 * We just call __audit_syscall_entry() directly, and then
 	 * jump back to the normal fast path.
 	 */
 auditsys:
@@ -559,7 +559,7 @@ auditsys:
 	movq %rdi,%rdx			/* 3rd arg: 1st syscall arg */
 	movq %rax,%rsi			/* 2nd arg: syscall number */
 	movl $AUDIT_ARCH_X86_64,%edi	/* 1st arg: audit arch */
-	call audit_syscall_entry
+	call __audit_syscall_entry
 	LOAD_ARGS 0		/* reload call-clobbered registers */
 	jmp system_call_fastpath
 
diff --git a/arch/x86/kernel/ptrace.c b/arch/x86/kernel/ptrace.c
index 8b0218758775..50267386b766 100644
--- a/arch/x86/kernel/ptrace.c
+++ b/arch/x86/kernel/ptrace.c
@@ -1392,20 +1392,18 @@ long syscall_trace_enter(struct pt_regs *regs)
 	if (unlikely(test_thread_flag(TIF_SYSCALL_TRACEPOINT)))
 		trace_sys_enter(regs, regs->orig_ax);
 
-	if (unlikely(current->audit_context)) {
-		if (IS_IA32)
-			audit_syscall_entry(AUDIT_ARCH_I386,
-					    regs->orig_ax,
-					    regs->bx, regs->cx,
-					    regs->dx, regs->si);
+	if (IS_IA32)
+		audit_syscall_entry(AUDIT_ARCH_I386,
+				    regs->orig_ax,
+				    regs->bx, regs->cx,
+				    regs->dx, regs->si);
 #ifdef CONFIG_X86_64
-		else
-			audit_syscall_entry(AUDIT_ARCH_X86_64,
-					    regs->orig_ax,
-					    regs->di, regs->si,
-					    regs->dx, regs->r10);
+	else
+		audit_syscall_entry(AUDIT_ARCH_X86_64,
+				    regs->orig_ax,
+				    regs->di, regs->si,
+				    regs->dx, regs->r10);
 #endif
-	}
 
 	return ret ?: regs->orig_ax;
 }
diff --git a/arch/xtensa/kernel/ptrace.c b/arch/xtensa/kernel/ptrace.c
index a0d042aa2967..2dff698ab02e 100644
--- a/arch/xtensa/kernel/ptrace.c
+++ b/arch/xtensa/kernel/ptrace.c
@@ -334,8 +334,7 @@ void do_syscall_trace_enter(struct pt_regs *regs)
 		do_syscall_trace();
 
 #if 0
-	if (unlikely(current->audit_context))
-		audit_syscall_entry(current, AUDIT_ARCH_XTENSA..);
+	audit_syscall_entry(current, AUDIT_ARCH_XTENSA..);
 #endif
 }
 
diff --git a/include/linux/audit.h b/include/linux/audit.h
index 3d65e4b3ba06..f56ce2669b83 100644
--- a/include/linux/audit.h
+++ b/include/linux/audit.h
@@ -418,9 +418,9 @@ extern int audit_classify_arch(int arch);
 extern void audit_finish_fork(struct task_struct *child);
 extern int  audit_alloc(struct task_struct *task);
 extern void audit_free(struct task_struct *task);
-extern void audit_syscall_entry(int arch,
-				int major, unsigned long a0, unsigned long a1,
-				unsigned long a2, unsigned long a3);
+extern void __audit_syscall_entry(int arch,
+				  int major, unsigned long a0, unsigned long a1,
+				  unsigned long a2, unsigned long a3);
 extern void __audit_syscall_exit(int ret_success, long ret_value);
 extern void __audit_getname(const char *name);
 extern void audit_putname(const char *name);
@@ -435,6 +435,13 @@ static inline int audit_dummy_context(void)
 	void *p = current->audit_context;
 	return !p || *(int *)p;
 }
+static inline void audit_syscall_entry(int arch, int major, unsigned long a0,
+				       unsigned long a1, unsigned long a2,
+				       unsigned long a3)
+{
+	if (unlikely(!audit_dummy_context()))
+		__audit_syscall_entry(arch, major, a0, a1, a2, a3);
+}
 static inline void audit_syscall_exit(void *pt_regs)
 {
 	if (unlikely(current->audit_context)) {
diff --git a/kernel/auditsc.c b/kernel/auditsc.c
index 3d2853808185..b408100dd6ef 100644
--- a/kernel/auditsc.c
+++ b/kernel/auditsc.c
@@ -1632,7 +1632,7 @@ void audit_free(struct task_struct *tsk)
  * will only be written if another part of the kernel requests that it
  * be written).
  */
-void audit_syscall_entry(int arch, int major,
+void __audit_syscall_entry(int arch, int major,
 			 unsigned long a1, unsigned long a2,
 			 unsigned long a3, unsigned long a4)
 {
-- 
cgit v1.2.3-71-gd317


From 997f5b6444f4608692ec807fb802fd9767c80e76 Mon Sep 17 00:00:00 2001
From: Eric Paris <eparis@redhat.com>
Date: Tue, 3 Jan 2012 14:23:06 -0500
Subject: audit: remove AUDIT_SETUP_CONTEXT as it isn't used

Audit contexts have 3 states.  Disabled, which doesn't collect anything,
build, which collects info but might not emit it, and record, which
collects and emits.  There is a 4th state, setup, which isn't used.  Get
rid of it.

Signed-off-by: Eric Paris <eparis@redhat.com>
---
 kernel/audit.h | 6 +-----
 1 file changed, 1 insertion(+), 5 deletions(-)

(limited to 'kernel')

diff --git a/kernel/audit.h b/kernel/audit.h
index 91e7071c4d2c..816766803371 100644
--- a/kernel/audit.h
+++ b/kernel/audit.h
@@ -36,12 +36,8 @@ enum audit_state {
 	AUDIT_DISABLED,		/* Do not create per-task audit_context.
 				 * No syscall-specific audit records can
 				 * be generated. */
-	AUDIT_SETUP_CONTEXT,	/* Create the per-task audit_context,
-				 * but don't necessarily fill it in at
-				 * syscall entry time (i.e., filter
-				 * instead). */
 	AUDIT_BUILD_CONTEXT,	/* Create the per-task audit_context,
-				 * and always fill it in at syscall
+				 * and fill it in at syscall
 				 * entry time.  This makes a full
 				 * syscall record available if some
 				 * other part of the kernel decides it
-- 
cgit v1.2.3-71-gd317


From 56179a6ec65a56e0279a58e35cb450d38f061b94 Mon Sep 17 00:00:00 2001
From: Eric Paris <eparis@redhat.com>
Date: Tue, 3 Jan 2012 14:23:06 -0500
Subject: audit: drop some potentially inadvisable likely notations

The audit code makes heavy use of likely() and unlikely() macros, but they
don't always make sense.  Drop any that seem questionable and let the
computer do it's thing.

Signed-off-by: Eric Paris <eparis@redhat.com>
---
 kernel/auditsc.c | 12 ++++++------
 1 file changed, 6 insertions(+), 6 deletions(-)

(limited to 'kernel')

diff --git a/kernel/auditsc.c b/kernel/auditsc.c
index b408100dd6ef..d7382c2aaa9e 100644
--- a/kernel/auditsc.c
+++ b/kernel/auditsc.c
@@ -805,7 +805,7 @@ static inline struct audit_context *audit_get_context(struct task_struct *tsk,
 {
 	struct audit_context *context = tsk->audit_context;
 
-	if (likely(!context))
+	if (!context)
 		return NULL;
 	context->return_valid = return_valid;
 
@@ -928,7 +928,7 @@ int audit_alloc(struct task_struct *tsk)
 		return 0; /* Return if not auditing. */
 
 	state = audit_filter_task(tsk, &key);
-	if (likely(state == AUDIT_DISABLED))
+	if (state == AUDIT_DISABLED)
 		return 0;
 
 	if (!(context = audit_alloc_context(state))) {
@@ -1599,7 +1599,7 @@ void audit_free(struct task_struct *tsk)
 	struct audit_context *context;
 
 	context = audit_get_context(tsk, 0, 0);
-	if (likely(!context))
+	if (!context)
 		return;
 
 	/* Check for system calls that do not go through the exit
@@ -1640,7 +1640,7 @@ void __audit_syscall_entry(int arch, int major,
 	struct audit_context *context = tsk->audit_context;
 	enum audit_state     state;
 
-	if (unlikely(!context))
+	if (!context)
 		return;
 
 	/*
@@ -1697,7 +1697,7 @@ void __audit_syscall_entry(int arch, int major,
 		context->prio = 0;
 		state = audit_filter_syscall(tsk, context, &audit_filter_list[AUDIT_FILTER_ENTRY]);
 	}
-	if (likely(state == AUDIT_DISABLED))
+	if (state == AUDIT_DISABLED)
 		return;
 
 	context->serial     = 0;
@@ -1748,7 +1748,7 @@ void __audit_syscall_exit(int success, long return_code)
 		success = AUDITSC_FAILURE;
 
 	context = audit_get_context(tsk, success, return_code);
-	if (likely(!context))
+	if (!context)
 		return;
 
 	if (context->in_syscall && context->current_state == AUDIT_RECORD_CONTEXT)
-- 
cgit v1.2.3-71-gd317


From 07c49417877f8658a6aa0ad9b4e21e4fd4df11b6 Mon Sep 17 00:00:00 2001
From: Eric Paris <eparis@redhat.com>
Date: Tue, 3 Jan 2012 14:23:07 -0500
Subject: audit: inline checks for not needing to collect aux records

A number of audit hooks make function calls before they determine that
auxilary records do not need to be collected.  Do those checks as static
inlines since the most common case is going to be that records are not
needed and we can skip the function call overhead.

Signed-off-by: Eric Paris <eparis@redhat.com>
---
 include/linux/audit.h | 23 ++++++++++++++++++++---
 kernel/auditsc.c      | 15 +++------------
 2 files changed, 23 insertions(+), 15 deletions(-)

(limited to 'kernel')

diff --git a/include/linux/audit.h b/include/linux/audit.h
index f56ce2669b83..cf16faff6b8a 100644
--- a/include/linux/audit.h
+++ b/include/linux/audit.h
@@ -489,9 +489,9 @@ extern int  audit_set_loginuid(struct task_struct *task, uid_t loginuid);
 extern void audit_log_task_context(struct audit_buffer *ab);
 extern void __audit_ipc_obj(struct kern_ipc_perm *ipcp);
 extern void __audit_ipc_set_perm(unsigned long qbytes, uid_t uid, gid_t gid, umode_t mode);
-extern int audit_bprm(struct linux_binprm *bprm);
-extern void audit_socketcall(int nargs, unsigned long *args);
-extern int audit_sockaddr(int len, void *addr);
+extern int __audit_bprm(struct linux_binprm *bprm);
+extern void __audit_socketcall(int nargs, unsigned long *args);
+extern int __audit_sockaddr(int len, void *addr);
 extern void __audit_fd_pair(int fd1, int fd2);
 extern int audit_set_macxattr(const char *name);
 extern void __audit_mq_open(int oflag, umode_t mode, struct mq_attr *attr);
@@ -519,6 +519,23 @@ static inline void audit_ipc_set_perm(unsigned long qbytes, uid_t uid, gid_t gid
 	if (unlikely(!audit_dummy_context()))
 		__audit_ipc_set_perm(qbytes, uid, gid, mode);
 }
+static inline int audit_bprm(struct linux_binprm *bprm)
+{
+	if (unlikely(!audit_dummy_context()))
+		return __audit_bprm(bprm);
+	return 0;
+}
+static inline void audit_socketcall(int nargs, unsigned long *args)
+{
+	if (unlikely(!audit_dummy_context()))
+		__audit_socketcall(nargs, args);
+}
+static inline int audit_sockaddr(int len, void *addr)
+{
+	if (unlikely(!audit_dummy_context()))
+		return __audit_sockaddr(len, addr);
+	return 0;
+}
 static inline void audit_mq_open(int oflag, umode_t mode, struct mq_attr *attr)
 {
 	if (unlikely(!audit_dummy_context()))
diff --git a/kernel/auditsc.c b/kernel/auditsc.c
index d7382c2aaa9e..e1062f66b01b 100644
--- a/kernel/auditsc.c
+++ b/kernel/auditsc.c
@@ -2309,14 +2309,11 @@ void __audit_ipc_set_perm(unsigned long qbytes, uid_t uid, gid_t gid, umode_t mo
 	context->ipc.has_perm = 1;
 }
 
-int audit_bprm(struct linux_binprm *bprm)
+int __audit_bprm(struct linux_binprm *bprm)
 {
 	struct audit_aux_data_execve *ax;
 	struct audit_context *context = current->audit_context;
 
-	if (likely(!audit_enabled || !context || context->dummy))
-		return 0;
-
 	ax = kmalloc(sizeof(*ax), GFP_KERNEL);
 	if (!ax)
 		return -ENOMEM;
@@ -2337,13 +2334,10 @@ int audit_bprm(struct linux_binprm *bprm)
  * @args: args array
  *
  */
-void audit_socketcall(int nargs, unsigned long *args)
+void __audit_socketcall(int nargs, unsigned long *args)
 {
 	struct audit_context *context = current->audit_context;
 
-	if (likely(!context || context->dummy))
-		return;
-
 	context->type = AUDIT_SOCKETCALL;
 	context->socketcall.nargs = nargs;
 	memcpy(context->socketcall.args, args, nargs * sizeof(unsigned long));
@@ -2369,13 +2363,10 @@ void __audit_fd_pair(int fd1, int fd2)
  *
  * Returns 0 for success or NULL context or < 0 on error.
  */
-int audit_sockaddr(int len, void *a)
+int __audit_sockaddr(int len, void *a)
 {
 	struct audit_context *context = current->audit_context;
 
-	if (likely(!context || context->dummy))
-		return 0;
-
 	if (!context->sockaddr) {
 		void *p = kmalloc(sizeof(struct sockaddr_storage), GFP_KERNEL);
 		if (!p)
-- 
cgit v1.2.3-71-gd317


From a4ff8dba7d8ce5ceb43fb27df66292251cc73bdc Mon Sep 17 00:00:00 2001
From: Eric Paris <eparis@redhat.com>
Date: Tue, 3 Jan 2012 14:23:07 -0500
Subject: audit: inline audit_free to simplify the look of generic code

make the conditional a static inline instead of doing it in generic code.

Signed-off-by: Eric Paris <eparis@redhat.com>
---
 include/linux/audit.h | 7 ++++++-
 kernel/auditsc.c      | 2 +-
 kernel/exit.c         | 3 +--
 3 files changed, 8 insertions(+), 4 deletions(-)

(limited to 'kernel')

diff --git a/include/linux/audit.h b/include/linux/audit.h
index 4f1efe3e8616..8eb8bda749b3 100644
--- a/include/linux/audit.h
+++ b/include/linux/audit.h
@@ -417,7 +417,7 @@ extern int audit_classify_arch(int arch);
 				/* Public API */
 extern void audit_finish_fork(struct task_struct *child);
 extern int  audit_alloc(struct task_struct *task);
-extern void audit_free(struct task_struct *task);
+extern void __audit_free(struct task_struct *task);
 extern void __audit_syscall_entry(int arch,
 				  int major, unsigned long a0, unsigned long a1,
 				  unsigned long a2, unsigned long a3);
@@ -435,6 +435,11 @@ static inline int audit_dummy_context(void)
 	void *p = current->audit_context;
 	return !p || *(int *)p;
 }
+static inline void audit_free(struct task_struct *task)
+{
+	if (unlikely(task->audit_context))
+		__audit_free(task);
+}
 static inline void audit_syscall_entry(int arch, int major, unsigned long a0,
 				       unsigned long a1, unsigned long a2,
 				       unsigned long a3)
diff --git a/kernel/auditsc.c b/kernel/auditsc.c
index e1062f66b01b..7aaeb38b262a 100644
--- a/kernel/auditsc.c
+++ b/kernel/auditsc.c
@@ -1594,7 +1594,7 @@ static void audit_log_exit(struct audit_context *context, struct task_struct *ts
  *
  * Called from copy_process and do_exit
  */
-void audit_free(struct task_struct *tsk)
+void __audit_free(struct task_struct *tsk)
 {
 	struct audit_context *context;
 
diff --git a/kernel/exit.c b/kernel/exit.c
index 94ed6e20bb53..88dcbbc446f7 100644
--- a/kernel/exit.c
+++ b/kernel/exit.c
@@ -964,8 +964,7 @@ NORET_TYPE void do_exit(long code)
 	acct_collect(code, group_dead);
 	if (group_dead)
 		tty_audit_exit();
-	if (unlikely(tsk->audit_context))
-		audit_free(tsk);
+	audit_free(tsk);
 
 	tsk->exit_code = code;
 	taskstats_exit(tsk, group_dead);
-- 
cgit v1.2.3-71-gd317


From 7ff68e53ece8c175d2951bb8a30b3cce8f9c5579 Mon Sep 17 00:00:00 2001
From: Eric Paris <eparis@redhat.com>
Date: Tue, 3 Jan 2012 14:23:07 -0500
Subject: audit: reject entry,always rules

We deprecated entry,always rules a long time ago.  Reject those rules as
invalid.

Signed-off-by: Eric Paris <eparis@redhat.com>
---
 kernel/auditfilter.c | 6 ++++--
 1 file changed, 4 insertions(+), 2 deletions(-)

(limited to 'kernel')

diff --git a/kernel/auditfilter.c b/kernel/auditfilter.c
index d94dde82c3c8..903caa269b5c 100644
--- a/kernel/auditfilter.c
+++ b/kernel/auditfilter.c
@@ -235,13 +235,15 @@ static inline struct audit_entry *audit_to_entry_common(struct audit_rule *rule)
 	switch(listnr) {
 	default:
 		goto exit_err;
-	case AUDIT_FILTER_USER:
-	case AUDIT_FILTER_TYPE:
 #ifdef CONFIG_AUDITSYSCALL
 	case AUDIT_FILTER_ENTRY:
+		if (rule->action == AUDIT_ALWAYS)
+			goto exit_err;
 	case AUDIT_FILTER_EXIT:
 	case AUDIT_FILTER_TASK:
 #endif
+	case AUDIT_FILTER_USER:
+	case AUDIT_FILTER_TYPE:
 		;
 	}
 	if (unlikely(rule->action == AUDIT_POSSIBLE)) {
-- 
cgit v1.2.3-71-gd317


From 6422e78de6880c66a82af512d9bd0c85eb62e661 Mon Sep 17 00:00:00 2001
From: Eric Paris <eparis@redhat.com>
Date: Tue, 3 Jan 2012 14:23:07 -0500
Subject: audit: remove audit_finish_fork as it can't be called

Audit entry,always rules are not allowed and are automatically changed in
exit,always rules in userspace.  The kernel refuses to load such rules.

Thus a task in the middle of a syscall (and thus in audit_finish_fork())
can only be in one of two states: AUDIT_BUILD_CONTEXT or AUDIT_DISABLED.
Since the current task cannot be in AUDIT_RECORD_CONTEXT we aren't every
going to actually use the code in audit_finish_fork() since it will
return without doing anything.  Thus drop the code.

Signed-off-by: Eric Paris <eparis@redhat.com>
---
 include/linux/audit.h |  2 --
 kernel/auditsc.c      | 20 --------------------
 kernel/fork.c         |  2 --
 3 files changed, 24 deletions(-)

(limited to 'kernel')

diff --git a/include/linux/audit.h b/include/linux/audit.h
index 8eb8bda749b3..67b66c37a254 100644
--- a/include/linux/audit.h
+++ b/include/linux/audit.h
@@ -415,7 +415,6 @@ extern int audit_classify_arch(int arch);
 #ifdef CONFIG_AUDITSYSCALL
 /* These are defined in auditsc.c */
 				/* Public API */
-extern void audit_finish_fork(struct task_struct *child);
 extern int  audit_alloc(struct task_struct *task);
 extern void __audit_free(struct task_struct *task);
 extern void __audit_syscall_entry(int arch,
@@ -586,7 +585,6 @@ static inline void audit_mmap_fd(int fd, int flags)
 extern int audit_n_rules;
 extern int audit_signals;
 #else /* CONFIG_AUDITSYSCALL */
-#define audit_finish_fork(t)
 #define audit_alloc(t) ({ 0; })
 #define audit_free(t) do { ; } while (0)
 #define audit_syscall_entry(ta,a,b,c,d,e) do { ; } while (0)
diff --git a/kernel/auditsc.c b/kernel/auditsc.c
index 7aaeb38b262a..4d8920f5ab88 100644
--- a/kernel/auditsc.c
+++ b/kernel/auditsc.c
@@ -1707,26 +1707,6 @@ void __audit_syscall_entry(int arch, int major,
 	context->ppid       = 0;
 }
 
-void audit_finish_fork(struct task_struct *child)
-{
-	struct audit_context *ctx = current->audit_context;
-	struct audit_context *p = child->audit_context;
-	if (!p || !ctx)
-		return;
-	if (!ctx->in_syscall || ctx->current_state != AUDIT_RECORD_CONTEXT)
-		return;
-	p->arch = ctx->arch;
-	p->major = ctx->major;
-	memcpy(p->argv, ctx->argv, sizeof(ctx->argv));
-	p->ctime = ctx->ctime;
-	p->dummy = ctx->dummy;
-	p->in_syscall = ctx->in_syscall;
-	p->filterkey = kstrdup(ctx->filterkey, GFP_KERNEL);
-	p->ppid = current->pid;
-	p->prio = ctx->prio;
-	p->current_state = ctx->current_state;
-}
-
 /**
  * audit_syscall_exit - deallocate audit context after a system call
  * @pt_regs: syscall registers
diff --git a/kernel/fork.c b/kernel/fork.c
index 443f5125f11e..c1e5c21f48c1 100644
--- a/kernel/fork.c
+++ b/kernel/fork.c
@@ -1525,8 +1525,6 @@ long do_fork(unsigned long clone_flags,
 			init_completion(&vfork);
 		}
 
-		audit_finish_fork(p);
-
 		/*
 		 * We set PF_STARTING at creation in case tracing wants to
 		 * use this to distinguish a fully live task from one that
-- 
cgit v1.2.3-71-gd317


From efaffd6e4417860c67576ac760dd6e8bbd15f006 Mon Sep 17 00:00:00 2001
From: Eric Paris <eparis@redhat.com>
Date: Tue, 3 Jan 2012 14:23:07 -0500
Subject: audit: allow matching on obj_uid

Allow syscall exit filter matching based on the uid of the owner of an
inode used in a syscall.  aka:

auditctl -a always,exit -S open -F obj_uid=0 -F perm=wa

Signed-off-by: Eric Paris <eparis@redhat.com>
---
 include/linux/audit.h |  1 +
 kernel/auditfilter.c  |  1 +
 kernel/auditsc.c      | 12 ++++++++++++
 3 files changed, 14 insertions(+)

(limited to 'kernel')

diff --git a/include/linux/audit.h b/include/linux/audit.h
index 67b66c37a254..55cb3daaf474 100644
--- a/include/linux/audit.h
+++ b/include/linux/audit.h
@@ -223,6 +223,7 @@
 #define AUDIT_PERM	106
 #define AUDIT_DIR	107
 #define AUDIT_FILETYPE	108
+#define AUDIT_OBJ_UID	109
 
 #define AUDIT_ARG0      200
 #define AUDIT_ARG1      (AUDIT_ARG0+1)
diff --git a/kernel/auditfilter.c b/kernel/auditfilter.c
index 903caa269b5c..13e997423dcd 100644
--- a/kernel/auditfilter.c
+++ b/kernel/auditfilter.c
@@ -461,6 +461,7 @@ static struct audit_entry *audit_data_to_entry(struct audit_rule_data *data,
 		case AUDIT_ARG1:
 		case AUDIT_ARG2:
 		case AUDIT_ARG3:
+		case AUDIT_OBJ_UID:
 			break;
 		case AUDIT_ARCH:
 			entry->rule.arch_f = f;
diff --git a/kernel/auditsc.c b/kernel/auditsc.c
index 4d8920f5ab88..5cf3ecc01517 100644
--- a/kernel/auditsc.c
+++ b/kernel/auditsc.c
@@ -586,6 +586,18 @@ static int audit_filter_rules(struct task_struct *tsk,
 				}
 			}
 			break;
+		case AUDIT_OBJ_UID:
+			if (name) {
+				result = audit_comparator(name->uid, f->op, f->val);
+			} else if (ctx) {
+				list_for_each_entry(n, &ctx->names_list, list) {
+					if (audit_comparator(n->uid, f->op, f->val)) {
+						++result;
+						break;
+					}
+				}
+			}
+			break;
 		case AUDIT_WATCH:
 			if (name)
 				result = audit_watch_compare(rule->watch, name->ino, name->dev);
-- 
cgit v1.2.3-71-gd317


From 54d3218b31aee5bc9c859ae60fbde933d922448b Mon Sep 17 00:00:00 2001
From: Eric Paris <eparis@redhat.com>
Date: Tue, 3 Jan 2012 14:23:07 -0500
Subject: audit: allow audit matching on inode gid

Much like the ability to filter audit on the uid of an inode collected, we
should be able to filter on the gid of the inode.

Signed-off-by: Eric Paris <eparis@redhat.com>
---
 include/linux/audit.h |  1 +
 kernel/auditfilter.c  |  1 +
 kernel/auditsc.c      | 12 ++++++++++++
 3 files changed, 14 insertions(+)

(limited to 'kernel')

diff --git a/include/linux/audit.h b/include/linux/audit.h
index 55cb3daaf474..e36aa37c88af 100644
--- a/include/linux/audit.h
+++ b/include/linux/audit.h
@@ -224,6 +224,7 @@
 #define AUDIT_DIR	107
 #define AUDIT_FILETYPE	108
 #define AUDIT_OBJ_UID	109
+#define AUDIT_OBJ_GID	110
 
 #define AUDIT_ARG0      200
 #define AUDIT_ARG1      (AUDIT_ARG0+1)
diff --git a/kernel/auditfilter.c b/kernel/auditfilter.c
index 13e997423dcd..f10605c787e6 100644
--- a/kernel/auditfilter.c
+++ b/kernel/auditfilter.c
@@ -462,6 +462,7 @@ static struct audit_entry *audit_data_to_entry(struct audit_rule_data *data,
 		case AUDIT_ARG2:
 		case AUDIT_ARG3:
 		case AUDIT_OBJ_UID:
+		case AUDIT_OBJ_GID:
 			break;
 		case AUDIT_ARCH:
 			entry->rule.arch_f = f;
diff --git a/kernel/auditsc.c b/kernel/auditsc.c
index 5cf3ecc01517..87b375fb12ff 100644
--- a/kernel/auditsc.c
+++ b/kernel/auditsc.c
@@ -598,6 +598,18 @@ static int audit_filter_rules(struct task_struct *tsk,
 				}
 			}
 			break;
+		case AUDIT_OBJ_GID:
+			if (name) {
+				result = audit_comparator(name->gid, f->op, f->val);
+			} else if (ctx) {
+				list_for_each_entry(n, &ctx->names_list, list) {
+					if (audit_comparator(n->gid, f->op, f->val)) {
+						++result;
+						break;
+					}
+				}
+			}
+			break;
 		case AUDIT_WATCH:
 			if (name)
 				result = audit_watch_compare(rule->watch, name->ino, name->dev);
-- 
cgit v1.2.3-71-gd317


From 0a300be6d5be8f66cd96609334710c268d0bfdce Mon Sep 17 00:00:00 2001
From: Eric Paris <eparis@redhat.com>
Date: Tue, 3 Jan 2012 14:23:08 -0500
Subject: audit: remove task argument to audit_set_loginuid

The function always deals with current.  Don't expose an option
pretending one can use it for something.  You can't.

Signed-off-by: Eric Paris <eparis@redhat.com>
---
 fs/proc/base.c        | 2 +-
 include/linux/audit.h | 2 +-
 kernel/auditsc.c      | 6 +++---
 3 files changed, 5 insertions(+), 5 deletions(-)

(limited to 'kernel')

diff --git a/fs/proc/base.c b/fs/proc/base.c
index 8173dfd89cb2..e3cbebbabebd 100644
--- a/fs/proc/base.c
+++ b/fs/proc/base.c
@@ -1228,7 +1228,7 @@ static ssize_t proc_loginuid_write(struct file * file, const char __user * buf,
 		goto out_free_page;
 
 	}
-	length = audit_set_loginuid(current, loginuid);
+	length = audit_set_loginuid(loginuid);
 	if (likely(length == 0))
 		length = count;
 
diff --git a/include/linux/audit.h b/include/linux/audit.h
index e36aa37c88af..7cbd6fe41573 100644
--- a/include/linux/audit.h
+++ b/include/linux/audit.h
@@ -489,7 +489,7 @@ static inline void audit_ptrace(struct task_struct *t)
 extern unsigned int audit_serial(void);
 extern int auditsc_get_stamp(struct audit_context *ctx,
 			      struct timespec *t, unsigned int *serial);
-extern int  audit_set_loginuid(struct task_struct *task, uid_t loginuid);
+extern int  audit_set_loginuid(uid_t loginuid);
 #define audit_get_loginuid(t) ((t)->loginuid)
 #define audit_get_sessionid(t) ((t)->sessionid)
 extern void audit_log_task_context(struct audit_buffer *ab);
diff --git a/kernel/auditsc.c b/kernel/auditsc.c
index 87b375fb12ff..9d6dd7d869c0 100644
--- a/kernel/auditsc.c
+++ b/kernel/auditsc.c
@@ -2163,16 +2163,16 @@ int auditsc_get_stamp(struct audit_context *ctx,
 static atomic_t session_id = ATOMIC_INIT(0);
 
 /**
- * audit_set_loginuid - set a task's audit_context loginuid
- * @task: task whose audit context is being modified
+ * audit_set_loginuid - set current task's audit_context loginuid
  * @loginuid: loginuid value
  *
  * Returns 0.
  *
  * Called (set) from fs/proc/base.c::proc_loginuid_write().
  */
-int audit_set_loginuid(struct task_struct *task, uid_t loginuid)
+int audit_set_loginuid(uid_t loginuid)
 {
+	struct task_struct *task = current;
 	unsigned int sessionid = atomic_inc_return(&session_id);
 	struct audit_context *context = task->audit_context;
 
-- 
cgit v1.2.3-71-gd317


From 633b45454503489209b0d9a45f9e3cd1b852c614 Mon Sep 17 00:00:00 2001
From: Eric Paris <eparis@redhat.com>
Date: Tue, 3 Jan 2012 14:23:08 -0500
Subject: audit: only allow tasks to set their loginuid if it is -1

At the moment we allow tasks to set their loginuid if they have
CAP_AUDIT_CONTROL.  In reality we want tasks to set the loginuid when they
log in and it be impossible to ever reset.  We had to make it mutable even
after it was once set (with the CAP) because on update and admin might have
to restart sshd.  Now sshd would get his loginuid and the next user which
logged in using ssh would not be able to set his loginuid.

Systemd has changed how userspace works and allowed us to make the kernel
work the way it should.  With systemd users (even admins) are not supposed
to restart services directly.  The system will restart the service for
them.  Thus since systemd is going to loginuid==-1, sshd would get -1, and
sshd would be allowed to set a new loginuid without special permissions.

If an admin in this system were to manually start an sshd he is inserting
himself into the system chain of trust and thus, logically, it's his
loginuid that should be used!  Since we have old systems I make this a
Kconfig option.

Signed-off-by: Eric Paris <eparis@redhat.com>
---
 fs/proc/base.c   |  3 ---
 init/Kconfig     | 14 ++++++++++++++
 kernel/auditsc.c | 11 ++++++++++-
 3 files changed, 24 insertions(+), 4 deletions(-)

(limited to 'kernel')

diff --git a/fs/proc/base.c b/fs/proc/base.c
index e3cbebbabebd..482df23036b5 100644
--- a/fs/proc/base.c
+++ b/fs/proc/base.c
@@ -1197,9 +1197,6 @@ static ssize_t proc_loginuid_write(struct file * file, const char __user * buf,
 	ssize_t length;
 	uid_t loginuid;
 
-	if (!capable(CAP_AUDIT_CONTROL))
-		return -EPERM;
-
 	rcu_read_lock();
 	if (current != pid_task(proc_pid(inode), PIDTYPE_PID)) {
 		rcu_read_unlock();
diff --git a/init/Kconfig b/init/Kconfig
index a075765d5fbe..5ad8b775f2ac 100644
--- a/init/Kconfig
+++ b/init/Kconfig
@@ -372,6 +372,20 @@ config AUDIT_TREE
 	depends on AUDITSYSCALL
 	select FSNOTIFY
 
+config AUDIT_LOGINUID_IMMUTABLE
+	bool "Make audit loginuid immutable"
+	depends on AUDIT
+	help
+	  The config option toggles if a task setting it's loginuid requires
+	  CAP_SYS_AUDITCONTROL or if that task should require no special permissions
+	  but should instead only allow setting its loginuid if it was never
+	  previously set.  On systems which use systemd or a similar central
+	  process to restart login services this should be set to true.  On older
+	  systems in which an admin would typically have to directly stop and
+	  start processes this should be set to false.  Setting this to true allows
+	  one to drop potentially dangerous capabilites from the login tasks,
+	  but may not be backwards compatible with older init systems.
+
 source "kernel/irq/Kconfig"
 
 menu "RCU Subsystem"
diff --git a/kernel/auditsc.c b/kernel/auditsc.c
index 9d6dd7d869c0..bd084a13c719 100644
--- a/kernel/auditsc.c
+++ b/kernel/auditsc.c
@@ -2173,9 +2173,18 @@ static atomic_t session_id = ATOMIC_INIT(0);
 int audit_set_loginuid(uid_t loginuid)
 {
 	struct task_struct *task = current;
-	unsigned int sessionid = atomic_inc_return(&session_id);
 	struct audit_context *context = task->audit_context;
+	unsigned int sessionid;
+
+#ifdef CONFIG_AUDIT_LOGINUID_IMMUTABLE
+	if (task->loginuid != -1)
+		return -EPERM;
+#else /* CONFIG_AUDIT_LOGINUID_IMMUTABLE */
+	if (!capable(CAP_AUDIT_CONTROL))
+		return -EPERM;
+#endif  /* CONFIG_AUDIT_LOGINUID_IMMUTABLE */
 
+	sessionid = atomic_inc_return(&session_id);
 	if (context && context->in_syscall) {
 		struct audit_buffer *ab;
 
-- 
cgit v1.2.3-71-gd317


From 4043cde8ecf7f7d880eb1133c201a3d392fd68c3 Mon Sep 17 00:00:00 2001
From: Eric Paris <eparis@redhat.com>
Date: Tue, 3 Jan 2012 14:23:08 -0500
Subject: audit: do not call audit_getname on error

Just a code cleanup really.  We don't need to make a function call just for
it to return on error.  This also makes the VFS function even easier to follow
and removes a conditional on a hot path.

Signed-off-by: Eric Paris <eparis@redhat.com>
---
 fs/namei.c       | 28 +++++++++++++---------------
 kernel/auditsc.c |  3 ---
 2 files changed, 13 insertions(+), 18 deletions(-)

(limited to 'kernel')

diff --git a/fs/namei.c b/fs/namei.c
index c283a1ec008e..208c6aa4a989 100644
--- a/fs/namei.c
+++ b/fs/namei.c
@@ -140,21 +140,19 @@ static int do_getname(const char __user *filename, char *page)
 
 static char *getname_flags(const char __user *filename, int flags, int *empty)
 {
-	char *tmp, *result;
-
-	result = ERR_PTR(-ENOMEM);
-	tmp = __getname();
-	if (tmp)  {
-		int retval = do_getname(filename, tmp);
-
-		result = tmp;
-		if (retval < 0) {
-			if (retval == -ENOENT && empty)
-				*empty = 1;
-			if (retval != -ENOENT || !(flags & LOOKUP_EMPTY)) {
-				__putname(tmp);
-				result = ERR_PTR(retval);
-			}
+	char *result = __getname();
+	int retval;
+
+	if (!result)
+		return ERR_PTR(-ENOMEM);
+
+	retval = do_getname(filename, result);
+	if (retval < 0) {
+		if (retval == -ENOENT && empty)
+			*empty = 1;
+		if (retval != -ENOENT || !(flags & LOOKUP_EMPTY)) {
+			__putname(result);
+			return ERR_PTR(retval);
 		}
 	}
 	audit_getname(result);
diff --git a/kernel/auditsc.c b/kernel/auditsc.c
index bd084a13c719..9161e70a4379 100644
--- a/kernel/auditsc.c
+++ b/kernel/auditsc.c
@@ -1913,9 +1913,6 @@ void __audit_getname(const char *name)
 	struct audit_context *context = current->audit_context;
 	struct audit_names *n;
 
-	if (IS_ERR(name) || !name)
-		return;
-
 	if (!context->in_syscall) {
 #if AUDIT_DEBUG == 2
 		printk(KERN_ERR "%s:%d(:%d): ignoring getname(%p)\n",
-- 
cgit v1.2.3-71-gd317


From 02d86a568c6d2d335256864451ac8ce781bc5652 Mon Sep 17 00:00:00 2001
From: Eric Paris <eparis@redhat.com>
Date: Tue, 3 Jan 2012 14:23:08 -0500
Subject: audit: allow interfield comparison in audit rules

We wish to be able to audit when a uid=500 task accesses a file which is
uid=0.  Or vice versa.  This patch introduces a new audit filter type
AUDIT_FIELD_COMPARE which takes as an 'enum' which indicates which fields
should be compared.  At this point we only define the task->uid vs
inode->uid, but other comparisons can be added.

Signed-off-by: Eric Paris <eparis@redhat.com>
---
 include/linux/audit.h |  4 ++++
 kernel/auditfilter.c  |  5 ++++-
 kernel/auditsc.c      | 30 +++++++++++++++++++++++++++++-
 3 files changed, 37 insertions(+), 2 deletions(-)

(limited to 'kernel')

diff --git a/include/linux/audit.h b/include/linux/audit.h
index 7cbd6fe41573..838e05fc0582 100644
--- a/include/linux/audit.h
+++ b/include/linux/audit.h
@@ -182,7 +182,10 @@
  * AUDIT_UNUSED_BITS is updated if need be. */
 #define AUDIT_UNUSED_BITS	0x07FFFC00
 
+/* AUDIT_FIELD_COMPARE rule list */
+#define AUDIT_COMPARE_UID_TO_OBJ_UID	1
 
+#define AUDIT_MAX_FIELD_COMPARE	AUDIT_COMPARE_UID_TO_OBJ_UID
 /* Rule fields */
 				/* These are useful when checking the
 				 * task structure at task creation time
@@ -225,6 +228,7 @@
 #define AUDIT_FILETYPE	108
 #define AUDIT_OBJ_UID	109
 #define AUDIT_OBJ_GID	110
+#define AUDIT_FIELD_COMPARE	111
 
 #define AUDIT_ARG0      200
 #define AUDIT_ARG1      (AUDIT_ARG0+1)
diff --git a/kernel/auditfilter.c b/kernel/auditfilter.c
index f10605c787e6..a6c3f1abd206 100644
--- a/kernel/auditfilter.c
+++ b/kernel/auditfilter.c
@@ -526,7 +526,6 @@ static struct audit_entry *audit_data_to_entry(struct audit_rule_data *data,
 				goto exit_free;
 			break;
 		case AUDIT_FILTERKEY:
-			err = -EINVAL;
 			if (entry->rule.filterkey || f->val > AUDIT_MAX_KEY_LEN)
 				goto exit_free;
 			str = audit_unpack_string(&bufp, &remain, f->val);
@@ -543,6 +542,10 @@ static struct audit_entry *audit_data_to_entry(struct audit_rule_data *data,
 			if (f->val & ~S_IFMT)
 				goto exit_free;
 			break;
+		case AUDIT_FIELD_COMPARE:
+			if (f->val > AUDIT_MAX_FIELD_COMPARE)
+				goto exit_free;
+			break;
 		default:
 			goto exit_free;
 		}
diff --git a/kernel/auditsc.c b/kernel/auditsc.c
index 9161e70a4379..8fb2c8e6d624 100644
--- a/kernel/auditsc.c
+++ b/kernel/auditsc.c
@@ -463,6 +463,32 @@ static int match_tree_refs(struct audit_context *ctx, struct audit_tree *tree)
 	return 0;
 }
 
+static int audit_field_compare(struct task_struct *tsk,
+			       const struct cred *cred,
+			       struct audit_field *f,
+			       struct audit_context *ctx,
+			       struct audit_names *name)
+{
+	struct audit_names *n;
+
+	switch (f->val) {
+	case AUDIT_COMPARE_UID_TO_OBJ_UID:
+		if (name) {
+			return audit_comparator(cred->uid, f->op, name->uid);
+		} else if (ctx) {
+			list_for_each_entry(n, &ctx->names_list, list) {
+				if (audit_comparator(cred->uid, f->op, n->uid))
+					return 1;
+			}
+		}
+		break;
+	default:
+		WARN(1, "Missing AUDIT_COMPARE define.  Report as a bug\n");
+		return 0;
+	}
+	return 0;
+}
+
 /* Determine if any context name data matches a rule's watch data */
 /* Compare a task_struct with an audit_rule.  Return 1 on match, 0
  * otherwise.
@@ -693,8 +719,10 @@ static int audit_filter_rules(struct task_struct *tsk,
 		case AUDIT_FILETYPE:
 			result = audit_match_filetype(ctx, f->val);
 			break;
+		case AUDIT_FIELD_COMPARE:
+			result = audit_field_compare(tsk, cred, f, ctx, name);
+			break;
 		}
-
 		if (!result)
 			return 0;
 	}
-- 
cgit v1.2.3-71-gd317


From b34b039324bf081554ee8678f9b8c5d937e5206c Mon Sep 17 00:00:00 2001
From: Eric Paris <eparis@redhat.com>
Date: Tue, 3 Jan 2012 14:23:08 -0500
Subject: audit: complex interfield comparison helper

Rather than code the same loop over and over implement a helper function which
uses some pointer magic to make it generic enough to be used numerous places
as we implement more audit interfield comparisons

Signed-off-by: Eric Paris <eparis@redhat.com>
---
 kernel/auditsc.c | 50 +++++++++++++++++++++++++++++++++++++++-----------
 1 file changed, 39 insertions(+), 11 deletions(-)

(limited to 'kernel')

diff --git a/kernel/auditsc.c b/kernel/auditsc.c
index 8fb2c8e6d624..b12cc32fe377 100644
--- a/kernel/auditsc.c
+++ b/kernel/auditsc.c
@@ -463,25 +463,53 @@ static int match_tree_refs(struct audit_context *ctx, struct audit_tree *tree)
 	return 0;
 }
 
+static int audit_compare_id(uid_t uid1,
+			    struct audit_names *name,
+			    unsigned long name_offset,
+			    struct audit_field *f,
+			    struct audit_context *ctx)
+{
+	struct audit_names *n;
+	unsigned long addr;
+	uid_t uid2;
+	int rc;
+
+	if (name) {
+		addr = (unsigned long)name;
+		addr += name_offset;
+
+		uid2 = *(uid_t *)addr;
+		rc = audit_comparator(uid1, f->op, uid2);
+		if (rc)
+			return rc;
+	}
+
+	if (ctx) {
+		list_for_each_entry(n, &ctx->names_list, list) {
+			addr = (unsigned long)n;
+			addr += name_offset;
+
+			uid2 = *(uid_t *)addr;
+
+			rc = audit_comparator(uid1, f->op, uid2);
+			if (rc)
+				return rc;
+		}
+	}
+	return 0;
+}
+
 static int audit_field_compare(struct task_struct *tsk,
 			       const struct cred *cred,
 			       struct audit_field *f,
 			       struct audit_context *ctx,
 			       struct audit_names *name)
 {
-	struct audit_names *n;
-
 	switch (f->val) {
 	case AUDIT_COMPARE_UID_TO_OBJ_UID:
-		if (name) {
-			return audit_comparator(cred->uid, f->op, name->uid);
-		} else if (ctx) {
-			list_for_each_entry(n, &ctx->names_list, list) {
-				if (audit_comparator(cred->uid, f->op, n->uid))
-					return 1;
-			}
-		}
-		break;
+		return audit_compare_id(cred->uid,
+					name, offsetof(struct audit_names, uid),
+					f, ctx);
 	default:
 		WARN(1, "Missing AUDIT_COMPARE define.  Report as a bug\n");
 		return 0;
-- 
cgit v1.2.3-71-gd317


From c9fe685f7a17a0ee8bf3fbe51e40b1c8b8e65896 Mon Sep 17 00:00:00 2001
From: Eric Paris <eparis@redhat.com>
Date: Tue, 3 Jan 2012 14:23:08 -0500
Subject: audit: allow interfield comparison between gid and ogid

Allow audit rules to compare the gid of the running task to the gid of the
inode in question.

Signed-off-by: Eric Paris <eparis@redhat.com>
---
 include/linux/audit.h | 3 ++-
 kernel/auditsc.c      | 6 ++++++
 2 files changed, 8 insertions(+), 1 deletion(-)

(limited to 'kernel')

diff --git a/include/linux/audit.h b/include/linux/audit.h
index 838e05fc0582..fffbc2176ee1 100644
--- a/include/linux/audit.h
+++ b/include/linux/audit.h
@@ -184,8 +184,9 @@
 
 /* AUDIT_FIELD_COMPARE rule list */
 #define AUDIT_COMPARE_UID_TO_OBJ_UID	1
+#define AUDIT_COMPARE_GID_TO_OBJ_GID	2
 
-#define AUDIT_MAX_FIELD_COMPARE	AUDIT_COMPARE_UID_TO_OBJ_UID
+#define AUDIT_MAX_FIELD_COMPARE	AUDIT_COMPARE_GID_TO_OBJ_GID
 /* Rule fields */
 				/* These are useful when checking the
 				 * task structure at task creation time
diff --git a/kernel/auditsc.c b/kernel/auditsc.c
index b12cc32fe377..861c7b9c565a 100644
--- a/kernel/auditsc.c
+++ b/kernel/auditsc.c
@@ -474,6 +474,8 @@ static int audit_compare_id(uid_t uid1,
 	uid_t uid2;
 	int rc;
 
+	BUILD_BUG_ON(sizeof(uid_t) != sizeof(gid_t));
+
 	if (name) {
 		addr = (unsigned long)name;
 		addr += name_offset;
@@ -510,6 +512,10 @@ static int audit_field_compare(struct task_struct *tsk,
 		return audit_compare_id(cred->uid,
 					name, offsetof(struct audit_names, uid),
 					f, ctx);
+	case AUDIT_COMPARE_GID_TO_OBJ_GID:
+		return audit_compare_id(cred->gid,
+					name, offsetof(struct audit_names, gid),
+					f, ctx);
 	default:
 		WARN(1, "Missing AUDIT_COMPARE define.  Report as a bug\n");
 		return 0;
-- 
cgit v1.2.3-71-gd317


From 4a6633ed08af5ba67790b4d1adcdeb8ceb55677e Mon Sep 17 00:00:00 2001
From: Peter Moody <pmoody@google.com>
Date: Tue, 13 Dec 2011 16:17:51 -0800
Subject: audit: implement all object interfield comparisons

This completes the matrix of interfield comparisons between uid/gid
information for the current task and the uid/gid information for inodes.
aka I can audit based on differences between the euid of the process and
the uid of fs objects.

Signed-off-by: Peter Moody <pmoody@google.com>
Signed-off-by: Eric Paris <eparis@redhat.com>
---
 include/linux/audit.h | 10 +++++++++-
 kernel/auditsc.c      | 29 +++++++++++++++++++++++++++++
 2 files changed, 38 insertions(+), 1 deletion(-)

(limited to 'kernel')

diff --git a/include/linux/audit.h b/include/linux/audit.h
index fffbc2176ee1..67113cb4bc15 100644
--- a/include/linux/audit.h
+++ b/include/linux/audit.h
@@ -185,8 +185,16 @@
 /* AUDIT_FIELD_COMPARE rule list */
 #define AUDIT_COMPARE_UID_TO_OBJ_UID	1
 #define AUDIT_COMPARE_GID_TO_OBJ_GID	2
+#define AUDIT_COMPARE_EUID_TO_OBJ_UID	3
+#define AUDIT_COMPARE_EGID_TO_OBJ_GID	4
+#define AUDIT_COMPARE_AUID_TO_OBJ_UID	5
+#define AUDIT_COMPARE_SUID_TO_OBJ_UID	6
+#define AUDIT_COMPARE_SGID_TO_OBJ_GID	7
+#define AUDIT_COMPARE_FSUID_TO_OBJ_UID	8
+#define AUDIT_COMPARE_FSGID_TO_OBJ_GID	9
+
+#define AUDIT_MAX_FIELD_COMPARE		AUDIT_COMPARE_FSGID_TO_OBJ_GID
 
-#define AUDIT_MAX_FIELD_COMPARE	AUDIT_COMPARE_GID_TO_OBJ_GID
 /* Rule fields */
 				/* These are useful when checking the
 				 * task structure at task creation time
diff --git a/kernel/auditsc.c b/kernel/auditsc.c
index 861c7b9c565a..b8cee462b99e 100644
--- a/kernel/auditsc.c
+++ b/kernel/auditsc.c
@@ -508,6 +508,7 @@ static int audit_field_compare(struct task_struct *tsk,
 			       struct audit_names *name)
 {
 	switch (f->val) {
+	/* process to file object comparisons */
 	case AUDIT_COMPARE_UID_TO_OBJ_UID:
 		return audit_compare_id(cred->uid,
 					name, offsetof(struct audit_names, uid),
@@ -516,6 +517,34 @@ static int audit_field_compare(struct task_struct *tsk,
 		return audit_compare_id(cred->gid,
 					name, offsetof(struct audit_names, gid),
 					f, ctx);
+	case AUDIT_COMPARE_EUID_TO_OBJ_UID:
+		return audit_compare_id(cred->euid,
+					name, offsetof(struct audit_names, uid),
+					f, ctx);
+	case AUDIT_COMPARE_EGID_TO_OBJ_GID:
+		return audit_compare_id(cred->egid,
+					name, offsetof(struct audit_names, gid),
+					f, ctx);
+	case AUDIT_COMPARE_AUID_TO_OBJ_UID:
+		return audit_compare_id(tsk->loginuid,
+					name, offsetof(struct audit_names, uid),
+					f, ctx);
+	case AUDIT_COMPARE_SUID_TO_OBJ_UID:
+		return audit_compare_id(cred->suid,
+					name, offsetof(struct audit_names, uid),
+					f, ctx);
+	case AUDIT_COMPARE_SGID_TO_OBJ_GID:
+		return audit_compare_id(cred->sgid,
+					name, offsetof(struct audit_names, gid),
+					f, ctx);
+	case AUDIT_COMPARE_FSUID_TO_OBJ_UID:
+		return audit_compare_id(cred->fsuid,
+					name, offsetof(struct audit_names, uid),
+					f, ctx);
+	case AUDIT_COMPARE_FSGID_TO_OBJ_GID:
+		return audit_compare_id(cred->fsgid,
+					name, offsetof(struct audit_names, gid),
+					f, ctx);
 	default:
 		WARN(1, "Missing AUDIT_COMPARE define.  Report as a bug\n");
 		return 0;
-- 
cgit v1.2.3-71-gd317


From 10d68360871657204885371cdf2594412675d2f9 Mon Sep 17 00:00:00 2001
From: Peter Moody <pmoody@google.com>
Date: Wed, 4 Jan 2012 15:24:31 -0500
Subject: audit: comparison on interprocess fields

This allows audit to specify rules in which we compare two fields of a
process.  Such as is the running process uid != to the running process
euid?

Signed-off-by: Peter Moody <pmoody@google.com>
Signed-off-by: Eric Paris <eparis@redhat.com>
---
 include/linux/audit.h | 24 +++++++++++++++++++++++-
 kernel/auditsc.c      | 39 +++++++++++++++++++++++++++++++++++++++
 2 files changed, 62 insertions(+), 1 deletion(-)

(limited to 'kernel')

diff --git a/include/linux/audit.h b/include/linux/audit.h
index 67113cb4bc15..9ff7a2c48b50 100644
--- a/include/linux/audit.h
+++ b/include/linux/audit.h
@@ -193,7 +193,29 @@
 #define AUDIT_COMPARE_FSUID_TO_OBJ_UID	8
 #define AUDIT_COMPARE_FSGID_TO_OBJ_GID	9
 
-#define AUDIT_MAX_FIELD_COMPARE		AUDIT_COMPARE_FSGID_TO_OBJ_GID
+#define AUDIT_COMPARE_UID_TO_AUID	10
+#define AUDIT_COMPARE_UID_TO_EUID	11
+#define AUDIT_COMPARE_UID_TO_FSUID	12
+#define AUDIT_COMPARE_UID_TO_SUID	13
+
+#define AUDIT_COMPARE_AUID_TO_FSUID	14
+#define AUDIT_COMPARE_AUID_TO_SUID	15
+#define AUDIT_COMPARE_AUID_TO_EUID	16
+
+#define AUDIT_COMPARE_EUID_TO_SUID	17
+#define AUDIT_COMPARE_EUID_TO_FSUID	18
+
+#define AUDIT_COMPARE_SUID_TO_FSUID	19
+
+#define AUDIT_COMPARE_GID_TO_EGID	20
+#define AUDIT_COMPARE_GID_TO_FSGID	21
+#define AUDIT_COMPARE_GID_TO_SGID	22
+
+#define AUDIT_COMPARE_EGID_TO_FSGID	23
+#define AUDIT_COMPARE_EGID_TO_SGID	24
+#define AUDIT_COMPARE_SGID_TO_FSGID	25
+
+#define AUDIT_MAX_FIELD_COMPARE		AUDIT_COMPARE_SGID_TO_FSGID
 
 /* Rule fields */
 				/* These are useful when checking the
diff --git a/kernel/auditsc.c b/kernel/auditsc.c
index b8cee462b99e..593237e3654d 100644
--- a/kernel/auditsc.c
+++ b/kernel/auditsc.c
@@ -545,6 +545,45 @@ static int audit_field_compare(struct task_struct *tsk,
 		return audit_compare_id(cred->fsgid,
 					name, offsetof(struct audit_names, gid),
 					f, ctx);
+	/* uid comparisons */
+	case AUDIT_COMPARE_UID_TO_AUID:
+		return audit_comparator(cred->uid, f->op, tsk->loginuid);
+	case AUDIT_COMPARE_UID_TO_EUID:
+		return audit_comparator(cred->uid, f->op, cred->euid);
+	case AUDIT_COMPARE_UID_TO_SUID:
+		return audit_comparator(cred->uid, f->op, cred->suid);
+	case AUDIT_COMPARE_UID_TO_FSUID:
+		return audit_comparator(cred->uid, f->op, cred->fsuid);
+	/* auid comparisons */
+	case AUDIT_COMPARE_AUID_TO_EUID:
+		return audit_comparator(tsk->loginuid, f->op, cred->euid);
+	case AUDIT_COMPARE_AUID_TO_SUID:
+		return audit_comparator(tsk->loginuid, f->op, cred->suid);
+	case AUDIT_COMPARE_AUID_TO_FSUID:
+		return audit_comparator(tsk->loginuid, f->op, cred->fsuid);
+	/* euid comparisons */
+	case AUDIT_COMPARE_EUID_TO_SUID:
+		return audit_comparator(cred->euid, f->op, cred->suid);
+	case AUDIT_COMPARE_EUID_TO_FSUID:
+		return audit_comparator(cred->euid, f->op, cred->fsuid);
+	/* suid comparisons */
+	case AUDIT_COMPARE_SUID_TO_FSUID:
+		return audit_comparator(cred->suid, f->op, cred->fsuid);
+	/* gid comparisons */
+	case AUDIT_COMPARE_GID_TO_EGID:
+		return audit_comparator(cred->gid, f->op, cred->egid);
+	case AUDIT_COMPARE_GID_TO_SGID:
+		return audit_comparator(cred->gid, f->op, cred->sgid);
+	case AUDIT_COMPARE_GID_TO_FSGID:
+		return audit_comparator(cred->gid, f->op, cred->fsgid);
+	/* egid comparisons */
+	case AUDIT_COMPARE_EGID_TO_SGID:
+		return audit_comparator(cred->egid, f->op, cred->sgid);
+	case AUDIT_COMPARE_EGID_TO_FSGID:
+		return audit_comparator(cred->egid, f->op, cred->fsgid);
+	/* sgid comparison */
+	case AUDIT_COMPARE_SGID_TO_FSGID:
+		return audit_comparator(cred->sgid, f->op, cred->fsgid);
 	default:
 		WARN(1, "Missing AUDIT_COMPARE define.  Report as a bug\n");
 		return 0;
-- 
cgit v1.2.3-71-gd317


From 5afb8a3f96573f7ea018abb768f5b6ebe1a6c1a4 Mon Sep 17 00:00:00 2001
From: Xi Wang <xi.wang@gmail.com>
Date: Tue, 20 Dec 2011 18:39:41 -0500
Subject: audit: fix signedness bug in audit_log_execve_info()

In the loop, a size_t "len" is used to hold the return value of
audit_log_single_execve_arg(), which returns -1 on error.  In that
case the error handling (len <= 0) will be bypassed since "len" is
unsigned, and the loop continues with (p += len) being wrapped.
Change the type of "len" to signed int to fix the error handling.

	size_t len;
	...
	for (...) {
		len = audit_log_single_execve_arg(...);
		if (len <= 0)
			break;
		p += len;
	}

Signed-off-by: Xi Wang <xi.wang@gmail.com>
Signed-off-by: Eric Paris <eparis@redhat.com>
---
 kernel/auditsc.c | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

(limited to 'kernel')

diff --git a/kernel/auditsc.c b/kernel/auditsc.c
index 593237e3654d..86584ecb1039 100644
--- a/kernel/auditsc.c
+++ b/kernel/auditsc.c
@@ -1362,8 +1362,8 @@ static void audit_log_execve_info(struct audit_context *context,
 				  struct audit_buffer **ab,
 				  struct audit_aux_data_execve *axi)
 {
-	int i;
-	size_t len, len_sent = 0;
+	int i, len;
+	size_t len_sent = 0;
 	const char __user *p;
 	char *buf;
 
-- 
cgit v1.2.3-71-gd317


From c158a35c8a681cf68d36f22f058f9f5466386c71 Mon Sep 17 00:00:00 2001
From: Kees Cook <keescook@chromium.org>
Date: Fri, 6 Jan 2012 14:07:10 -0800
Subject: audit: no leading space in audit_log_d_path prefix

audit_log_d_path() injects an additional space before the prefix,
which serves no purpose and doesn't mix well with other audit_log*()
functions that do not sneak extra characters into the log.

Signed-off-by: Kees Cook <keescook@chromium.org>
Signed-off-by: Eric Paris <eparis@redhat.com>
---
 kernel/audit.c       | 2 +-
 kernel/auditsc.c     | 6 +++---
 security/lsm_audit.c | 4 ++--
 3 files changed, 6 insertions(+), 6 deletions(-)

(limited to 'kernel')

diff --git a/kernel/audit.c b/kernel/audit.c
index 00efe4758c86..705c25a70bff 100644
--- a/kernel/audit.c
+++ b/kernel/audit.c
@@ -1423,7 +1423,7 @@ void audit_log_d_path(struct audit_buffer *ab, const char *prefix,
 	char *p, *pathname;
 
 	if (prefix)
-		audit_log_format(ab, " %s", prefix);
+		audit_log_format(ab, "%s", prefix);
 
 	/* We will allow 11 spaces for ' (deleted)' to be appended */
 	pathname = kmalloc(PATH_MAX+11, ab->gfp_mask);
diff --git a/kernel/auditsc.c b/kernel/auditsc.c
index 86584ecb1039..caaea6e944f8 100644
--- a/kernel/auditsc.c
+++ b/kernel/auditsc.c
@@ -1171,7 +1171,7 @@ static void audit_log_task_info(struct audit_buffer *ab, struct task_struct *tsk
 		while (vma) {
 			if ((vma->vm_flags & VM_EXECUTABLE) &&
 			    vma->vm_file) {
-				audit_log_d_path(ab, "exe=",
+				audit_log_d_path(ab, " exe=",
 						 &vma->vm_file->f_path);
 				break;
 			}
@@ -1540,7 +1540,7 @@ static void audit_log_name(struct audit_context *context, struct audit_names *n,
 		case 0:
 			/* name was specified as a relative path and the
 			 * directory component is the cwd */
-			audit_log_d_path(ab, "name=", &context->pwd);
+			audit_log_d_path(ab, " name=", &context->pwd);
 			break;
 		default:
 			/* log the name's directory component */
@@ -1725,7 +1725,7 @@ static void audit_log_exit(struct audit_context *context, struct task_struct *ts
 	if (context->pwd.dentry && context->pwd.mnt) {
 		ab = audit_log_start(context, GFP_KERNEL, AUDIT_CWD);
 		if (ab) {
-			audit_log_d_path(ab, "cwd=", &context->pwd);
+			audit_log_d_path(ab, " cwd=", &context->pwd);
 			audit_log_end(ab);
 		}
 	}
diff --git a/security/lsm_audit.c b/security/lsm_audit.c
index fc41b7cccb53..293b8c45b1d1 100644
--- a/security/lsm_audit.c
+++ b/security/lsm_audit.c
@@ -232,7 +232,7 @@ static void dump_common_audit_data(struct audit_buffer *ab,
 	case LSM_AUDIT_DATA_PATH: {
 		struct inode *inode;
 
-		audit_log_d_path(ab, "path=", &a->u.path);
+		audit_log_d_path(ab, " path=", &a->u.path);
 
 		inode = a->u.path.dentry->d_inode;
 		if (inode) {
@@ -318,7 +318,7 @@ static void dump_common_audit_data(struct audit_buffer *ab,
 						.dentry = u->dentry,
 						.mnt = u->mnt
 					};
-					audit_log_d_path(ab, "path=", &path);
+					audit_log_d_path(ab, " path=", &path);
 					break;
 				}
 				if (!u->addr)
-- 
cgit v1.2.3-71-gd317