From 850fdec8d2fd1eebfa003fea39bec08cd69b6155 Mon Sep 17 00:00:00 2001 From: Greg Kroah-Hartman Date: Mon, 18 Sep 2017 12:17:57 +0200 Subject: driver core: remove DRIVER_ATTR DRIVER_ATTR is no longer in use, and driver authors should be using DRIVER_ATTR_RW() or DRIVER_ATTR_RO() or DRIVER_ATTR_WO() instead in order to always get the permissions correct. So remove it so that no one can use it anymore. Acked-by: Alan Tull Reviewed-by: Moritz Fischer Signed-off-by: Greg Kroah-Hartman --- include/linux/device.h | 2 -- 1 file changed, 2 deletions(-) (limited to 'include/linux') diff --git a/include/linux/device.h b/include/linux/device.h index c6f27207dbe8..2bc70ddda09b 100644 --- a/include/linux/device.h +++ b/include/linux/device.h @@ -307,8 +307,6 @@ struct driver_attribute { size_t count); }; -#define DRIVER_ATTR(_name, _mode, _show, _store) \ - struct driver_attribute driver_attr_##_name = __ATTR(_name, _mode, _show, _store) #define DRIVER_ATTR_RW(_name) \ struct driver_attribute driver_attr_##_name = __ATTR_RW(_name) #define DRIVER_ATTR_RO(_name) \ -- cgit v1.2.3-71-gd317 From 7fc10de8d49a748c476532c9d8e8fe19e548dd67 Mon Sep 17 00:00:00 2001 From: Dragos Bogdan Date: Tue, 5 Sep 2017 15:14:45 +0300 Subject: iio: ad_sigma_delta: Implement a dedicated reset function Since most of the SD ADCs have the option of reseting the serial interface by sending a number of SCLKs with CS = 0 and DIN = 1, a dedicated function that can do this is usefull. Needed for the patch: iio: ad7793: Fix the serial interface reset Signed-off-by: Dragos Bogdan Acked-by: Lars-Peter Clausen Cc: Signed-off-by: Jonathan Cameron --- drivers/iio/adc/ad_sigma_delta.c | 28 ++++++++++++++++++++++++++++ include/linux/iio/adc/ad_sigma_delta.h | 3 +++ 2 files changed, 31 insertions(+) (limited to 'include/linux') diff --git a/drivers/iio/adc/ad_sigma_delta.c b/drivers/iio/adc/ad_sigma_delta.c index d10bd0c97233..22c4c17cd996 100644 --- a/drivers/iio/adc/ad_sigma_delta.c +++ b/drivers/iio/adc/ad_sigma_delta.c @@ -177,6 +177,34 @@ out: } EXPORT_SYMBOL_GPL(ad_sd_read_reg); +/** + * ad_sd_reset() - Reset the serial interface + * + * @sigma_delta: The sigma delta device + * @reset_length: Number of SCLKs with DIN = 1 + * + * Returns 0 on success, an error code otherwise. + **/ +int ad_sd_reset(struct ad_sigma_delta *sigma_delta, + unsigned int reset_length) +{ + uint8_t *buf; + unsigned int size; + int ret; + + size = DIV_ROUND_UP(reset_length, 8); + buf = kcalloc(size, sizeof(*buf), GFP_KERNEL); + if (!buf) + return -ENOMEM; + + memset(buf, 0xff, size); + ret = spi_write(sigma_delta->spi, buf, size); + kfree(buf); + + return ret; +} +EXPORT_SYMBOL_GPL(ad_sd_reset); + static int ad_sd_calibrate(struct ad_sigma_delta *sigma_delta, unsigned int mode, unsigned int channel) { diff --git a/include/linux/iio/adc/ad_sigma_delta.h b/include/linux/iio/adc/ad_sigma_delta.h index 5ba430cc9a87..1fc7abd28b0b 100644 --- a/include/linux/iio/adc/ad_sigma_delta.h +++ b/include/linux/iio/adc/ad_sigma_delta.h @@ -111,6 +111,9 @@ int ad_sd_write_reg(struct ad_sigma_delta *sigma_delta, unsigned int reg, int ad_sd_read_reg(struct ad_sigma_delta *sigma_delta, unsigned int reg, unsigned int size, unsigned int *val); +int ad_sd_reset(struct ad_sigma_delta *sigma_delta, + unsigned int reset_length); + int ad_sigma_delta_single_conversion(struct iio_dev *indio_dev, const struct iio_chan_spec *chan, int *val); int ad_sd_calibrate_all(struct ad_sigma_delta *sigma_delta, -- cgit v1.2.3-71-gd317 From 237bbd29f7a049d310d907f4b2716a7feef9abf3 Mon Sep 17 00:00:00 2001 From: Eric Biggers Date: Mon, 18 Sep 2017 11:37:03 -0700 Subject: KEYS: prevent creating a different user's keyrings It was possible for an unprivileged user to create the user and user session keyrings for another user. For example: sudo -u '#3000' sh -c 'keyctl add keyring _uid.4000 "" @u keyctl add keyring _uid_ses.4000 "" @u sleep 15' & sleep 1 sudo -u '#4000' keyctl describe @u sudo -u '#4000' keyctl describe @us This is problematic because these "fake" keyrings won't have the right permissions. In particular, the user who created them first will own them and will have full access to them via the possessor permissions, which can be used to compromise the security of a user's keys: -4: alswrv-----v------------ 3000 0 keyring: _uid.4000 -5: alswrv-----v------------ 3000 0 keyring: _uid_ses.4000 Fix it by marking user and user session keyrings with a flag KEY_FLAG_UID_KEYRING. Then, when searching for a user or user session keyring by name, skip all keyrings that don't have the flag set. Fixes: 69664cf16af4 ("keys: don't generate user and user session keyrings unless they're accessed") Cc: [v2.6.26+] Signed-off-by: Eric Biggers Signed-off-by: David Howells --- include/linux/key.h | 2 ++ security/keys/internal.h | 2 +- security/keys/key.c | 2 ++ security/keys/keyring.c | 23 ++++++++++++++--------- security/keys/process_keys.c | 6 ++++-- 5 files changed, 23 insertions(+), 12 deletions(-) (limited to 'include/linux') diff --git a/include/linux/key.h b/include/linux/key.h index 044114185120..e315e16b6ff8 100644 --- a/include/linux/key.h +++ b/include/linux/key.h @@ -187,6 +187,7 @@ struct key { #define KEY_FLAG_BUILTIN 8 /* set if key is built in to the kernel */ #define KEY_FLAG_ROOT_CAN_INVAL 9 /* set if key can be invalidated by root without permission */ #define KEY_FLAG_KEEP 10 /* set if key should not be removed */ +#define KEY_FLAG_UID_KEYRING 11 /* set if key is a user or user session keyring */ /* the key type and key description string * - the desc is used to match a key against search criteria @@ -243,6 +244,7 @@ extern struct key *key_alloc(struct key_type *type, #define KEY_ALLOC_NOT_IN_QUOTA 0x0002 /* not in quota */ #define KEY_ALLOC_BUILT_IN 0x0004 /* Key is built into kernel */ #define KEY_ALLOC_BYPASS_RESTRICTION 0x0008 /* Override the check on restricted keyrings */ +#define KEY_ALLOC_UID_KEYRING 0x0010 /* allocating a user or user session keyring */ extern void key_revoke(struct key *key); extern void key_invalidate(struct key *key); diff --git a/security/keys/internal.h b/security/keys/internal.h index 1c02c6547038..503adbae7b0d 100644 --- a/security/keys/internal.h +++ b/security/keys/internal.h @@ -141,7 +141,7 @@ extern key_ref_t keyring_search_aux(key_ref_t keyring_ref, extern key_ref_t search_my_process_keyrings(struct keyring_search_context *ctx); extern key_ref_t search_process_keyrings(struct keyring_search_context *ctx); -extern struct key *find_keyring_by_name(const char *name, bool skip_perm_check); +extern struct key *find_keyring_by_name(const char *name, bool uid_keyring); extern int install_user_keyrings(void); extern int install_thread_keyring_to_cred(struct cred *); diff --git a/security/keys/key.c b/security/keys/key.c index 83da68d98b40..e5c0896c3a8f 100644 --- a/security/keys/key.c +++ b/security/keys/key.c @@ -302,6 +302,8 @@ struct key *key_alloc(struct key_type *type, const char *desc, key->flags |= 1 << KEY_FLAG_IN_QUOTA; if (flags & KEY_ALLOC_BUILT_IN) key->flags |= 1 << KEY_FLAG_BUILTIN; + if (flags & KEY_ALLOC_UID_KEYRING) + key->flags |= 1 << KEY_FLAG_UID_KEYRING; #ifdef KEY_DEBUGGING key->magic = KEY_DEBUG_MAGIC; diff --git a/security/keys/keyring.c b/security/keys/keyring.c index 94f038967c17..4fa82a8a9c0e 100644 --- a/security/keys/keyring.c +++ b/security/keys/keyring.c @@ -1097,15 +1097,15 @@ found: /* * Find a keyring with the specified name. * - * All named keyrings in the current user namespace are searched, provided they - * grant Search permission directly to the caller (unless this check is - * skipped). Keyrings whose usage points have reached zero or who have been - * revoked are skipped. + * Only keyrings that have nonzero refcount, are not revoked, and are owned by a + * user in the current user namespace are considered. If @uid_keyring is %true, + * the keyring additionally must have been allocated as a user or user session + * keyring; otherwise, it must grant Search permission directly to the caller. * * Returns a pointer to the keyring with the keyring's refcount having being * incremented on success. -ENOKEY is returned if a key could not be found. */ -struct key *find_keyring_by_name(const char *name, bool skip_perm_check) +struct key *find_keyring_by_name(const char *name, bool uid_keyring) { struct key *keyring; int bucket; @@ -1133,10 +1133,15 @@ struct key *find_keyring_by_name(const char *name, bool skip_perm_check) if (strcmp(keyring->description, name) != 0) continue; - if (!skip_perm_check && - key_permission(make_key_ref(keyring, 0), - KEY_NEED_SEARCH) < 0) - continue; + if (uid_keyring) { + if (!test_bit(KEY_FLAG_UID_KEYRING, + &keyring->flags)) + continue; + } else { + if (key_permission(make_key_ref(keyring, 0), + KEY_NEED_SEARCH) < 0) + continue; + } /* we've got a match but we might end up racing with * key_cleanup() if the keyring is currently 'dead' diff --git a/security/keys/process_keys.c b/security/keys/process_keys.c index 86bced9fdbdf..293d3598153b 100644 --- a/security/keys/process_keys.c +++ b/security/keys/process_keys.c @@ -77,7 +77,8 @@ int install_user_keyrings(void) if (IS_ERR(uid_keyring)) { uid_keyring = keyring_alloc(buf, user->uid, INVALID_GID, cred, user_keyring_perm, - KEY_ALLOC_IN_QUOTA, + KEY_ALLOC_UID_KEYRING | + KEY_ALLOC_IN_QUOTA, NULL, NULL); if (IS_ERR(uid_keyring)) { ret = PTR_ERR(uid_keyring); @@ -94,7 +95,8 @@ int install_user_keyrings(void) session_keyring = keyring_alloc(buf, user->uid, INVALID_GID, cred, user_keyring_perm, - KEY_ALLOC_IN_QUOTA, + KEY_ALLOC_UID_KEYRING | + KEY_ALLOC_IN_QUOTA, NULL, NULL); if (IS_ERR(session_keyring)) { ret = PTR_ERR(session_keyring); -- cgit v1.2.3-71-gd317 From 5acb3cc2c2e9d3020a4fee43763c6463767f1572 Mon Sep 17 00:00:00 2001 From: Waiman Long Date: Wed, 20 Sep 2017 13:12:20 -0600 Subject: blktrace: Fix potential deadlock between delete & sysfs ops The lockdep code had reported the following unsafe locking scenario: CPU0 CPU1 ---- ---- lock(s_active#228); lock(&bdev->bd_mutex/1); lock(s_active#228); lock(&bdev->bd_mutex); *** DEADLOCK *** The deadlock may happen when one task (CPU1) is trying to delete a partition in a block device and another task (CPU0) is accessing tracing sysfs file (e.g. /sys/block/dm-1/trace/act_mask) in that partition. The s_active isn't an actual lock. It is a reference count (kn->count) on the sysfs (kernfs) file. Removal of a sysfs file, however, require a wait until all the references are gone. The reference count is treated like a rwsem using lockdep instrumentation code. The fact that a thread is in the sysfs callback method or in the ioctl call means there is a reference to the opended sysfs or device file. That should prevent the underlying block structure from being removed. Instead of using bd_mutex in the block_device structure, a new blk_trace_mutex is now added to the request_queue structure to protect access to the blk_trace structure. Suggested-by: Christoph Hellwig Signed-off-by: Waiman Long Acked-by: Steven Rostedt (VMware) Fix typo in patch subject line, and prune a comment detailing how the code used to work. Signed-off-by: Jens Axboe --- block/blk-core.c | 3 +++ include/linux/blkdev.h | 1 + kernel/trace/blktrace.c | 18 ++++++++++++------ 3 files changed, 16 insertions(+), 6 deletions(-) (limited to 'include/linux') diff --git a/block/blk-core.c b/block/blk-core.c index aebe676225e6..048be4aa6024 100644 --- a/block/blk-core.c +++ b/block/blk-core.c @@ -854,6 +854,9 @@ struct request_queue *blk_alloc_queue_node(gfp_t gfp_mask, int node_id) kobject_init(&q->kobj, &blk_queue_ktype); +#ifdef CONFIG_BLK_DEV_IO_TRACE + mutex_init(&q->blk_trace_mutex); +#endif mutex_init(&q->sysfs_lock); spin_lock_init(&q->__queue_lock); diff --git a/include/linux/blkdev.h b/include/linux/blkdev.h index 460294bb0fa5..02fa42d24b52 100644 --- a/include/linux/blkdev.h +++ b/include/linux/blkdev.h @@ -551,6 +551,7 @@ struct request_queue { int node; #ifdef CONFIG_BLK_DEV_IO_TRACE struct blk_trace *blk_trace; + struct mutex blk_trace_mutex; #endif /* * for flush operations diff --git a/kernel/trace/blktrace.c b/kernel/trace/blktrace.c index 2a685b45b73b..45a3928544ce 100644 --- a/kernel/trace/blktrace.c +++ b/kernel/trace/blktrace.c @@ -648,6 +648,12 @@ int blk_trace_startstop(struct request_queue *q, int start) } EXPORT_SYMBOL_GPL(blk_trace_startstop); +/* + * When reading or writing the blktrace sysfs files, the references to the + * opened sysfs or device files should prevent the underlying block device + * from being removed. So no further delete protection is really needed. + */ + /** * blk_trace_ioctl: - handle the ioctls associated with tracing * @bdev: the block device @@ -665,7 +671,7 @@ int blk_trace_ioctl(struct block_device *bdev, unsigned cmd, char __user *arg) if (!q) return -ENXIO; - mutex_lock(&bdev->bd_mutex); + mutex_lock(&q->blk_trace_mutex); switch (cmd) { case BLKTRACESETUP: @@ -691,7 +697,7 @@ int blk_trace_ioctl(struct block_device *bdev, unsigned cmd, char __user *arg) break; } - mutex_unlock(&bdev->bd_mutex); + mutex_unlock(&q->blk_trace_mutex); return ret; } @@ -1727,7 +1733,7 @@ static ssize_t sysfs_blk_trace_attr_show(struct device *dev, if (q == NULL) goto out_bdput; - mutex_lock(&bdev->bd_mutex); + mutex_lock(&q->blk_trace_mutex); if (attr == &dev_attr_enable) { ret = sprintf(buf, "%u\n", !!q->blk_trace); @@ -1746,7 +1752,7 @@ static ssize_t sysfs_blk_trace_attr_show(struct device *dev, ret = sprintf(buf, "%llu\n", q->blk_trace->end_lba); out_unlock_bdev: - mutex_unlock(&bdev->bd_mutex); + mutex_unlock(&q->blk_trace_mutex); out_bdput: bdput(bdev); out: @@ -1788,7 +1794,7 @@ static ssize_t sysfs_blk_trace_attr_store(struct device *dev, if (q == NULL) goto out_bdput; - mutex_lock(&bdev->bd_mutex); + mutex_lock(&q->blk_trace_mutex); if (attr == &dev_attr_enable) { if (value) @@ -1814,7 +1820,7 @@ static ssize_t sysfs_blk_trace_attr_store(struct device *dev, } out_unlock_bdev: - mutex_unlock(&bdev->bd_mutex); + mutex_unlock(&q->blk_trace_mutex); out_bdput: bdput(bdev); out: -- cgit v1.2.3-71-gd317 From c98cb3bd882119e7e1a7c8df2f1eacfcc701450b Mon Sep 17 00:00:00 2001 From: James Smart Date: Thu, 7 Sep 2017 16:27:25 -0700 Subject: nvme.h: remove FC transport-specific error values The NVM express group recinded the reserved range for the transport. Remove the FC-centric values that had been defined. Signed-off-by: James Smart Signed-off-by: Christoph Hellwig Signed-off-by: Jens Axboe --- include/linux/nvme.h | 13 ------------- 1 file changed, 13 deletions(-) (limited to 'include/linux') diff --git a/include/linux/nvme.h b/include/linux/nvme.h index 87723c86f136..2440be32be1d 100644 --- a/include/linux/nvme.h +++ b/include/linux/nvme.h @@ -1127,19 +1127,6 @@ enum { NVME_SC_UNWRITTEN_BLOCK = 0x287, NVME_SC_DNR = 0x4000, - - - /* - * FC Transport-specific error status values for NVME commands - * - * Transport-specific status code values must be in the range 0xB0..0xBF - */ - - /* Generic FC failure - catchall */ - NVME_SC_FC_TRANSPORT_ERROR = 0x00B0, - - /* I/O failure due to FC ABTS'd */ - NVME_SC_FC_TRANSPORT_ABORTED = 0x00B1, }; struct nvme_completion { -- cgit v1.2.3-71-gd317 From d85cf207499e6740ab9c490ff4f360af5c432d23 Mon Sep 17 00:00:00 2001 From: James Smart Date: Thu, 7 Sep 2017 13:20:23 -0700 Subject: nvme: add transport SGL definitions Add transport SGL defintions from NVMe TP 4008, required for the final NVMe-FC standard. Signed-off-by: James Smart Signed-off-by: Christoph Hellwig Signed-off-by: Jens Axboe --- include/linux/nvme.h | 6 ++++++ 1 file changed, 6 insertions(+) (limited to 'include/linux') diff --git a/include/linux/nvme.h b/include/linux/nvme.h index 2440be32be1d..9310ce77d8e1 100644 --- a/include/linux/nvme.h +++ b/include/linux/nvme.h @@ -471,12 +471,14 @@ enum nvme_opcode { * * @NVME_SGL_FMT_ADDRESS: absolute address of the data block * @NVME_SGL_FMT_OFFSET: relative offset of the in-capsule data block + * @NVME_SGL_FMT_TRANSPORT_A: transport defined format, value 0xA * @NVME_SGL_FMT_INVALIDATE: RDMA transport specific remote invalidation * request subtype */ enum { NVME_SGL_FMT_ADDRESS = 0x00, NVME_SGL_FMT_OFFSET = 0x01, + NVME_SGL_FMT_TRANSPORT_A = 0x0A, NVME_SGL_FMT_INVALIDATE = 0x0f, }; @@ -490,12 +492,16 @@ enum { * * For struct nvme_keyed_sgl_desc: * @NVME_KEY_SGL_FMT_DATA_DESC: keyed data block descriptor + * + * Transport-specific SGL types: + * @NVME_TRANSPORT_SGL_DATA_DESC: Transport SGL data dlock descriptor */ enum { NVME_SGL_FMT_DATA_DESC = 0x00, NVME_SGL_FMT_SEG_DESC = 0x02, NVME_SGL_FMT_LAST_SEG_DESC = 0x03, NVME_KEY_SGL_FMT_DATA_DESC = 0x04, + NVME_TRANSPORT_SGL_DATA_DESC = 0x05, }; struct nvme_sgl_desc { -- cgit v1.2.3-71-gd317 From fe59493240169a2cc3f445ae5f2a2308fda06b63 Mon Sep 17 00:00:00 2001 From: Geert Uytterhoeven Date: Mon, 11 Sep 2017 14:29:15 +0200 Subject: PCI: Add dummy pci_acs_enabled() for CONFIG_PCI=n build If CONFIG_PCI=n and gcc (e.g. 4.1.2) decides not to inline get_pci_function_alias_group(), the build fails with: drivers/iommu/iommu.o: In function `get_pci_function_alias_group': iommu.c:(.text+0xfdc): undefined reference to `pci_acs_enabled' Due to the various dummies for PCI calls in the CONFIG_PCI=n case, pci_acs_enabled() never called, but not all versions of gcc are smart enough to realize that. While explicitly marking get_pci_function_alias_group() inline would fix the build, this would inflate the code for the CONFIG_PCI=y case, as get_pci_function_alias_group() is a not-so-small function called from two places. Hence fix the issue by introducing a dummy for pci_acs_enabled() instead. Fixes: 0ae349a0f33f ("iommu/qcom: Add qcom_iommu") Signed-off-by: Geert Uytterhoeven Signed-off-by: Bjorn Helgaas Reviewed-by: Alex Williamson --- include/linux/pci.h | 2 ++ 1 file changed, 2 insertions(+) (limited to 'include/linux') diff --git a/include/linux/pci.h b/include/linux/pci.h index f68c58a93dd0..f4f8ee5a7362 100644 --- a/include/linux/pci.h +++ b/include/linux/pci.h @@ -1685,6 +1685,8 @@ static inline int pci_get_new_domain_nr(void) { return -ENOSYS; } #define dev_is_pci(d) (false) #define dev_is_pf(d) (false) +static inline bool pci_acs_enabled(struct pci_dev *pdev, u16 acs_flags) +{ return false; } #endif /* CONFIG_PCI */ /* Include architecture-dependent settings and functions */ -- cgit v1.2.3-71-gd317 From 6b71f9e1e849f82abb4a8d54ce7f4b1c71f19ac4 Mon Sep 17 00:00:00 2001 From: James Smart Date: Wed, 20 Sep 2017 11:07:26 -0700 Subject: nvmet-fc: sync header templates with comments Comments were incorrect: - defer_rcv was in host port template. moved to target port template - Added Mandatory statements for target port template items Signed-off-by: James Smart Signed-off-by: Christoph Hellwig Signed-off-by: Jens Axboe --- include/linux/nvme-fc-driver.h | 13 ++++++++----- 1 file changed, 8 insertions(+), 5 deletions(-) (limited to 'include/linux') diff --git a/include/linux/nvme-fc-driver.h b/include/linux/nvme-fc-driver.h index 9c5cb4480806..a726f96010d5 100644 --- a/include/linux/nvme-fc-driver.h +++ b/include/linux/nvme-fc-driver.h @@ -346,11 +346,6 @@ struct nvme_fc_remote_port { * indicating an FC transport Aborted status. * Entrypoint is Mandatory. * - * @defer_rcv: Called by the transport to signal the LLLD that it has - * begun processing of a previously received NVME CMD IU. The LLDD - * is now free to re-use the rcv buffer associated with the - * nvmefc_tgt_fcp_req. - * * @max_hw_queues: indicates the maximum number of hw queues the LLDD * supports for cpu affinitization. * Value is Mandatory. Must be at least 1. @@ -806,11 +801,19 @@ struct nvmet_fc_target_port { * outstanding operation (if there was one) to complete, then will * call the fcp_req_release() callback to return the command's * exchange context back to the LLDD. + * Entrypoint is Mandatory. * * @fcp_req_release: Called by the transport to return a nvmefc_tgt_fcp_req * to the LLDD after all operations on the fcp operation are complete. * This may be due to the command completing or upon completion of * abort cleanup. + * Entrypoint is Mandatory. + * + * @defer_rcv: Called by the transport to signal the LLLD that it has + * begun processing of a previously received NVME CMD IU. The LLDD + * is now free to re-use the rcv buffer associated with the + * nvmefc_tgt_fcp_req. + * Entrypoint is Optional. * * @max_hw_queues: indicates the maximum number of hw queues the LLDD * supports for cpu affinitization. -- cgit v1.2.3-71-gd317 From fac1c2040203363eab6c6e86ce883cb71390418f Mon Sep 17 00:00:00 2001 From: Peter Zijlstra Date: Wed, 20 Sep 2017 19:00:15 +0200 Subject: smp/hotplug: Add state diagram Add a state diagram to clarify when which states are ran where. Signed-off-by: Peter Zijlstra (Intel) Signed-off-by: Thomas Gleixner Cc: bigeasy@linutronix.de Cc: efault@gmx.de Cc: rostedt@goodmis.org Cc: max.byungchul.park@gmail.com Link: https://lkml.kernel.org/r/20170920170546.661598270@infradead.org --- include/linux/cpuhotplug.h | 18 ++++++++++++++++++ 1 file changed, 18 insertions(+) (limited to 'include/linux') diff --git a/include/linux/cpuhotplug.h b/include/linux/cpuhotplug.h index f24bfb2b9a2d..477b2e6f60f7 100644 --- a/include/linux/cpuhotplug.h +++ b/include/linux/cpuhotplug.h @@ -3,6 +3,24 @@ #include +/* + * CPU-up CPU-down + * + * BP AP BP AP + * + * OFFLINE OFFLINE + * | ^ + * v | + * BRINGUP_CPU->AP_OFFLINE BRINGUP_CPU <- AP_IDLE_DEAD (idle thread/play_dead) + * | AP_OFFLINE + * v (IRQ-off) ,---------------^ + * AP_ONLNE | (stop_machine) + * | TEARDOWN_CPU <- AP_ONLINE_IDLE + * | ^ + * v | + * AP_ACTIVE AP_ACTIVE + */ + enum cpuhp_state { CPUHP_OFFLINE, CPUHP_CREATE_THREADS, -- cgit v1.2.3-71-gd317 From 1db49484f21ed0fcdadd0635a3669f5f386546fa Mon Sep 17 00:00:00 2001 From: Peter Zijlstra Date: Wed, 20 Sep 2017 19:00:21 +0200 Subject: smp/hotplug: Hotplug state fail injection Add a sysfs file to one-time fail a specific state. This can be used to test the state rollback code paths. Something like this (hotplug-up.sh): #!/bin/bash echo 0 > /debug/sched_debug echo 1 > /debug/tracing/events/cpuhp/enable ALL_STATES=`cat /sys/devices/system/cpu/hotplug/states | cut -d':' -f1` STATES=${1:-$ALL_STATES} for state in $STATES do echo 0 > /sys/devices/system/cpu/cpu1/online echo 0 > /debug/tracing/trace echo Fail state: $state echo $state > /sys/devices/system/cpu/cpu1/hotplug/fail cat /sys/devices/system/cpu/cpu1/hotplug/fail echo 1 > /sys/devices/system/cpu/cpu1/online cat /debug/tracing/trace > hotfail-${state}.trace sleep 1 done Can be used to test for all possible rollback (barring multi-instance) scenarios on CPU-up, CPU-down is a trivial modification of the above. Signed-off-by: Peter Zijlstra (Intel) Signed-off-by: Thomas Gleixner Cc: bigeasy@linutronix.de Cc: efault@gmx.de Cc: rostedt@goodmis.org Cc: max.byungchul.park@gmail.com Link: https://lkml.kernel.org/r/20170920170546.972581715@infradead.org --- include/linux/cpuhotplug.h | 3 ++- kernel/cpu.c | 60 +++++++++++++++++++++++++++++++++++++++++++++- 2 files changed, 61 insertions(+), 2 deletions(-) (limited to 'include/linux') diff --git a/include/linux/cpuhotplug.h b/include/linux/cpuhotplug.h index 477b2e6f60f7..6d508767e144 100644 --- a/include/linux/cpuhotplug.h +++ b/include/linux/cpuhotplug.h @@ -22,7 +22,8 @@ */ enum cpuhp_state { - CPUHP_OFFLINE, + CPUHP_INVALID = -1, + CPUHP_OFFLINE = 0, CPUHP_CREATE_THREADS, CPUHP_PERF_PREPARE, CPUHP_PERF_X86_PREPARE, diff --git a/kernel/cpu.c b/kernel/cpu.c index 6bbe261b851f..8de11a29e495 100644 --- a/kernel/cpu.c +++ b/kernel/cpu.c @@ -52,6 +52,7 @@ struct cpuhp_cpu_state { enum cpuhp_state state; enum cpuhp_state target; + enum cpuhp_state fail; #ifdef CONFIG_SMP struct task_struct *thread; bool should_run; @@ -67,7 +68,9 @@ struct cpuhp_cpu_state { #endif }; -static DEFINE_PER_CPU(struct cpuhp_cpu_state, cpuhp_state); +static DEFINE_PER_CPU(struct cpuhp_cpu_state, cpuhp_state) = { + .fail = CPUHP_INVALID, +}; #if defined(CONFIG_LOCKDEP) && defined(CONFIG_SMP) static struct lockdep_map cpuhp_state_up_map = @@ -160,6 +163,15 @@ static int cpuhp_invoke_callback(unsigned int cpu, enum cpuhp_state state, int (*cb)(unsigned int cpu); int ret, cnt; + if (st->fail == state) { + st->fail = CPUHP_INVALID; + + if (!(bringup ? step->startup.single : step->teardown.single)) + return 0; + + return -EAGAIN; + } + if (!step->multi_instance) { WARN_ON_ONCE(lastp && *lastp); cb = bringup ? step->startup.single : step->teardown.single; @@ -1805,9 +1817,55 @@ static ssize_t show_cpuhp_target(struct device *dev, } static DEVICE_ATTR(target, 0644, show_cpuhp_target, write_cpuhp_target); + +static ssize_t write_cpuhp_fail(struct device *dev, + struct device_attribute *attr, + const char *buf, size_t count) +{ + struct cpuhp_cpu_state *st = per_cpu_ptr(&cpuhp_state, dev->id); + struct cpuhp_step *sp; + int fail, ret; + + ret = kstrtoint(buf, 10, &fail); + if (ret) + return ret; + + /* + * Cannot fail STARTING/DYING callbacks. + */ + if (cpuhp_is_atomic_state(fail)) + return -EINVAL; + + /* + * Cannot fail anything that doesn't have callbacks. + */ + mutex_lock(&cpuhp_state_mutex); + sp = cpuhp_get_step(fail); + if (!sp->startup.single && !sp->teardown.single) + ret = -EINVAL; + mutex_unlock(&cpuhp_state_mutex); + if (ret) + return ret; + + st->fail = fail; + + return count; +} + +static ssize_t show_cpuhp_fail(struct device *dev, + struct device_attribute *attr, char *buf) +{ + struct cpuhp_cpu_state *st = per_cpu_ptr(&cpuhp_state, dev->id); + + return sprintf(buf, "%d\n", st->fail); +} + +static DEVICE_ATTR(fail, 0644, show_cpuhp_fail, write_cpuhp_fail); + static struct attribute *cpuhp_cpu_attrs[] = { &dev_attr_state.attr, &dev_attr_target.attr, + &dev_attr_fail.attr, NULL }; -- cgit v1.2.3-71-gd317 From 50ce6312f293e129eedf2affc7bd791c71d8287e Mon Sep 17 00:00:00 2001 From: Jean-Philippe Brucker Date: Tue, 26 Sep 2017 19:32:52 +0100 Subject: iommu: Fix comment for iommu_ops.map_sg The definition of map_sg was split during a recent addition to iommu_ops. Put it back together. Fixes: add02cfdc9bc ("iommu: Introduce Interface for IOMMU TLB Flushing") Signed-off-by: Jean-Philippe Brucker Signed-off-by: Joerg Roedel --- include/linux/iommu.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'include/linux') diff --git a/include/linux/iommu.h b/include/linux/iommu.h index a7f2ac689d29..41b8c5757859 100644 --- a/include/linux/iommu.h +++ b/include/linux/iommu.h @@ -167,11 +167,11 @@ struct iommu_resv_region { * @map: map a physically contiguous memory region to an iommu domain * @unmap: unmap a physically contiguous memory region from an iommu domain * @map_sg: map a scatter-gather list of physically contiguous memory chunks + * to an iommu domain * @flush_tlb_all: Synchronously flush all hardware TLBs for this domain * @tlb_range_add: Add a given iova range to the flush queue for this domain * @tlb_sync: Flush all queued ranges from the hardware TLBs and empty flush * queue - * to an iommu domain * @iova_to_phys: translate iova to physical address * @add_device: add device to iommu grouping * @remove_device: remove device from iommu grouping -- cgit v1.2.3-71-gd317 From 686fef928bba6be13cabe639f154af7d72b63120 Mon Sep 17 00:00:00 2001 From: Kees Cook Date: Thu, 28 Sep 2017 06:38:17 -0700 Subject: timer: Prepare to change timer callback argument type Modern kernel callback systems pass the structure associated with a given callback to the callback function. The timer callback remains one of the legacy cases where an arbitrary unsigned long argument continues to be passed as the callback argument. This has several problems: - This bloats the timer_list structure with a normally redundant .data field. - No type checking is being performed, forcing callbacks to do explicit type casts of the unsigned long argument into the object that was passed, rather than using container_of(), as done in most of the other callback infrastructure. - Neighboring buffer overflows can overwrite both the .function and the .data field, providing attackers with a way to elevate from a buffer overflow into a simplistic ROP-like mechanism that allows calling arbitrary functions with a controlled first argument. - For future Control Flow Integrity work, this creates a unique function prototype for timer callbacks, instead of allowing them to continue to be clustered with other void functions that take a single unsigned long argument. This adds a new timer initialization API, which will ultimately replace the existing setup_timer(), setup_{deferrable,pinned,etc}_timer() family, named timer_setup() (to mirror hrtimer_setup(), making instances of its use much easier to grep for). In order to support the migration of existing timers into the new callback arguments, timer_setup() casts its arguments to the existing legacy types, and explicitly passes the timer pointer as the legacy data argument. Once all setup_*timer() callers have been replaced with timer_setup(), the casts can be removed, and the data argument can be dropped with the timer expiration code changed to just pass the timer to the callback directly. Since the regular pattern of using container_of() during local variable declaration repeats the need for the variable type declaration to be included, this adds a helper modeled after other from_*() helpers that wrap container_of(), named from_timer(). This helper uses typeof(*variable), removing the type redundancy and minimizing the need for line wraps in forthcoming conversions from "unsigned data long" to "struct timer_list *" in the timer callbacks: -void callback(unsigned long data) +void callback(struct timer_list *t) { - struct some_data_structure *local = (struct some_data_structure *)data; + struct some_data_structure *local = from_timer(local, t, timer); Finally, in order to support the handful of timer users that perform open-coded assignments of the .function (and .data) fields, provide cast macros (TIMER_FUNC_TYPE and TIMER_DATA_TYPE) that can be used temporarily. Once conversion has been completed, these can be globally trivially removed. Signed-off-by: Kees Cook Signed-off-by: Thomas Gleixner Link: https://lkml.kernel.org/r/20170928133817.GA113410@beast --- include/linux/timer.h | 14 ++++++++++++++ 1 file changed, 14 insertions(+) (limited to 'include/linux') diff --git a/include/linux/timer.h b/include/linux/timer.h index e6789b8757d5..6383c528b148 100644 --- a/include/linux/timer.h +++ b/include/linux/timer.h @@ -168,6 +168,20 @@ static inline void init_timer_on_stack_key(struct timer_list *timer, #define setup_pinned_deferrable_timer_on_stack(timer, fn, data) \ __setup_timer_on_stack((timer), (fn), (data), TIMER_DEFERRABLE | TIMER_PINNED) +#define TIMER_DATA_TYPE unsigned long +#define TIMER_FUNC_TYPE void (*)(TIMER_DATA_TYPE) + +static inline void timer_setup(struct timer_list *timer, + void (*callback)(struct timer_list *), + unsigned int flags) +{ + __setup_timer(timer, (TIMER_FUNC_TYPE)callback, + (TIMER_DATA_TYPE)timer, flags); +} + +#define from_timer(var, callback_timer, timer_fieldname) \ + container_of(callback_timer, typeof(*var), timer_fieldname) + /** * timer_pending - is a timer pending? * @timer: the timer in question -- cgit v1.2.3-71-gd317 From 1593baab910da72480d651ea7bf2ce6e3a25a484 Mon Sep 17 00:00:00 2001 From: Peter Zijlstra Date: Fri, 22 Sep 2017 18:09:26 +0200 Subject: sched/debug: Implement consistent task-state printing Currently get_task_state() and task_state_to_char() report different states, create a number of common helpers and unify the reported state space. Signed-off-by: Peter Zijlstra (Intel) Cc: Linus Torvalds Cc: Peter Zijlstra Cc: Thomas Gleixner Cc: linux-kernel@vger.kernel.org Signed-off-by: Ingo Molnar --- fs/proc/array.c | 15 ++------------- include/linux/sched.h | 26 +++++++++++++++++++------- 2 files changed, 21 insertions(+), 20 deletions(-) (limited to 'include/linux') diff --git a/fs/proc/array.c b/fs/proc/array.c index 525157ca25cb..01196d3ad452 100644 --- a/fs/proc/array.c +++ b/fs/proc/array.c @@ -130,19 +130,8 @@ static const char * const task_state_array[] = { static inline const char *get_task_state(struct task_struct *tsk) { - unsigned int state = (tsk->state | tsk->exit_state) & TASK_REPORT; - - /* - * Parked tasks do not run; they sit in __kthread_parkme(). - * Without this check, we would report them as running, which is - * clearly wrong, so we report them as sleeping instead. - */ - if (tsk->state == TASK_PARKED) - state = TASK_INTERRUPTIBLE; - - BUILD_BUG_ON(1 + ilog2(TASK_REPORT) != ARRAY_SIZE(task_state_array)-1); - - return task_state_array[fls(state)]; + BUILD_BUG_ON(1 + ilog2(TASK_REPORT) != ARRAY_SIZE(task_state_array) - 1); + return task_state_array[__get_task_state(tsk)]; } static inline int get_task_umask(struct task_struct *tsk) diff --git a/include/linux/sched.h b/include/linux/sched.h index 92fb8dd5a9e4..163a0b738908 100644 --- a/include/linux/sched.h +++ b/include/linux/sched.h @@ -1243,17 +1243,29 @@ static inline pid_t task_pgrp_nr(struct task_struct *tsk) return task_pgrp_nr_ns(tsk, &init_pid_ns); } -static inline char task_state_to_char(struct task_struct *task) +static inline unsigned int __get_task_state(struct task_struct *tsk) { - const char stat_nam[] = TASK_STATE_TO_CHAR_STR; - unsigned long state = task->state; + unsigned int tsk_state = READ_ONCE(tsk->state); + unsigned int state = (tsk_state | tsk->exit_state) & TASK_REPORT; - state = state ? __ffs(state) + 1 : 0; + if (tsk_state == TASK_PARKED) + state = TASK_INTERRUPTIBLE; - /* Make sure the string lines up properly with the number of task states: */ - BUILD_BUG_ON(sizeof(TASK_STATE_TO_CHAR_STR)-1 != ilog2(TASK_STATE_MAX)+1); + return fls(state); +} + +static inline char __task_state_to_char(unsigned int state) +{ + static const char state_char[] = "RSDTtXZ"; + + BUILD_BUG_ON(1 + ilog2(TASK_REPORT) != sizeof(state_char) - 2); - return state < sizeof(stat_nam) - 1 ? stat_nam[state] : '?'; + return state_char[state]; +} + +static inline char task_state_to_char(struct task_struct *tsk) +{ + return __task_state_to_char(__get_task_state(tsk)); } /** -- cgit v1.2.3-71-gd317 From 92c4bc9f9cd92a8581e36bc5105f03b569f37e36 Mon Sep 17 00:00:00 2001 From: Peter Zijlstra Date: Fri, 22 Sep 2017 18:13:36 +0200 Subject: sched/debug: Convert TASK_state to hex Bit patterns are easier in hex. Signed-off-by: Peter Zijlstra (Intel) Cc: Linus Torvalds Cc: Peter Zijlstra Cc: Thomas Gleixner Cc: linux-kernel@vger.kernel.org Signed-off-by: Ingo Molnar --- include/linux/sched.h | 28 ++++++++++++++-------------- 1 file changed, 14 insertions(+), 14 deletions(-) (limited to 'include/linux') diff --git a/include/linux/sched.h b/include/linux/sched.h index 163a0b738908..69bed5339ffa 100644 --- a/include/linux/sched.h +++ b/include/linux/sched.h @@ -65,23 +65,23 @@ struct task_group; */ /* Used in tsk->state: */ -#define TASK_RUNNING 0 -#define TASK_INTERRUPTIBLE 1 -#define TASK_UNINTERRUPTIBLE 2 -#define __TASK_STOPPED 4 -#define __TASK_TRACED 8 +#define TASK_RUNNING 0x0000 +#define TASK_INTERRUPTIBLE 0x0001 +#define TASK_UNINTERRUPTIBLE 0x0002 +#define __TASK_STOPPED 0x0004 +#define __TASK_TRACED 0x0008 /* Used in tsk->exit_state: */ -#define EXIT_DEAD 16 -#define EXIT_ZOMBIE 32 +#define EXIT_DEAD 0x0010 +#define EXIT_ZOMBIE 0x0020 #define EXIT_TRACE (EXIT_ZOMBIE | EXIT_DEAD) /* Used in tsk->state again: */ -#define TASK_DEAD 64 -#define TASK_WAKEKILL 128 -#define TASK_WAKING 256 -#define TASK_PARKED 512 -#define TASK_NOLOAD 1024 -#define TASK_NEW 2048 -#define TASK_STATE_MAX 4096 +#define TASK_DEAD 0x0040 +#define TASK_WAKEKILL 0x0080 +#define TASK_WAKING 0x0100 +#define TASK_PARKED 0x0200 +#define TASK_NOLOAD 0x0400 +#define TASK_NEW 0x0800 +#define TASK_STATE_MAX 0x1000 #define TASK_STATE_TO_CHAR_STR "RSDTtXZxKWPNn" -- cgit v1.2.3-71-gd317 From efb40f588b4370ffaeffafbd50f6ff213d954254 Mon Sep 17 00:00:00 2001 From: Peter Zijlstra Date: Fri, 22 Sep 2017 18:19:53 +0200 Subject: sched/tracing: Fix trace_sched_switch task-state printing Convert trace_sched_switch to use the common task-state helpers and fix the "X" and "Z" order, possibly they ended up in the wrong order because TASK_REPORT has them in the wrong order too. Signed-off-by: Peter Zijlstra (Intel) Cc: Linus Torvalds Cc: Peter Zijlstra Cc: Thomas Gleixner Cc: linux-kernel@vger.kernel.org Signed-off-by: Ingo Molnar --- include/linux/sched.h | 2 +- include/trace/events/sched.h | 18 +++++++++++------- 2 files changed, 12 insertions(+), 8 deletions(-) (limited to 'include/linux') diff --git a/include/linux/sched.h b/include/linux/sched.h index 69bed5339ffa..a2fe636b6825 100644 --- a/include/linux/sched.h +++ b/include/linux/sched.h @@ -99,7 +99,7 @@ struct task_group; /* get_task_state(): */ #define TASK_REPORT (TASK_RUNNING | TASK_INTERRUPTIBLE | \ TASK_UNINTERRUPTIBLE | __TASK_STOPPED | \ - __TASK_TRACED | EXIT_ZOMBIE | EXIT_DEAD) + __TASK_TRACED | EXIT_DEAD | EXIT_ZOMBIE) #define task_is_traced(task) ((task->state & __TASK_TRACED) != 0) diff --git a/include/trace/events/sched.h b/include/trace/events/sched.h index ae1409ffe99a..c63e20c9ef24 100644 --- a/include/trace/events/sched.h +++ b/include/trace/events/sched.h @@ -114,7 +114,10 @@ static inline long __trace_sched_switch_state(bool preempt, struct task_struct * * Preemption ignores task state, therefore preempted tasks are always * RUNNING (we will not have dequeued if state != RUNNING). */ - return preempt ? TASK_RUNNING | TASK_STATE_MAX : p->state; + if (preempt) + return TASK_STATE_MAX; + + return __get_task_state(p); } #endif /* CREATE_TRACE_POINTS */ @@ -152,12 +155,13 @@ TRACE_EVENT(sched_switch, TP_printk("prev_comm=%s prev_pid=%d prev_prio=%d prev_state=%s%s ==> next_comm=%s next_pid=%d next_prio=%d", __entry->prev_comm, __entry->prev_pid, __entry->prev_prio, - __entry->prev_state & (TASK_STATE_MAX-1) ? - __print_flags(__entry->prev_state & (TASK_STATE_MAX-1), "|", - { 1, "S"} , { 2, "D" }, { 4, "T" }, { 8, "t" }, - { 16, "Z" }, { 32, "X" }, { 64, "x" }, - { 128, "K" }, { 256, "W" }, { 512, "P" }, - { 1024, "N" }) : "R", + + (__entry->prev_state & TASK_REPORT) ? + __print_flags(__entry->prev_state & TASK_REPORT, "|", + { 0x01, "S" }, { 0x02, "D" }, { 0x04, "T" }, + { 0x08, "t" }, { 0x10, "X" }, { 0x20, "Z" }) : + "R", + __entry->prev_state & TASK_STATE_MAX ? "+" : "", __entry->next_comm, __entry->next_pid, __entry->next_prio) ); -- cgit v1.2.3-71-gd317 From 5f6ad26ea353fdf3dad2328052cbee49e0b9c5b4 Mon Sep 17 00:00:00 2001 From: Peter Zijlstra Date: Fri, 22 Sep 2017 18:23:31 +0200 Subject: sched/tracing: Use common task-state helpers Remove yet another task-state char instance. Signed-off-by: Peter Zijlstra (Intel) Cc: Linus Torvalds Cc: Peter Zijlstra Cc: Thomas Gleixner Cc: linux-kernel@vger.kernel.org Signed-off-by: Ingo Molnar --- include/linux/sched.h | 2 -- kernel/trace/trace_output.c | 21 ++++++--------------- kernel/trace/trace_sched_wakeup.c | 8 ++++---- 3 files changed, 10 insertions(+), 21 deletions(-) (limited to 'include/linux') diff --git a/include/linux/sched.h b/include/linux/sched.h index a2fe636b6825..bc7807933415 100644 --- a/include/linux/sched.h +++ b/include/linux/sched.h @@ -83,8 +83,6 @@ struct task_group; #define TASK_NEW 0x0800 #define TASK_STATE_MAX 0x1000 -#define TASK_STATE_TO_CHAR_STR "RSDTtXZxKWPNn" - /* Convenience macros for the sake of set_current_state: */ #define TASK_KILLABLE (TASK_WAKEKILL | TASK_UNINTERRUPTIBLE) #define TASK_STOPPED (TASK_WAKEKILL | __TASK_STOPPED) diff --git a/kernel/trace/trace_output.c b/kernel/trace/trace_output.c index bac629af2285..c738e764e2a5 100644 --- a/kernel/trace/trace_output.c +++ b/kernel/trace/trace_output.c @@ -656,15 +656,6 @@ int trace_print_lat_context(struct trace_iterator *iter) return !trace_seq_has_overflowed(s); } -static const char state_to_char[] = TASK_STATE_TO_CHAR_STR; - -static int task_state_char(unsigned long state) -{ - int bit = state ? __ffs(state) + 1 : 0; - - return bit < sizeof(state_to_char) - 1 ? state_to_char[bit] : '?'; -} - /** * ftrace_find_event - find a registered event * @type: the type of event to look for @@ -930,8 +921,8 @@ static enum print_line_t trace_ctxwake_print(struct trace_iterator *iter, trace_assign_type(field, iter->ent); - T = task_state_char(field->next_state); - S = task_state_char(field->prev_state); + T = __task_state_to_char(field->next_state); + S = __task_state_to_char(field->prev_state); trace_find_cmdline(field->next_pid, comm); trace_seq_printf(&iter->seq, " %5d:%3d:%c %s [%03d] %5d:%3d:%c %s\n", @@ -966,8 +957,8 @@ static int trace_ctxwake_raw(struct trace_iterator *iter, char S) trace_assign_type(field, iter->ent); if (!S) - S = task_state_char(field->prev_state); - T = task_state_char(field->next_state); + S = __task_state_to_char(field->prev_state); + T = __task_state_to_char(field->next_state); trace_seq_printf(&iter->seq, "%d %d %c %d %d %d %c\n", field->prev_pid, field->prev_prio, @@ -1002,8 +993,8 @@ static int trace_ctxwake_hex(struct trace_iterator *iter, char S) trace_assign_type(field, iter->ent); if (!S) - S = task_state_char(field->prev_state); - T = task_state_char(field->next_state); + S = __task_state_to_char(field->prev_state); + T = __task_state_to_char(field->next_state); SEQ_PUT_HEX_FIELD(s, field->prev_pid); SEQ_PUT_HEX_FIELD(s, field->prev_prio); diff --git a/kernel/trace/trace_sched_wakeup.c b/kernel/trace/trace_sched_wakeup.c index ddec53b67646..0c331978b1a6 100644 --- a/kernel/trace/trace_sched_wakeup.c +++ b/kernel/trace/trace_sched_wakeup.c @@ -397,10 +397,10 @@ tracing_sched_switch_trace(struct trace_array *tr, entry = ring_buffer_event_data(event); entry->prev_pid = prev->pid; entry->prev_prio = prev->prio; - entry->prev_state = prev->state; + entry->prev_state = __get_task_state(prev); entry->next_pid = next->pid; entry->next_prio = next->prio; - entry->next_state = next->state; + entry->next_state = __get_task_state(next); entry->next_cpu = task_cpu(next); if (!call_filter_check_discard(call, entry, buffer, event)) @@ -425,10 +425,10 @@ tracing_sched_wakeup_trace(struct trace_array *tr, entry = ring_buffer_event_data(event); entry->prev_pid = curr->pid; entry->prev_prio = curr->prio; - entry->prev_state = curr->state; + entry->prev_state = __get_task_state(curr); entry->next_pid = wakee->pid; entry->next_prio = wakee->prio; - entry->next_state = wakee->state; + entry->next_state = __get_task_state(wakee); entry->next_cpu = task_cpu(wakee); if (!call_filter_check_discard(call, entry, buffer, event)) -- cgit v1.2.3-71-gd317 From 06eb61844d841d0032a9950ce7f8e783ee49c0d0 Mon Sep 17 00:00:00 2001 From: Peter Zijlstra Date: Fri, 22 Sep 2017 18:30:40 +0200 Subject: sched/debug: Add explicit TASK_IDLE printing Markus reported that kthreads that idle using TASK_IDLE instead of TASK_INTERRUPTIBLE are reported in as TASK_UNINTERRUPTIBLE and things like htop mark those red. This is undesirable, so add an explicit state for TASK_IDLE. Reported-by: Markus Trippelsdorf Signed-off-by: Peter Zijlstra (Intel) Cc: Linus Torvalds Cc: Peter Zijlstra Cc: Thomas Gleixner Cc: linux-kernel@vger.kernel.org Signed-off-by: Ingo Molnar --- fs/proc/array.c | 21 +++++++++++++-------- include/linux/sched.h | 12 ++++++++++-- include/trace/events/sched.h | 7 ++++--- 3 files changed, 27 insertions(+), 13 deletions(-) (limited to 'include/linux') diff --git a/fs/proc/array.c b/fs/proc/array.c index 01196d3ad452..a120a4549d48 100644 --- a/fs/proc/array.c +++ b/fs/proc/array.c @@ -119,18 +119,23 @@ static inline void task_name(struct seq_file *m, struct task_struct *p) * simple bit tests. */ static const char * const task_state_array[] = { - "R (running)", /* 0 */ - "S (sleeping)", /* 1 */ - "D (disk sleep)", /* 2 */ - "T (stopped)", /* 4 */ - "t (tracing stop)", /* 8 */ - "X (dead)", /* 16 */ - "Z (zombie)", /* 32 */ + + /* states in TASK_REPORT: */ + "R (running)", /* 0x00 */ + "S (sleeping)", /* 0x01 */ + "D (disk sleep)", /* 0x02 */ + "T (stopped)", /* 0x04 */ + "t (tracing stop)", /* 0x08 */ + "X (dead)", /* 0x10 */ + "Z (zombie)", /* 0x20 */ + + /* states beyond TASK_REPORT: */ + "I (idle)", /* 0x40 */ }; static inline const char *get_task_state(struct task_struct *tsk) { - BUILD_BUG_ON(1 + ilog2(TASK_REPORT) != ARRAY_SIZE(task_state_array) - 1); + BUILD_BUG_ON(1 + ilog2(TASK_REPORT_MAX) != ARRAY_SIZE(task_state_array)); return task_state_array[__get_task_state(tsk)]; } diff --git a/include/linux/sched.h b/include/linux/sched.h index bc7807933415..286fc1117046 100644 --- a/include/linux/sched.h +++ b/include/linux/sched.h @@ -1241,22 +1241,30 @@ static inline pid_t task_pgrp_nr(struct task_struct *tsk) return task_pgrp_nr_ns(tsk, &init_pid_ns); } +#define TASK_REPORT_IDLE (TASK_REPORT + 1) +#define TASK_REPORT_MAX (TASK_REPORT_IDLE << 1) + static inline unsigned int __get_task_state(struct task_struct *tsk) { unsigned int tsk_state = READ_ONCE(tsk->state); unsigned int state = (tsk_state | tsk->exit_state) & TASK_REPORT; + BUILD_BUG_ON_NOT_POWER_OF_2(TASK_REPORT_MAX); + if (tsk_state == TASK_PARKED) state = TASK_INTERRUPTIBLE; + if (tsk_state == TASK_IDLE) + state = TASK_REPORT_IDLE; + return fls(state); } static inline char __task_state_to_char(unsigned int state) { - static const char state_char[] = "RSDTtXZ"; + static const char state_char[] = "RSDTtXZI"; - BUILD_BUG_ON(1 + ilog2(TASK_REPORT) != sizeof(state_char) - 2); + BUILD_BUG_ON(1 + ilog2(TASK_REPORT_MAX) != sizeof(state_char) - 1); return state_char[state]; } diff --git a/include/trace/events/sched.h b/include/trace/events/sched.h index c63e20c9ef24..b371ef8206e1 100644 --- a/include/trace/events/sched.h +++ b/include/trace/events/sched.h @@ -156,10 +156,11 @@ TRACE_EVENT(sched_switch, TP_printk("prev_comm=%s prev_pid=%d prev_prio=%d prev_state=%s%s ==> next_comm=%s next_pid=%d next_prio=%d", __entry->prev_comm, __entry->prev_pid, __entry->prev_prio, - (__entry->prev_state & TASK_REPORT) ? - __print_flags(__entry->prev_state & TASK_REPORT, "|", + (__entry->prev_state & (TASK_REPORT_MAX - 1)) ? + __print_flags(__entry->prev_state & (TASK_REPORT_MAX - 1), "|", { 0x01, "S" }, { 0x02, "D" }, { 0x04, "T" }, - { 0x08, "t" }, { 0x10, "X" }, { 0x20, "Z" }) : + { 0x08, "t" }, { 0x10, "X" }, { 0x20, "Z" }, + { 0x40, "I" }) : "R", __entry->prev_state & TASK_STATE_MAX ? "+" : "", -- cgit v1.2.3-71-gd317 From 8ef9925b02c23e3838d5e593c5cf37984141150f Mon Sep 17 00:00:00 2001 From: Peter Zijlstra Date: Fri, 22 Sep 2017 18:37:28 +0200 Subject: sched/debug: Add explicit TASK_PARKED printing Currently TASK_PARKED is masqueraded as TASK_INTERRUPTIBLE, give it its own print state because it will not in fact get woken by regular wakeups and is a long-term state. This requires moving TASK_PARKED into the TASK_REPORT mask, and since that latter needs to be a contiguous bitmask, we need to shuffle the bits around a bit. Signed-off-by: Peter Zijlstra (Intel) Cc: Linus Torvalds Cc: Peter Zijlstra Cc: Thomas Gleixner Cc: linux-kernel@vger.kernel.org Signed-off-by: Ingo Molnar --- fs/proc/array.c | 3 ++- include/linux/sched.h | 16 +++++++--------- include/trace/events/sched.h | 2 +- 3 files changed, 10 insertions(+), 11 deletions(-) (limited to 'include/linux') diff --git a/fs/proc/array.c b/fs/proc/array.c index a120a4549d48..77a8eacbe032 100644 --- a/fs/proc/array.c +++ b/fs/proc/array.c @@ -128,9 +128,10 @@ static const char * const task_state_array[] = { "t (tracing stop)", /* 0x08 */ "X (dead)", /* 0x10 */ "Z (zombie)", /* 0x20 */ + "P (parked)", /* 0x40 */ /* states beyond TASK_REPORT: */ - "I (idle)", /* 0x40 */ + "I (idle)", /* 0x80 */ }; static inline const char *get_task_state(struct task_struct *tsk) diff --git a/include/linux/sched.h b/include/linux/sched.h index 286fc1117046..26a7df4e558c 100644 --- a/include/linux/sched.h +++ b/include/linux/sched.h @@ -75,10 +75,10 @@ struct task_group; #define EXIT_ZOMBIE 0x0020 #define EXIT_TRACE (EXIT_ZOMBIE | EXIT_DEAD) /* Used in tsk->state again: */ -#define TASK_DEAD 0x0040 -#define TASK_WAKEKILL 0x0080 -#define TASK_WAKING 0x0100 -#define TASK_PARKED 0x0200 +#define TASK_PARKED 0x0040 +#define TASK_DEAD 0x0080 +#define TASK_WAKEKILL 0x0100 +#define TASK_WAKING 0x0200 #define TASK_NOLOAD 0x0400 #define TASK_NEW 0x0800 #define TASK_STATE_MAX 0x1000 @@ -97,7 +97,8 @@ struct task_group; /* get_task_state(): */ #define TASK_REPORT (TASK_RUNNING | TASK_INTERRUPTIBLE | \ TASK_UNINTERRUPTIBLE | __TASK_STOPPED | \ - __TASK_TRACED | EXIT_DEAD | EXIT_ZOMBIE) + __TASK_TRACED | EXIT_DEAD | EXIT_ZOMBIE | \ + TASK_PARKED) #define task_is_traced(task) ((task->state & __TASK_TRACED) != 0) @@ -1251,9 +1252,6 @@ static inline unsigned int __get_task_state(struct task_struct *tsk) BUILD_BUG_ON_NOT_POWER_OF_2(TASK_REPORT_MAX); - if (tsk_state == TASK_PARKED) - state = TASK_INTERRUPTIBLE; - if (tsk_state == TASK_IDLE) state = TASK_REPORT_IDLE; @@ -1262,7 +1260,7 @@ static inline unsigned int __get_task_state(struct task_struct *tsk) static inline char __task_state_to_char(unsigned int state) { - static const char state_char[] = "RSDTtXZI"; + static const char state_char[] = "RSDTtXZPI"; BUILD_BUG_ON(1 + ilog2(TASK_REPORT_MAX) != sizeof(state_char) - 1); diff --git a/include/trace/events/sched.h b/include/trace/events/sched.h index b371ef8206e1..3c8b7f625670 100644 --- a/include/trace/events/sched.h +++ b/include/trace/events/sched.h @@ -160,7 +160,7 @@ TRACE_EVENT(sched_switch, __print_flags(__entry->prev_state & (TASK_REPORT_MAX - 1), "|", { 0x01, "S" }, { 0x02, "D" }, { 0x04, "T" }, { 0x08, "t" }, { 0x10, "X" }, { 0x20, "Z" }, - { 0x40, "I" }) : + { 0x40, "P" }, { 0x80, "I" }) : "R", __entry->prev_state & TASK_STATE_MAX ? "+" : "", -- cgit v1.2.3-71-gd317