From 3859a271a003aba01e45b85c9d8b355eb7bf25f9 Mon Sep 17 00:00:00 2001 From: Kees Cook Date: Fri, 28 Oct 2016 01:22:25 -0700 Subject: randstruct: Mark various structs for randomization This marks many critical kernel structures for randomization. These are structures that have been targeted in the past in security exploits, or contain functions pointers, pointers to function pointer tables, lists, workqueues, ref-counters, credentials, permissions, or are otherwise sensitive. This initial list was extracted from Brad Spengler/PaX Team's code in the last public patch of grsecurity/PaX based on my understanding of the code. Changes or omissions from the original code are mine and don't reflect the original grsecurity/PaX code. Left out of this list is task_struct, which requires special handling and will be covered in a subsequent patch. Signed-off-by: Kees Cook --- include/linux/binfmts.h | 4 ++-- include/linux/cdev.h | 2 +- include/linux/cred.h | 4 ++-- include/linux/dcache.h | 2 +- include/linux/fs.h | 17 +++++++++-------- include/linux/fs_struct.h | 2 +- include/linux/ipc.h | 2 +- include/linux/ipc_namespace.h | 2 +- include/linux/key-type.h | 4 ++-- include/linux/kmod.h | 2 +- include/linux/kobject.h | 2 +- include/linux/lsm_hooks.h | 4 ++-- include/linux/mm_types.h | 4 ++-- include/linux/module.h | 4 ++-- include/linux/mount.h | 2 +- include/linux/msg.h | 2 +- include/linux/path.h | 2 +- include/linux/pid_namespace.h | 2 +- include/linux/proc_ns.h | 2 +- include/linux/sched.h | 2 +- include/linux/sched/signal.h | 2 +- include/linux/sem.h | 2 +- include/linux/shm.h | 2 +- include/linux/sysctl.h | 2 +- include/linux/tty.h | 2 +- include/linux/tty_driver.h | 4 ++-- include/linux/user_namespace.h | 2 +- include/linux/utsname.h | 2 +- include/net/af_unix.h | 2 +- include/net/neighbour.h | 2 +- include/net/net_namespace.h | 2 +- include/net/sock.h | 2 +- 32 files changed, 47 insertions(+), 46 deletions(-) (limited to 'include') diff --git a/include/linux/binfmts.h b/include/linux/binfmts.h index 05488da3aee9..3ae9013eeaaa 100644 --- a/include/linux/binfmts.h +++ b/include/linux/binfmts.h @@ -46,7 +46,7 @@ struct linux_binprm { unsigned interp_flags; unsigned interp_data; unsigned long loader, exec; -}; +} __randomize_layout; #define BINPRM_FLAGS_ENFORCE_NONDUMP_BIT 0 #define BINPRM_FLAGS_ENFORCE_NONDUMP (1 << BINPRM_FLAGS_ENFORCE_NONDUMP_BIT) @@ -81,7 +81,7 @@ struct linux_binfmt { int (*load_shlib)(struct file *); int (*core_dump)(struct coredump_params *cprm); unsigned long min_coredump; /* minimal dump size */ -}; +} __randomize_layout; extern void __register_binfmt(struct linux_binfmt *fmt, int insert); diff --git a/include/linux/cdev.h b/include/linux/cdev.h index 408bc09ce497..cb28eb21e3ca 100644 --- a/include/linux/cdev.h +++ b/include/linux/cdev.h @@ -17,7 +17,7 @@ struct cdev { struct list_head list; dev_t dev; unsigned int count; -}; +} __randomize_layout; void cdev_init(struct cdev *, const struct file_operations *); diff --git a/include/linux/cred.h b/include/linux/cred.h index b03e7d049a64..82c8a9e1aabb 100644 --- a/include/linux/cred.h +++ b/include/linux/cred.h @@ -31,7 +31,7 @@ struct group_info { atomic_t usage; int ngroups; kgid_t gid[0]; -}; +} __randomize_layout; /** * get_group_info - Get a reference to a group info structure @@ -145,7 +145,7 @@ struct cred { struct user_namespace *user_ns; /* user_ns the caps and keyrings are relative to. */ struct group_info *group_info; /* supplementary groups for euid/fsgid */ struct rcu_head rcu; /* RCU deletion hook */ -}; +} __randomize_layout; extern void __put_cred(struct cred *); extern void exit_creds(struct task_struct *); diff --git a/include/linux/dcache.h b/include/linux/dcache.h index d2e38dc6172c..7eb262e13d3c 100644 --- a/include/linux/dcache.h +++ b/include/linux/dcache.h @@ -113,7 +113,7 @@ struct dentry { struct hlist_bl_node d_in_lookup_hash; /* only for in-lookup ones */ struct rcu_head d_rcu; } d_u; -}; +} __randomize_layout; /* * dentry->d_lock spinlock nesting subclasses: diff --git a/include/linux/fs.h b/include/linux/fs.h index 803e5a9b2654..8f28143486c4 100644 --- a/include/linux/fs.h +++ b/include/linux/fs.h @@ -275,7 +275,7 @@ struct kiocb { void (*ki_complete)(struct kiocb *iocb, long ret, long ret2); void *private; int ki_flags; -}; +} __randomize_layout; static inline bool is_sync_kiocb(struct kiocb *kiocb) { @@ -392,7 +392,7 @@ struct address_space { gfp_t gfp_mask; /* implicit gfp mask for allocations */ struct list_head private_list; /* ditto */ void *private_data; /* ditto */ -} __attribute__((aligned(sizeof(long)))); +} __attribute__((aligned(sizeof(long)))) __randomize_layout; /* * On most architectures that alignment is already the case; but * must be enforced here for CRIS, to let the least significant bit @@ -435,7 +435,7 @@ struct block_device { int bd_fsfreeze_count; /* Mutex for freeze */ struct mutex bd_fsfreeze_mutex; -}; +} __randomize_layout; /* * Radix-tree tags, for tagging dirty and writeback pages within the pagecache @@ -653,7 +653,7 @@ struct inode { #endif void *i_private; /* fs or device private pointer */ -}; +} __randomize_layout; static inline unsigned int i_blocksize(const struct inode *node) { @@ -868,7 +868,8 @@ struct file { struct list_head f_tfile_llink; #endif /* #ifdef CONFIG_EPOLL */ struct address_space *f_mapping; -} __attribute__((aligned(4))); /* lest something weird decides that 2 is OK */ +} __randomize_layout + __attribute__((aligned(4))); /* lest something weird decides that 2 is OK */ struct file_handle { __u32 handle_bytes; @@ -1005,7 +1006,7 @@ struct file_lock { int state; /* state of grant or error if -ve */ } afs; } fl_u; -}; +} __randomize_layout; struct file_lock_context { spinlock_t flc_lock; @@ -1404,7 +1405,7 @@ struct super_block { spinlock_t s_inode_wblist_lock; struct list_head s_inodes_wb; /* writeback inodes */ -}; +} __randomize_layout; /* Helper functions so that in most cases filesystems will * not need to deal directly with kuid_t and kgid_t and can @@ -1690,7 +1691,7 @@ struct file_operations { u64); ssize_t (*dedupe_file_range)(struct file *, u64, u64, struct file *, u64); -}; +} __randomize_layout; struct inode_operations { struct dentry * (*lookup) (struct inode *,struct dentry *, unsigned int); diff --git a/include/linux/fs_struct.h b/include/linux/fs_struct.h index 0efc3e62843a..7a026240cbb1 100644 --- a/include/linux/fs_struct.h +++ b/include/linux/fs_struct.h @@ -12,7 +12,7 @@ struct fs_struct { int umask; int in_exec; struct path root, pwd; -}; +} __randomize_layout; extern struct kmem_cache *fs_cachep; diff --git a/include/linux/ipc.h b/include/linux/ipc.h index 71fd92d81b26..ea0eb0b5f98c 100644 --- a/include/linux/ipc.h +++ b/include/linux/ipc.h @@ -20,6 +20,6 @@ struct kern_ipc_perm { umode_t mode; unsigned long seq; void *security; -} ____cacheline_aligned_in_smp; +} ____cacheline_aligned_in_smp __randomize_layout; #endif /* _LINUX_IPC_H */ diff --git a/include/linux/ipc_namespace.h b/include/linux/ipc_namespace.h index 848e5796400e..65327ee0936b 100644 --- a/include/linux/ipc_namespace.h +++ b/include/linux/ipc_namespace.h @@ -61,7 +61,7 @@ struct ipc_namespace { struct ucounts *ucounts; struct ns_common ns; -}; +} __randomize_layout; extern struct ipc_namespace init_ipc_ns; extern spinlock_t mq_lock; diff --git a/include/linux/key-type.h b/include/linux/key-type.h index 8496cf64575c..9520fc3c3b9a 100644 --- a/include/linux/key-type.h +++ b/include/linux/key-type.h @@ -45,7 +45,7 @@ struct key_preparsed_payload { size_t datalen; /* Raw datalen */ size_t quotalen; /* Quota length for proposed payload */ time_t expiry; /* Expiry time of key */ -}; +} __randomize_layout; typedef int (*request_key_actor_t)(struct key_construction *key, const char *op, void *aux); @@ -158,7 +158,7 @@ struct key_type { /* internal fields */ struct list_head link; /* link in types list */ struct lock_class_key lock_class; /* key->sem lock class */ -}; +} __randomize_layout; extern struct key_type key_type_keyring; diff --git a/include/linux/kmod.h b/include/linux/kmod.h index c4e441e00db5..655082c88fd9 100644 --- a/include/linux/kmod.h +++ b/include/linux/kmod.h @@ -64,7 +64,7 @@ struct subprocess_info { int (*init)(struct subprocess_info *info, struct cred *new); void (*cleanup)(struct subprocess_info *info); void *data; -}; +} __randomize_layout; extern int call_usermodehelper(const char *path, char **argv, char **envp, int wait); diff --git a/include/linux/kobject.h b/include/linux/kobject.h index ca85cb80e99a..084513350317 100644 --- a/include/linux/kobject.h +++ b/include/linux/kobject.h @@ -172,7 +172,7 @@ struct kset { spinlock_t list_lock; struct kobject kobj; const struct kset_uevent_ops *uevent_ops; -}; +} __randomize_layout; extern void kset_init(struct kset *kset); extern int __must_check kset_register(struct kset *kset); diff --git a/include/linux/lsm_hooks.h b/include/linux/lsm_hooks.h index 080f34e66017..565163fc9ad4 100644 --- a/include/linux/lsm_hooks.h +++ b/include/linux/lsm_hooks.h @@ -1876,7 +1876,7 @@ struct security_hook_heads { struct list_head audit_rule_match; struct list_head audit_rule_free; #endif /* CONFIG_AUDIT */ -}; +} __randomize_layout; /* * Security module hook list structure. @@ -1887,7 +1887,7 @@ struct security_hook_list { struct list_head *head; union security_list_options hook; char *lsm; -}; +} __randomize_layout; /* * Initializing a security_hook_list structure takes diff --git a/include/linux/mm_types.h b/include/linux/mm_types.h index 45cdb27791a3..ff151814a02d 100644 --- a/include/linux/mm_types.h +++ b/include/linux/mm_types.h @@ -342,7 +342,7 @@ struct vm_area_struct { struct mempolicy *vm_policy; /* NUMA policy for the VMA */ #endif struct vm_userfaultfd_ctx vm_userfaultfd_ctx; -}; +} __randomize_layout; struct core_thread { struct task_struct *task; @@ -500,7 +500,7 @@ struct mm_struct { atomic_long_t hugetlb_usage; #endif struct work_struct async_put_work; -}; +} __randomize_layout; extern struct mm_struct init_mm; diff --git a/include/linux/module.h b/include/linux/module.h index 21f56393602f..d93111d7def6 100644 --- a/include/linux/module.h +++ b/include/linux/module.h @@ -45,7 +45,7 @@ struct module_kobject { struct kobject *drivers_dir; struct module_param_attrs *mp; struct completion *kobj_completion; -}; +} __randomize_layout; struct module_attribute { struct attribute attr; @@ -475,7 +475,7 @@ struct module { ctor_fn_t *ctors; unsigned int num_ctors; #endif -} ____cacheline_aligned; +} ____cacheline_aligned __randomize_layout; #ifndef MODULE_ARCH_INIT #define MODULE_ARCH_INIT {} #endif diff --git a/include/linux/mount.h b/include/linux/mount.h index 8e0352af06b7..1ce85e6fd95f 100644 --- a/include/linux/mount.h +++ b/include/linux/mount.h @@ -67,7 +67,7 @@ struct vfsmount { struct dentry *mnt_root; /* root of the mounted tree */ struct super_block *mnt_sb; /* pointer to superblock */ int mnt_flags; -}; +} __randomize_layout; struct file; /* forward dec */ struct path; diff --git a/include/linux/msg.h b/include/linux/msg.h index f3f302f9c197..a001305f5a79 100644 --- a/include/linux/msg.h +++ b/include/linux/msg.h @@ -29,7 +29,7 @@ struct msg_queue { struct list_head q_messages; struct list_head q_receivers; struct list_head q_senders; -}; +} __randomize_layout; /* Helper routines for sys_msgsnd and sys_msgrcv */ extern long do_msgsnd(int msqid, long mtype, void __user *mtext, diff --git a/include/linux/path.h b/include/linux/path.h index d1372186f431..cde895cc4af4 100644 --- a/include/linux/path.h +++ b/include/linux/path.h @@ -7,7 +7,7 @@ struct vfsmount; struct path { struct vfsmount *mnt; struct dentry *dentry; -}; +} __randomize_layout; extern void path_get(const struct path *); extern void path_put(const struct path *); diff --git a/include/linux/pid_namespace.h b/include/linux/pid_namespace.h index c2a989dee876..b09136f88cf4 100644 --- a/include/linux/pid_namespace.h +++ b/include/linux/pid_namespace.h @@ -52,7 +52,7 @@ struct pid_namespace { int hide_pid; int reboot; /* group exit code if this pidns was rebooted */ struct ns_common ns; -}; +} __randomize_layout; extern struct pid_namespace init_pid_ns; diff --git a/include/linux/proc_ns.h b/include/linux/proc_ns.h index 58ab28d81fc2..06844b54dfc1 100644 --- a/include/linux/proc_ns.h +++ b/include/linux/proc_ns.h @@ -21,7 +21,7 @@ struct proc_ns_operations { int (*install)(struct nsproxy *nsproxy, struct ns_common *ns); struct user_namespace *(*owner)(struct ns_common *ns); struct ns_common *(*get_parent)(struct ns_common *ns); -}; +} __randomize_layout; extern const struct proc_ns_operations netns_operations; extern const struct proc_ns_operations utsns_operations; diff --git a/include/linux/sched.h b/include/linux/sched.h index 2b69fc650201..f833254fce00 100644 --- a/include/linux/sched.h +++ b/include/linux/sched.h @@ -408,7 +408,7 @@ struct sched_rt_entity { /* rq "owned" by this entity/group: */ struct rt_rq *my_q; #endif -}; +} __randomize_layout; struct sched_dl_entity { struct rb_node rb_node; diff --git a/include/linux/sched/signal.h b/include/linux/sched/signal.h index c06d63b3a583..2a0dd40b15db 100644 --- a/include/linux/sched/signal.h +++ b/include/linux/sched/signal.h @@ -222,7 +222,7 @@ struct signal_struct { struct mutex cred_guard_mutex; /* guard against foreign influences on * credential calculations * (notably. ptrace) */ -}; +} __randomize_layout; /* * Bits in flags field of signal_struct. diff --git a/include/linux/sem.h b/include/linux/sem.h index 9edec926e9d9..23bcbdfad4a6 100644 --- a/include/linux/sem.h +++ b/include/linux/sem.h @@ -21,7 +21,7 @@ struct sem_array { int sem_nsems; /* no. of semaphores in array */ int complex_count; /* pending complex operations */ unsigned int use_global_lock;/* >0: global lock required */ -}; +} __randomize_layout; #ifdef CONFIG_SYSVIPC diff --git a/include/linux/shm.h b/include/linux/shm.h index 04e881829625..0fb7061ec54c 100644 --- a/include/linux/shm.h +++ b/include/linux/shm.h @@ -22,7 +22,7 @@ struct shmid_kernel /* private to the kernel */ /* The task created the shm object. NULL if the task is dead. */ struct task_struct *shm_creator; struct list_head shm_clist; /* list by creator */ -}; +} __randomize_layout; /* shm_mode upper byte flags */ #define SHM_DEST 01000 /* segment will be destroyed on last detach */ diff --git a/include/linux/sysctl.h b/include/linux/sysctl.h index 80d07816def0..9ddeef2c03e2 100644 --- a/include/linux/sysctl.h +++ b/include/linux/sysctl.h @@ -117,7 +117,7 @@ struct ctl_table struct ctl_table_poll *poll; void *extra1; void *extra2; -}; +} __randomize_layout; struct ctl_node { struct rb_node node; diff --git a/include/linux/tty.h b/include/linux/tty.h index d07cd2105a6c..73f8d0977bb0 100644 --- a/include/linux/tty.h +++ b/include/linux/tty.h @@ -333,7 +333,7 @@ struct tty_struct { /* If the tty has a pending do_SAK, queue it here - akpm */ struct work_struct SAK_work; struct tty_port *port; -}; +} __randomize_layout; /* Each of a tty's open files has private_data pointing to tty_file_private */ struct tty_file_private { diff --git a/include/linux/tty_driver.h b/include/linux/tty_driver.h index b742b5e47cc2..00b2213f6a35 100644 --- a/include/linux/tty_driver.h +++ b/include/linux/tty_driver.h @@ -291,7 +291,7 @@ struct tty_operations { void (*poll_put_char)(struct tty_driver *driver, int line, char ch); #endif const struct file_operations *proc_fops; -}; +} __randomize_layout; struct tty_driver { int magic; /* magic number for this structure */ @@ -325,7 +325,7 @@ struct tty_driver { const struct tty_operations *ops; struct list_head tty_drivers; -}; +} __randomize_layout; extern struct list_head tty_drivers; diff --git a/include/linux/user_namespace.h b/include/linux/user_namespace.h index 32354b4b4b2b..b3575ce29148 100644 --- a/include/linux/user_namespace.h +++ b/include/linux/user_namespace.h @@ -66,7 +66,7 @@ struct user_namespace { #endif struct ucounts *ucounts; int ucount_max[UCOUNT_COUNTS]; -}; +} __randomize_layout; struct ucounts { struct hlist_node node; diff --git a/include/linux/utsname.h b/include/linux/utsname.h index 60f0bb83b313..da826ed059cf 100644 --- a/include/linux/utsname.h +++ b/include/linux/utsname.h @@ -26,7 +26,7 @@ struct uts_namespace { struct user_namespace *user_ns; struct ucounts *ucounts; struct ns_common ns; -}; +} __randomize_layout; extern struct uts_namespace init_uts_ns; #ifdef CONFIG_UTS_NS diff --git a/include/net/af_unix.h b/include/net/af_unix.h index fd60eccb59a6..64e2a1e24a2c 100644 --- a/include/net/af_unix.h +++ b/include/net/af_unix.h @@ -36,7 +36,7 @@ struct unix_skb_parms { u32 secid; /* Security ID */ #endif u32 consumed; -}; +} __randomize_layout; #define UNIXCB(skb) (*(struct unix_skb_parms *)&((skb)->cb)) diff --git a/include/net/neighbour.h b/include/net/neighbour.h index e4dd3a214034..a62959d2b3f7 100644 --- a/include/net/neighbour.h +++ b/include/net/neighbour.h @@ -155,7 +155,7 @@ struct neighbour { struct rcu_head rcu; struct net_device *dev; u8 primary_key[0]; -}; +} __randomize_layout; struct neigh_ops { int family; diff --git a/include/net/net_namespace.h b/include/net/net_namespace.h index fe80bb48ab1f..a224196d16ac 100644 --- a/include/net/net_namespace.h +++ b/include/net/net_namespace.h @@ -147,7 +147,7 @@ struct net { #endif struct sock *diag_nlsk; atomic_t fnhe_genid; -}; +} __randomize_layout; #include diff --git a/include/net/sock.h b/include/net/sock.h index f33e3d134e0b..d349297db9e9 100644 --- a/include/net/sock.h +++ b/include/net/sock.h @@ -1113,7 +1113,7 @@ struct proto { atomic_t socks; #endif int (*diag_destroy)(struct sock *sk, int err); -}; +} __randomize_layout; int proto_register(struct proto *prot, int alloc_slab); void proto_unregister(struct proto *prot); -- cgit v1.2.3-71-gd317 From 29e48ce87f1eaaa4b1fe3d9af90c586ac2d1fb74 Mon Sep 17 00:00:00 2001 From: Kees Cook Date: Wed, 5 Apr 2017 22:43:33 -0700 Subject: task_struct: Allow randomized layout This marks most of the layout of task_struct as randomizable, but leaves thread_info and scheduler state untouched at the start, and thread_struct untouched at the end. Other parts of the kernel use unnamed structures, but the 0-day builder using gcc-4.4 blows up on static initializers. Officially, it's documented as only working on gcc 4.6 and later, which further confuses me: https://gcc.gnu.org/wiki/C11Status The structure layout randomization already requires gcc 4.7, but instead of depending on the plugin being enabled, just check the gcc versions for wider build testing. At Linus's suggestion, the marking is hidden in a macro to reduce how ugly it looks. Additionally, indenting is left unchanged since it would make things harder to read. Randomization of task_struct is modified from Brad Spengler/PaX Team's code in the last public patch of grsecurity/PaX based on my understanding of the code. Changes or omissions from the original code are mine and don't reflect the original grsecurity/PaX code. Cc: Linus Torvalds Signed-off-by: Kees Cook --- include/linux/compiler-gcc.h | 13 ++++++++++++- include/linux/compiler.h | 5 +++++ include/linux/sched.h | 14 ++++++++++++++ 3 files changed, 31 insertions(+), 1 deletion(-) (limited to 'include') diff --git a/include/linux/compiler-gcc.h b/include/linux/compiler-gcc.h index 7deaae3dc87d..c4a66c036692 100644 --- a/include/linux/compiler-gcc.h +++ b/include/linux/compiler-gcc.h @@ -231,6 +231,7 @@ #endif /* GCC_VERSION >= 40500 */ #if GCC_VERSION >= 40600 + /* * When used with Link Time Optimization, gcc can optimize away C functions or * variables which are referenced only from assembly code. __visible tells the @@ -238,7 +239,17 @@ * this. */ #define __visible __attribute__((externally_visible)) -#endif + +/* + * RANDSTRUCT_PLUGIN wants to use an anonymous struct, but it is only + * possible since GCC 4.6. To provide as much build testing coverage + * as possible, this is used for all GCC 4.6+ builds, and not just on + * RANDSTRUCT_PLUGIN builds. + */ +#define randomized_struct_fields_start struct { +#define randomized_struct_fields_end } __randomize_layout; + +#endif /* GCC_VERSION >= 40600 */ #if GCC_VERSION >= 40900 && !defined(__CHECKER__) diff --git a/include/linux/compiler.h b/include/linux/compiler.h index 55ee9ee814f8..0b4ac3e8c63e 100644 --- a/include/linux/compiler.h +++ b/include/linux/compiler.h @@ -456,6 +456,11 @@ static __always_inline void __write_once_size(volatile void *p, void *res, int s # define __no_randomize_layout #endif +#ifndef randomized_struct_fields_start +# define randomized_struct_fields_start +# define randomized_struct_fields_end +#endif + /* * Tell gcc if a function is cold. The compiler will assume any path * directly leading to the call is unlikely. diff --git a/include/linux/sched.h b/include/linux/sched.h index f833254fce00..e2ad3531e7fe 100644 --- a/include/linux/sched.h +++ b/include/linux/sched.h @@ -490,6 +490,13 @@ struct task_struct { #endif /* -1 unrunnable, 0 runnable, >0 stopped: */ volatile long state; + + /* + * This begins the randomizable portion of task_struct. Only + * scheduling-critical items should be added above here. + */ + randomized_struct_fields_start + void *stack; atomic_t usage; /* Per task flags (PF_*), defined further below: */ @@ -1051,6 +1058,13 @@ struct task_struct { /* Used by LSM modules for access restriction: */ void *security; #endif + + /* + * New fields for task_struct should be added above here, so that + * they are included in the randomized portion of task_struct. + */ + randomized_struct_fields_end + /* CPU-specific state of this task: */ struct thread_struct thread; -- cgit v1.2.3-71-gd317 From a7b8829d242b1a58107e9c02b09e93aec446d55c Mon Sep 17 00:00:00 2001 From: Lorenzo Bianconi Date: Wed, 5 Jul 2017 20:30:01 +0200 Subject: iio: accel: st_accel: add SPI-3wire support Add SPI Serial Interface Mode (SIM) register information in st_sensor_settings look up table to support devices (like LSM303AGR accel sensor) that allow just SPI-3wire communication mode. SIM mode has to be configured before any other operation since it is not enabled by default and the driver is not able to read without that configuration Whilst a fairly substantial patch, the actual logic is simple and it is better to have the generic fix than a band aid. Fixes: ddc05fa28606 (iio: st-accel: add support for lsm303agr accel) Signed-off-by: Lorenzo Bianconi Cc: Signed-off-by: Jonathan Cameron --- drivers/iio/accel/st_accel_core.c | 32 +++++++++++++++++++++++++ drivers/iio/common/st_sensors/st_sensors_core.c | 29 ++++++++++++++++++++++ include/linux/iio/common/st_sensors.h | 7 ++++++ include/linux/platform_data/st_sensors_pdata.h | 2 ++ 4 files changed, 70 insertions(+) (limited to 'include') diff --git a/drivers/iio/accel/st_accel_core.c b/drivers/iio/accel/st_accel_core.c index 784670e2736b..2ee3ae11eb2a 100644 --- a/drivers/iio/accel/st_accel_core.c +++ b/drivers/iio/accel/st_accel_core.c @@ -166,6 +166,10 @@ static const struct st_sensor_settings st_accel_sensors_settings[] = { .mask_ihl = 0x02, .addr_stat_drdy = ST_SENSORS_DEFAULT_STAT_ADDR, }, + .sim = { + .addr = 0x23, + .value = BIT(0), + }, .multi_read_bit = true, .bootime = 2, }, @@ -234,6 +238,10 @@ static const struct st_sensor_settings st_accel_sensors_settings[] = { .mask_od = 0x40, .addr_stat_drdy = ST_SENSORS_DEFAULT_STAT_ADDR, }, + .sim = { + .addr = 0x23, + .value = BIT(0), + }, .multi_read_bit = true, .bootime = 2, }, @@ -316,6 +324,10 @@ static const struct st_sensor_settings st_accel_sensors_settings[] = { .en_mask = 0x08, }, }, + .sim = { + .addr = 0x24, + .value = BIT(0), + }, .multi_read_bit = false, .bootime = 2, }, @@ -379,6 +391,10 @@ static const struct st_sensor_settings st_accel_sensors_settings[] = { .mask_int1 = 0x04, .addr_stat_drdy = ST_SENSORS_DEFAULT_STAT_ADDR, }, + .sim = { + .addr = 0x21, + .value = BIT(1), + }, .multi_read_bit = true, .bootime = 2, /* guess */ }, @@ -437,6 +453,10 @@ static const struct st_sensor_settings st_accel_sensors_settings[] = { .mask_od = 0x40, .addr_stat_drdy = ST_SENSORS_DEFAULT_STAT_ADDR, }, + .sim = { + .addr = 0x21, + .value = BIT(7), + }, .multi_read_bit = false, .bootime = 2, /* guess */ }, @@ -499,6 +519,10 @@ static const struct st_sensor_settings st_accel_sensors_settings[] = { .addr_ihl = 0x22, .mask_ihl = 0x80, }, + .sim = { + .addr = 0x23, + .value = BIT(0), + }, .multi_read_bit = true, .bootime = 2, }, @@ -547,6 +571,10 @@ static const struct st_sensor_settings st_accel_sensors_settings[] = { .mask_int1 = 0x04, .addr_stat_drdy = ST_SENSORS_DEFAULT_STAT_ADDR, }, + .sim = { + .addr = 0x21, + .value = BIT(1), + }, .multi_read_bit = false, .bootime = 2, }, @@ -614,6 +642,10 @@ static const struct st_sensor_settings st_accel_sensors_settings[] = { .mask_ihl = 0x02, .addr_stat_drdy = ST_SENSORS_DEFAULT_STAT_ADDR, }, + .sim = { + .addr = 0x23, + .value = BIT(0), + }, .multi_read_bit = true, .bootime = 2, }, diff --git a/drivers/iio/common/st_sensors/st_sensors_core.c b/drivers/iio/common/st_sensors/st_sensors_core.c index 79c8c7cd70d5..6e6a1ecc99dd 100644 --- a/drivers/iio/common/st_sensors/st_sensors_core.c +++ b/drivers/iio/common/st_sensors/st_sensors_core.c @@ -550,6 +550,31 @@ out: } EXPORT_SYMBOL(st_sensors_read_info_raw); +static int st_sensors_init_interface_mode(struct iio_dev *indio_dev, + const struct st_sensor_settings *sensor_settings) +{ + struct st_sensor_data *sdata = iio_priv(indio_dev); + struct device_node *np = sdata->dev->of_node; + struct st_sensors_platform_data *pdata; + + pdata = (struct st_sensors_platform_data *)sdata->dev->platform_data; + if (((np && of_property_read_bool(np, "spi-3wire")) || + (pdata && pdata->spi_3wire)) && sensor_settings->sim.addr) { + int err; + + err = sdata->tf->write_byte(&sdata->tb, sdata->dev, + sensor_settings->sim.addr, + sensor_settings->sim.value); + if (err < 0) { + dev_err(&indio_dev->dev, + "failed to init interface mode\n"); + return err; + } + } + + return 0; +} + int st_sensors_check_device_support(struct iio_dev *indio_dev, int num_sensors_list, const struct st_sensor_settings *sensor_settings) @@ -574,6 +599,10 @@ int st_sensors_check_device_support(struct iio_dev *indio_dev, return -ENODEV; } + err = st_sensors_init_interface_mode(indio_dev, &sensor_settings[i]); + if (err < 0) + return err; + if (sensor_settings[i].wai_addr) { err = sdata->tf->read_byte(&sdata->tb, sdata->dev, sensor_settings[i].wai_addr, &wai); diff --git a/include/linux/iio/common/st_sensors.h b/include/linux/iio/common/st_sensors.h index 497f2b3a5a62..97f1b465d04f 100644 --- a/include/linux/iio/common/st_sensors.h +++ b/include/linux/iio/common/st_sensors.h @@ -105,6 +105,11 @@ struct st_sensor_fullscale { struct st_sensor_fullscale_avl fs_avl[ST_SENSORS_FULLSCALE_AVL_MAX]; }; +struct st_sensor_sim { + u8 addr; + u8 value; +}; + /** * struct st_sensor_bdu - ST sensor device block data update * @addr: address of the register. @@ -197,6 +202,7 @@ struct st_sensor_transfer_function { * @bdu: Block data update register. * @das: Data Alignment Selection register. * @drdy_irq: Data ready register of the sensor. + * @sim: SPI serial interface mode register of the sensor. * @multi_read_bit: Use or not particular bit for [I2C/SPI] multi-read. * @bootime: samples to discard when sensor passing from power-down to power-up. */ @@ -213,6 +219,7 @@ struct st_sensor_settings { struct st_sensor_bdu bdu; struct st_sensor_das das; struct st_sensor_data_ready_irq drdy_irq; + struct st_sensor_sim sim; bool multi_read_bit; unsigned int bootime; }; diff --git a/include/linux/platform_data/st_sensors_pdata.h b/include/linux/platform_data/st_sensors_pdata.h index 79b0e4cdb814..f8274b0c6888 100644 --- a/include/linux/platform_data/st_sensors_pdata.h +++ b/include/linux/platform_data/st_sensors_pdata.h @@ -17,10 +17,12 @@ * Available only for accelerometer and pressure sensors. * Accelerometer DRDY on LSM330 available only on pin 1 (see datasheet). * @open_drain: set the interrupt line to be open drain if possible. + * @spi_3wire: enable spi-3wire mode. */ struct st_sensors_platform_data { u8 drdy_int_pin; bool open_drain; + bool spi_3wire; }; #endif /* ST_SENSORS_PDATA_H */ -- cgit v1.2.3-71-gd317 From 7cfdfdc82a467c78af9132cb9c98e84415df34bc Mon Sep 17 00:00:00 2001 From: Damien Le Moal Date: Mon, 10 Jul 2017 14:45:20 +0900 Subject: libata: Cleanup ata_read_log_page() The warning message "READ LOG DMA EXT failed, trying unqueued" in ata_read_log_page() as well as the macro name ATA_HORKAGE_NO_NCQ_LOG are confusing: the command READ LOG DMA EXT is not an queued NCQ command unless it is encapsulated in a RECEIVE FPDMA QUEUED command. From ACS-4 READ LOG DMA EXT description: "The device processes the READ LOG DMA EXT command in the NCQ feature set environment (see 4.13.6) if the READ LOG DMA EXT command is encapsulated in a RECEIVE FPDMA QUEUED command (see 7.30) with the inputs encapsulated as shown in 7.23.6." To avoid confusion, fix the warning messsage to mention switching to PIO and not "unqueued" and rename the macro ATA_HORKAGE_NO_NCQ_LOG to ATA_HORKAGE_NO_DMA_LOG. Signed-off-by: Damien Le Moal Reviewed-by: Hannes Reinecke Signed-off-by: Tejun Heo --- drivers/ata/libata-core.c | 6 +++--- include/linux/libata.h | 2 +- 2 files changed, 4 insertions(+), 4 deletions(-) (limited to 'include') diff --git a/drivers/ata/libata-core.c b/drivers/ata/libata-core.c index 8453f9a4682f..fa7dd4394c02 100644 --- a/drivers/ata/libata-core.c +++ b/drivers/ata/libata-core.c @@ -2083,7 +2083,7 @@ unsigned int ata_read_log_page(struct ata_device *dev, u8 log, retry: ata_tf_init(dev, &tf); if (dev->dma_mode && ata_id_has_read_log_dma_ext(dev->id) && - !(dev->horkage & ATA_HORKAGE_NO_NCQ_LOG)) { + !(dev->horkage & ATA_HORKAGE_NO_DMA_LOG)) { tf.command = ATA_CMD_READ_LOG_DMA_EXT; tf.protocol = ATA_PROT_DMA; dma = true; @@ -2102,8 +2102,8 @@ retry: buf, sectors * ATA_SECT_SIZE, 0); if (err_mask && dma) { - dev->horkage |= ATA_HORKAGE_NO_NCQ_LOG; - ata_dev_warn(dev, "READ LOG DMA EXT failed, trying unqueued\n"); + dev->horkage |= ATA_HORKAGE_NO_DMA_LOG; + ata_dev_warn(dev, "READ LOG DMA EXT failed, trying PIO\n"); goto retry; } diff --git a/include/linux/libata.h b/include/linux/libata.h index 55de3da58b1c..931c32f1f18d 100644 --- a/include/linux/libata.h +++ b/include/linux/libata.h @@ -435,7 +435,7 @@ enum { ATA_HORKAGE_NOLPM = (1 << 20), /* don't use LPM */ ATA_HORKAGE_WD_BROKEN_LPM = (1 << 21), /* some WDs have broken LPM */ ATA_HORKAGE_ZERO_AFTER_TRIM = (1 << 22),/* guarantees zero after trim */ - ATA_HORKAGE_NO_NCQ_LOG = (1 << 23), /* don't use NCQ for log read */ + ATA_HORKAGE_NO_DMA_LOG = (1 << 23), /* don't use DMA for log read */ ATA_HORKAGE_NOTRIM = (1 << 24), /* don't use TRIM */ ATA_HORKAGE_MAX_SEC_1024 = (1 << 25), /* Limit max sects to 1024 */ -- cgit v1.2.3-71-gd317 From 2683701201ddb9a2a4d0fda5d950ab50ca8e77e4 Mon Sep 17 00:00:00 2001 From: Rolf Eike Beer Date: Thu, 13 Jul 2017 16:46:43 +0200 Subject: netlink: correctly document nla_put_u64_64bit() Signed-off-by: Rolf Eike Beer Signed-off-by: David S. Miller --- include/net/netlink.h | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) (limited to 'include') diff --git a/include/net/netlink.h b/include/net/netlink.h index 01709172b3d3..ef8e6c3a80a6 100644 --- a/include/net/netlink.h +++ b/include/net/netlink.h @@ -98,8 +98,8 @@ * nla_put_u8(skb, type, value) add u8 attribute to skb * nla_put_u16(skb, type, value) add u16 attribute to skb * nla_put_u32(skb, type, value) add u32 attribute to skb - * nla_put_u64_64bits(skb, type, - * value, padattr) add u64 attribute to skb + * nla_put_u64_64bit(skb, type, + * value, padattr) add u64 attribute to skb * nla_put_s8(skb, type, value) add s8 attribute to skb * nla_put_s16(skb, type, value) add s16 attribute to skb * nla_put_s32(skb, type, value) add s32 attribute to skb -- cgit v1.2.3-71-gd317 From 301bfa483016d48b7fb9cbad87c0a04a15c25b90 Mon Sep 17 00:00:00 2001 From: Trond Myklebust Date: Tue, 11 Jul 2017 17:53:48 -0400 Subject: NFS: Don't run wake_up_bit() when nobody is waiting... "perf lock" shows fairly heavy contention for the bit waitqueue locks when doing an I/O heavy workload. Use a bit to tell whether or not there has been contention for a lock so that we can optimise away the bit waitqueue options in those cases. Signed-off-by: Trond Myklebust Signed-off-by: Anna Schumaker --- fs/nfs/pagelist.c | 17 ++++++++++++++++- include/linux/nfs_page.h | 2 ++ 2 files changed, 18 insertions(+), 1 deletion(-) (limited to 'include') diff --git a/fs/nfs/pagelist.c b/fs/nfs/pagelist.c index 8a23e2b40b04..de9066a92c0d 100644 --- a/fs/nfs/pagelist.c +++ b/fs/nfs/pagelist.c @@ -155,9 +155,12 @@ nfs_page_group_lock(struct nfs_page *req, bool nonblock) if (!test_and_set_bit(PG_HEADLOCK, &head->wb_flags)) return 0; - if (!nonblock) + if (!nonblock) { + set_bit(PG_CONTENDED1, &head->wb_flags); + smp_mb__after_atomic(); return wait_on_bit_lock(&head->wb_flags, PG_HEADLOCK, TASK_UNINTERRUPTIBLE); + } return -EAGAIN; } @@ -175,6 +178,10 @@ nfs_page_group_lock_wait(struct nfs_page *req) WARN_ON_ONCE(head != head->wb_head); + if (!test_bit(PG_HEADLOCK, &head->wb_flags)) + return; + set_bit(PG_CONTENDED1, &head->wb_flags); + smp_mb__after_atomic(); wait_on_bit(&head->wb_flags, PG_HEADLOCK, TASK_UNINTERRUPTIBLE); } @@ -193,6 +200,8 @@ nfs_page_group_unlock(struct nfs_page *req) smp_mb__before_atomic(); clear_bit(PG_HEADLOCK, &head->wb_flags); smp_mb__after_atomic(); + if (!test_bit(PG_CONTENDED1, &head->wb_flags)) + return; wake_up_bit(&head->wb_flags, PG_HEADLOCK); } @@ -383,6 +392,8 @@ void nfs_unlock_request(struct nfs_page *req) smp_mb__before_atomic(); clear_bit(PG_BUSY, &req->wb_flags); smp_mb__after_atomic(); + if (!test_bit(PG_CONTENDED2, &req->wb_flags)) + return; wake_up_bit(&req->wb_flags, PG_BUSY); } @@ -465,6 +476,10 @@ void nfs_release_request(struct nfs_page *req) int nfs_wait_on_request(struct nfs_page *req) { + if (!test_bit(PG_BUSY, &req->wb_flags)) + return 0; + set_bit(PG_CONTENDED2, &req->wb_flags); + smp_mb__after_atomic(); return wait_on_bit_io(&req->wb_flags, PG_BUSY, TASK_UNINTERRUPTIBLE); } diff --git a/include/linux/nfs_page.h b/include/linux/nfs_page.h index abbee2d15dce..d67b67ae6c8b 100644 --- a/include/linux/nfs_page.h +++ b/include/linux/nfs_page.h @@ -33,6 +33,8 @@ enum { PG_UPTODATE, /* page group sync bit in read path */ PG_WB_END, /* page group sync bit in write path */ PG_REMOVE, /* page group sync bit in write path */ + PG_CONTENDED1, /* Is someone waiting for a lock? */ + PG_CONTENDED2, /* Is someone waiting for a lock? */ }; struct nfs_inode; -- cgit v1.2.3-71-gd317 From 2b02c20ce0c28974b44e69a2e2f5ddc6a470ad6f Mon Sep 17 00:00:00 2001 From: Enrico Mioso Date: Tue, 11 Jul 2017 17:21:52 +0200 Subject: cdc_ncm: Set NTB format again after altsetting switch for Huawei devices MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Some firmwares in Huawei E3372H devices have been observed to switch back to NTB 32-bit format after altsetting switch. This patch implements a driver flag to check for the device settings and set NTB format to 16-bit again if needed. The flag has been activated for devices controlled by the huawei_cdc_ncm.c driver. V1->V2: - fixed broken error checks - some corrections to the commit message V2->V3: - variable name changes, to clarify what's happening - check (and possibly set) the NTB format later in the common bind code path Signed-off-by: Enrico Mioso Reported-and-tested-by: Christian Panton Reviewed-by: Bjørn Mork CC: Bjørn Mork CC: Christian Panton CC: linux-usb@vger.kernel.org CC: netdev@vger.kernel.org CC: Oliver Neukum Signed-off-by: David S. Miller --- drivers/net/usb/cdc_ncm.c | 28 ++++++++++++++++++++++++++++ drivers/net/usb/huawei_cdc_ncm.c | 6 ++++++ include/linux/usb/cdc_ncm.h | 1 + 3 files changed, 35 insertions(+) (limited to 'include') diff --git a/drivers/net/usb/cdc_ncm.c b/drivers/net/usb/cdc_ncm.c index d103a1d4fb36..8f572b9f3625 100644 --- a/drivers/net/usb/cdc_ncm.c +++ b/drivers/net/usb/cdc_ncm.c @@ -768,8 +768,10 @@ int cdc_ncm_bind_common(struct usbnet *dev, struct usb_interface *intf, u8 data_ u8 *buf; int len; int temp; + int err; u8 iface_no; struct usb_cdc_parsed_header hdr; + u16 curr_ntb_format; ctx = kzalloc(sizeof(*ctx), GFP_KERNEL); if (!ctx) @@ -874,6 +876,32 @@ int cdc_ncm_bind_common(struct usbnet *dev, struct usb_interface *intf, u8 data_ goto error2; } + /* + * Some Huawei devices have been observed to come out of reset in NDP32 mode. + * Let's check if this is the case, and set the device to NDP16 mode again if + * needed. + */ + if (ctx->drvflags & CDC_NCM_FLAG_RESET_NTB16) { + err = usbnet_read_cmd(dev, USB_CDC_GET_NTB_FORMAT, + USB_TYPE_CLASS | USB_DIR_IN | USB_RECIP_INTERFACE, + 0, iface_no, &curr_ntb_format, 2); + if (err < 0) { + goto error2; + } + + if (curr_ntb_format == USB_CDC_NCM_NTB32_FORMAT) { + dev_info(&intf->dev, "resetting NTB format to 16-bit"); + err = usbnet_write_cmd(dev, USB_CDC_SET_NTB_FORMAT, + USB_TYPE_CLASS | USB_DIR_OUT + | USB_RECIP_INTERFACE, + USB_CDC_NCM_NTB16_FORMAT, + iface_no, NULL, 0); + + if (err < 0) + goto error2; + } + } + cdc_ncm_find_endpoints(dev, ctx->data); cdc_ncm_find_endpoints(dev, ctx->control); if (!dev->in || !dev->out || !dev->status) { diff --git a/drivers/net/usb/huawei_cdc_ncm.c b/drivers/net/usb/huawei_cdc_ncm.c index 2680a65cd5e4..63f28908afda 100644 --- a/drivers/net/usb/huawei_cdc_ncm.c +++ b/drivers/net/usb/huawei_cdc_ncm.c @@ -80,6 +80,12 @@ static int huawei_cdc_ncm_bind(struct usbnet *usbnet_dev, * be at the end of the frame. */ drvflags |= CDC_NCM_FLAG_NDP_TO_END; + + /* Additionally, it has been reported that some Huawei E3372H devices, with + * firmware version 21.318.01.00.541, come out of reset in NTB32 format mode, hence + * needing to be set to the NTB16 one again. + */ + drvflags |= CDC_NCM_FLAG_RESET_NTB16; ret = cdc_ncm_bind_common(usbnet_dev, intf, 1, drvflags); if (ret) goto err; diff --git a/include/linux/usb/cdc_ncm.h b/include/linux/usb/cdc_ncm.h index 021f7a88f52c..1a59699cf82a 100644 --- a/include/linux/usb/cdc_ncm.h +++ b/include/linux/usb/cdc_ncm.h @@ -83,6 +83,7 @@ /* Driver flags */ #define CDC_NCM_FLAG_NDP_TO_END 0x02 /* NDP is placed at end of frame */ #define CDC_MBIM_FLAG_AVOID_ALTSETTING_TOGGLE 0x04 /* Avoid altsetting toggle during init */ +#define CDC_NCM_FLAG_RESET_NTB16 0x08 /* set NDP16 one more time after altsetting switch */ #define cdc_ncm_comm_intf_is_mbim(x) ((x)->desc.bInterfaceSubClass == USB_CDC_SUBCLASS_MBIM && \ (x)->desc.bInterfaceProtocol == USB_CDC_PROTO_NONE) -- cgit v1.2.3-71-gd317 From 76250f2b743b72cb685cc51ac0cdabb32957180b Mon Sep 17 00:00:00 2001 From: Chris Wilson Date: Tue, 14 Feb 2017 12:40:01 +0000 Subject: dma-buf/fence: Avoid use of uninitialised timestamp [ 236.821534] WARNING: kmemcheck: Caught 64-bit read from uninitialized memory (ffff8802538683d0) [ 236.828642] 420000001e7f0000000000000000000000080000000000000000000000000000 [ 236.839543] i i i i u u u u i i i i i i i i u u u u u u u u u u u u u u u u [ 236.850420] ^ [ 236.854123] RIP: 0010:[] [] fence_signal+0x17/0xd0 [ 236.861313] RSP: 0018:ffff88024acd7ba0 EFLAGS: 00010282 [ 236.865027] RAX: ffffffff812f6a90 RBX: ffff8802527ca800 RCX: ffff880252cb30e0 [ 236.868801] RDX: ffff88024ac5d918 RSI: ffff880252f780e0 RDI: ffff880253868380 [ 236.872579] RBP: ffff88024acd7bc0 R08: ffff88024acd7be0 R09: 0000000000000000 [ 236.876407] R10: 0000000000000000 R11: 0000000000000000 R12: ffff880253868380 [ 236.880185] R13: ffff8802538684d0 R14: ffff880253868380 R15: ffff88024cd48e00 [ 236.883983] FS: 00007f1646d1a740(0000) GS:ffff88025d000000(0000) knlGS:0000000000000000 [ 236.890959] CS: 0010 DS: 0000 ES: 0000 CR0: 0000000080050033 [ 236.894702] CR2: ffff880251360318 CR3: 000000024ad21000 CR4: 00000000001406f0 [ 236.898481] [] i915_gem_request_retire+0x1cd/0x230 [ 236.902439] [] i915_gem_request_alloc+0xa3/0x2f0 [ 236.906435] [] i915_gem_do_execbuffer.isra.41+0xb6d/0x18b0 [ 236.910434] [] i915_gem_execbuffer2+0x95/0x1e0 [ 236.914390] [] drm_ioctl+0x1e5/0x460 [ 236.918275] [] do_vfs_ioctl+0x8f/0x5c0 [ 236.922168] [] SyS_ioctl+0x3c/0x70 [ 236.926090] [] entry_SYSCALL_64_fastpath+0x17/0x93 [ 236.930045] [] 0xffffffffffffffff We only set the timestamp before we mark the fence as signaled. It is done before to avoid observers having a window in which they may see the fence as complete but no timestamp. Having it does incur a potential for the timestamp to be written twice, and even for it to be corrupted if the u64 write is not atomic. Instead use a new bit to record the presence of the timestamp, and teach the readers to wait until it is set if the fence is complete. There still remains a race where the timestamp for the signaled fence may be shown before the fence is reported as signaled, but that's a pre-existing error. Signed-off-by: Chris Wilson Cc: Sumit Semwal Cc: Gustavo Padovan Cc: Daniel Vetter Reported-by: Rafael Antognolli Signed-off-by: Gustavo Padovan Link: http://patchwork.freedesktop.org/patch/msgid/20170214124001.1930-1-chris@chris-wilson.co.uk --- drivers/dma-buf/dma-fence.c | 17 ++++++----------- drivers/dma-buf/sync_debug.c | 2 +- drivers/dma-buf/sync_file.c | 8 +++++++- include/linux/dma-fence.h | 2 ++ 4 files changed, 16 insertions(+), 13 deletions(-) (limited to 'include') diff --git a/drivers/dma-buf/dma-fence.c b/drivers/dma-buf/dma-fence.c index 0918d3f003d6..13556fdda2a5 100644 --- a/drivers/dma-buf/dma-fence.c +++ b/drivers/dma-buf/dma-fence.c @@ -75,11 +75,6 @@ int dma_fence_signal_locked(struct dma_fence *fence) if (WARN_ON(!fence)) return -EINVAL; - if (!ktime_to_ns(fence->timestamp)) { - fence->timestamp = ktime_get(); - smp_mb__before_atomic(); - } - if (test_and_set_bit(DMA_FENCE_FLAG_SIGNALED_BIT, &fence->flags)) { ret = -EINVAL; @@ -87,8 +82,11 @@ int dma_fence_signal_locked(struct dma_fence *fence) * we might have raced with the unlocked dma_fence_signal, * still run through all callbacks */ - } else + } else { + fence->timestamp = ktime_get(); + set_bit(DMA_FENCE_FLAG_TIMESTAMP_BIT, &fence->flags); trace_dma_fence_signaled(fence); + } list_for_each_entry_safe(cur, tmp, &fence->cb_list, node) { list_del_init(&cur->node); @@ -115,14 +113,11 @@ int dma_fence_signal(struct dma_fence *fence) if (!fence) return -EINVAL; - if (!ktime_to_ns(fence->timestamp)) { - fence->timestamp = ktime_get(); - smp_mb__before_atomic(); - } - if (test_and_set_bit(DMA_FENCE_FLAG_SIGNALED_BIT, &fence->flags)) return -EINVAL; + fence->timestamp = ktime_get(); + set_bit(DMA_FENCE_FLAG_TIMESTAMP_BIT, &fence->flags); trace_dma_fence_signaled(fence); if (test_bit(DMA_FENCE_FLAG_ENABLE_SIGNAL_BIT, &fence->flags)) { diff --git a/drivers/dma-buf/sync_debug.c b/drivers/dma-buf/sync_debug.c index c769dc653b34..bfead12390f2 100644 --- a/drivers/dma-buf/sync_debug.c +++ b/drivers/dma-buf/sync_debug.c @@ -84,7 +84,7 @@ static void sync_print_fence(struct seq_file *s, show ? "_" : "", sync_status_str(status)); - if (status) { + if (test_bit(DMA_FENCE_FLAG_TIMESTAMP_BIT, &fence->flags)) { struct timespec64 ts64 = ktime_to_timespec64(fence->timestamp); diff --git a/drivers/dma-buf/sync_file.c b/drivers/dma-buf/sync_file.c index 2321035f6204..95f259b719fc 100644 --- a/drivers/dma-buf/sync_file.c +++ b/drivers/dma-buf/sync_file.c @@ -375,7 +375,13 @@ static void sync_fill_fence_info(struct dma_fence *fence, sizeof(info->driver_name)); info->status = dma_fence_get_status(fence); - info->timestamp_ns = ktime_to_ns(fence->timestamp); + while (test_bit(DMA_FENCE_FLAG_SIGNALED_BIT, &fence->flags) && + !test_bit(DMA_FENCE_FLAG_TIMESTAMP_BIT, &fence->flags)) + cpu_relax(); + info->timestamp_ns = + test_bit(DMA_FENCE_FLAG_TIMESTAMP_BIT, &fence->flags) ? + ktime_to_ns(fence->timestamp) : + ktime_set(0, 0); } static long sync_file_ioctl_fence_info(struct sync_file *sync_file, diff --git a/include/linux/dma-fence.h b/include/linux/dma-fence.h index a5195a7d6f77..0a186c4f3981 100644 --- a/include/linux/dma-fence.h +++ b/include/linux/dma-fence.h @@ -55,6 +55,7 @@ struct dma_fence_cb; * of the time. * * DMA_FENCE_FLAG_SIGNALED_BIT - fence is already signaled + * DMA_FENCE_FLAG_TIMESTAMP_BIT - timestamp recorded for fence signaling * DMA_FENCE_FLAG_ENABLE_SIGNAL_BIT - enable_signaling might have been called * DMA_FENCE_FLAG_USER_BITS - start of the unused bits, can be used by the * implementer of the fence for its own purposes. Can be used in different @@ -84,6 +85,7 @@ struct dma_fence { enum dma_fence_flag_bits { DMA_FENCE_FLAG_SIGNALED_BIT, + DMA_FENCE_FLAG_TIMESTAMP_BIT, DMA_FENCE_FLAG_ENABLE_SIGNAL_BIT, DMA_FENCE_FLAG_USER_BITS, /* must always be last member */ }; -- cgit v1.2.3-71-gd317 From b1f5bfc27a19f214006b9b4db7b9126df2dfdf5a Mon Sep 17 00:00:00 2001 From: Alexander Potapenko Date: Fri, 14 Jul 2017 18:32:45 +0200 Subject: sctp: don't dereference ptr before leaving _sctp_walk_{params, errors}() If the length field of the iterator (|pos.p| or |err|) is past the end of the chunk, we shouldn't access it. This bug has been detected by KMSAN. For the following pair of system calls: socket(PF_INET6, SOCK_STREAM, 0x84 /* IPPROTO_??? */) = 3 sendto(3, "A", 1, MSG_OOB, {sa_family=AF_INET6, sin6_port=htons(0), inet_pton(AF_INET6, "::1", &sin6_addr), sin6_flowinfo=0, sin6_scope_id=0}, 28) = 1 the tool has reported a use of uninitialized memory: ================================================================== BUG: KMSAN: use of uninitialized memory in sctp_rcv+0x17b8/0x43b0 CPU: 1 PID: 2940 Comm: probe Not tainted 4.11.0-rc5+ #2926 Hardware name: QEMU Standard PC (i440FX + PIIX, 1996), BIOS Bochs 01/01/2011 Call Trace: __dump_stack lib/dump_stack.c:16 dump_stack+0x172/0x1c0 lib/dump_stack.c:52 kmsan_report+0x12a/0x180 mm/kmsan/kmsan.c:927 __msan_warning_32+0x61/0xb0 mm/kmsan/kmsan_instr.c:469 __sctp_rcv_init_lookup net/sctp/input.c:1074 __sctp_rcv_lookup_harder net/sctp/input.c:1233 __sctp_rcv_lookup net/sctp/input.c:1255 sctp_rcv+0x17b8/0x43b0 net/sctp/input.c:170 sctp6_rcv+0x32/0x70 net/sctp/ipv6.c:984 ip6_input_finish+0x82f/0x1ee0 net/ipv6/ip6_input.c:279 NF_HOOK ./include/linux/netfilter.h:257 ip6_input+0x239/0x290 net/ipv6/ip6_input.c:322 dst_input ./include/net/dst.h:492 ip6_rcv_finish net/ipv6/ip6_input.c:69 NF_HOOK ./include/linux/netfilter.h:257 ipv6_rcv+0x1dbd/0x22e0 net/ipv6/ip6_input.c:203 __netif_receive_skb_core+0x2f6f/0x3a20 net/core/dev.c:4208 __netif_receive_skb net/core/dev.c:4246 process_backlog+0x667/0xba0 net/core/dev.c:4866 napi_poll net/core/dev.c:5268 net_rx_action+0xc95/0x1590 net/core/dev.c:5333 __do_softirq+0x485/0x942 kernel/softirq.c:284 do_softirq_own_stack+0x1c/0x30 arch/x86/entry/entry_64.S:902 do_softirq kernel/softirq.c:328 __local_bh_enable_ip+0x25b/0x290 kernel/softirq.c:181 local_bh_enable+0x37/0x40 ./include/linux/bottom_half.h:31 rcu_read_unlock_bh ./include/linux/rcupdate.h:931 ip6_finish_output2+0x19b2/0x1cf0 net/ipv6/ip6_output.c:124 ip6_finish_output+0x764/0x970 net/ipv6/ip6_output.c:149 NF_HOOK_COND ./include/linux/netfilter.h:246 ip6_output+0x456/0x520 net/ipv6/ip6_output.c:163 dst_output ./include/net/dst.h:486 NF_HOOK ./include/linux/netfilter.h:257 ip6_xmit+0x1841/0x1c00 net/ipv6/ip6_output.c:261 sctp_v6_xmit+0x3b7/0x470 net/sctp/ipv6.c:225 sctp_packet_transmit+0x38cb/0x3a20 net/sctp/output.c:632 sctp_outq_flush+0xeb3/0x46e0 net/sctp/outqueue.c:885 sctp_outq_uncork+0xb2/0xd0 net/sctp/outqueue.c:750 sctp_side_effects net/sctp/sm_sideeffect.c:1773 sctp_do_sm+0x6962/0x6ec0 net/sctp/sm_sideeffect.c:1147 sctp_primitive_ASSOCIATE+0x12c/0x160 net/sctp/primitive.c:88 sctp_sendmsg+0x43e5/0x4f90 net/sctp/socket.c:1954 inet_sendmsg+0x498/0x670 net/ipv4/af_inet.c:762 sock_sendmsg_nosec net/socket.c:633 sock_sendmsg net/socket.c:643 SYSC_sendto+0x608/0x710 net/socket.c:1696 SyS_sendto+0x8a/0xb0 net/socket.c:1664 do_syscall_64+0xe6/0x130 arch/x86/entry/common.c:285 entry_SYSCALL64_slow_path+0x25/0x25 arch/x86/entry/entry_64.S:246 RIP: 0033:0x401133 RSP: 002b:00007fff6d99cd38 EFLAGS: 00000246 ORIG_RAX: 000000000000002c RAX: ffffffffffffffda RBX: 00000000004002b0 RCX: 0000000000401133 RDX: 0000000000000001 RSI: 0000000000494088 RDI: 0000000000000003 RBP: 00007fff6d99cd90 R08: 00007fff6d99cd50 R09: 000000000000001c R10: 0000000000000001 R11: 0000000000000246 R12: 0000000000000000 R13: 00000000004063d0 R14: 0000000000406460 R15: 0000000000000000 origin: save_stack_trace+0x37/0x40 arch/x86/kernel/stacktrace.c:59 kmsan_save_stack_with_flags mm/kmsan/kmsan.c:302 kmsan_internal_poison_shadow+0xb1/0x1a0 mm/kmsan/kmsan.c:198 kmsan_poison_shadow+0x6d/0xc0 mm/kmsan/kmsan.c:211 slab_alloc_node mm/slub.c:2743 __kmalloc_node_track_caller+0x200/0x360 mm/slub.c:4351 __kmalloc_reserve net/core/skbuff.c:138 __alloc_skb+0x26b/0x840 net/core/skbuff.c:231 alloc_skb ./include/linux/skbuff.h:933 sctp_packet_transmit+0x31e/0x3a20 net/sctp/output.c:570 sctp_outq_flush+0xeb3/0x46e0 net/sctp/outqueue.c:885 sctp_outq_uncork+0xb2/0xd0 net/sctp/outqueue.c:750 sctp_side_effects net/sctp/sm_sideeffect.c:1773 sctp_do_sm+0x6962/0x6ec0 net/sctp/sm_sideeffect.c:1147 sctp_primitive_ASSOCIATE+0x12c/0x160 net/sctp/primitive.c:88 sctp_sendmsg+0x43e5/0x4f90 net/sctp/socket.c:1954 inet_sendmsg+0x498/0x670 net/ipv4/af_inet.c:762 sock_sendmsg_nosec net/socket.c:633 sock_sendmsg net/socket.c:643 SYSC_sendto+0x608/0x710 net/socket.c:1696 SyS_sendto+0x8a/0xb0 net/socket.c:1664 do_syscall_64+0xe6/0x130 arch/x86/entry/common.c:285 return_from_SYSCALL_64+0x0/0x6a arch/x86/entry/entry_64.S:246 ================================================================== Signed-off-by: Alexander Potapenko Signed-off-by: David S. Miller --- include/net/sctp/sctp.h | 4 ++++ 1 file changed, 4 insertions(+) (limited to 'include') diff --git a/include/net/sctp/sctp.h b/include/net/sctp/sctp.h index a9519a06a23b..980807d7506f 100644 --- a/include/net/sctp/sctp.h +++ b/include/net/sctp/sctp.h @@ -469,6 +469,8 @@ _sctp_walk_params((pos), (chunk), ntohs((chunk)->chunk_hdr.length), member) #define _sctp_walk_params(pos, chunk, end, member)\ for (pos.v = chunk->member;\ + (pos.v + offsetof(struct sctp_paramhdr, length) + sizeof(pos.p->length) <\ + (void *)chunk + end) &&\ pos.v <= (void *)chunk + end - ntohs(pos.p->length) &&\ ntohs(pos.p->length) >= sizeof(struct sctp_paramhdr);\ pos.v += SCTP_PAD4(ntohs(pos.p->length))) @@ -479,6 +481,8 @@ _sctp_walk_errors((err), (chunk_hdr), ntohs((chunk_hdr)->length)) #define _sctp_walk_errors(err, chunk_hdr, end)\ for (err = (sctp_errhdr_t *)((void *)chunk_hdr + \ sizeof(struct sctp_chunkhdr));\ + ((void *)err + offsetof(sctp_errhdr_t, length) + sizeof(err->length) <\ + (void *)chunk_hdr + end) &&\ (void *)err <= (void *)chunk_hdr + end - ntohs(err->length) &&\ ntohs(err->length) >= sizeof(sctp_errhdr_t); \ err = (sctp_errhdr_t *)((void *)err + SCTP_PAD4(ntohs(err->length)))) -- cgit v1.2.3-71-gd317 From e67ae2b7b23b283e657865b498b151e6a17b919d Mon Sep 17 00:00:00 2001 From: Arnd Bergmann Date: Mon, 10 Jul 2017 13:17:26 +0200 Subject: libceph: fix old style declaration warnings The new macros don't follow the usual style for declarations, which we get a warning for with 'make W=1': In file included from fs/ceph/mds_client.c:16:0: include/linux/ceph/ceph_features.h:74:1: error: 'static' is not at beginning of declaration [-Werror=old-style-declaration] This moves the 'static' keyword to the front of the declaration. Fixes: f179d3ba8cb9 ("libceph: new features macros") Signed-off-by: Arnd Bergmann Signed-off-by: Ilya Dryomov --- include/linux/ceph/ceph_features.h | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) (limited to 'include') diff --git a/include/linux/ceph/ceph_features.h b/include/linux/ceph/ceph_features.h index f0f6c537b64c..040dd105c3e7 100644 --- a/include/linux/ceph/ceph_features.h +++ b/include/linux/ceph/ceph_features.h @@ -10,14 +10,14 @@ #define CEPH_FEATURE_INCARNATION_2 (1ull<<57) // CEPH_FEATURE_SERVER_JEWEL #define DEFINE_CEPH_FEATURE(bit, incarnation, name) \ - const static uint64_t CEPH_FEATURE_##name = (1ULL< Date: Fri, 14 Jul 2017 18:16:44 +0530 Subject: mmc: host: omap_hsmmc: remove unused platform callbacks Remove unused callbacks in the omap_hsmmc_platform_data structure Signed-off-by: Faiz Abbas Acked-by: Tony Lindgren Signed-off-by: Ulf Hansson --- drivers/mmc/host/omap_hsmmc.c | 11 ----------- include/linux/platform_data/hsmmc-omap.h | 10 ---------- 2 files changed, 21 deletions(-) (limited to 'include') diff --git a/drivers/mmc/host/omap_hsmmc.c b/drivers/mmc/host/omap_hsmmc.c index 7c12f3715676..04ff3c97a535 100644 --- a/drivers/mmc/host/omap_hsmmc.c +++ b/drivers/mmc/host/omap_hsmmc.c @@ -356,9 +356,6 @@ static int omap_hsmmc_set_power(struct omap_hsmmc_host *host, int power_on, struct mmc_host *mmc = host->mmc; int ret = 0; - if (mmc_pdata(host)->set_power) - return mmc_pdata(host)->set_power(host->dev, power_on, vdd); - /* * If we don't see a Vcc regulator, assume it's a fixed * voltage always-on regulator. @@ -366,9 +363,6 @@ static int omap_hsmmc_set_power(struct omap_hsmmc_host *host, int power_on, if (IS_ERR(mmc->supply.vmmc)) return 0; - if (mmc_pdata(host)->before_set_reg) - mmc_pdata(host)->before_set_reg(host->dev, power_on, vdd); - ret = omap_hsmmc_set_pbias(host, false, 0); if (ret) return ret; @@ -400,9 +394,6 @@ static int omap_hsmmc_set_power(struct omap_hsmmc_host *host, int power_on, return ret; } - if (mmc_pdata(host)->after_set_reg) - mmc_pdata(host)->after_set_reg(host->dev, power_on, vdd); - return 0; err_set_voltage: @@ -469,8 +460,6 @@ static int omap_hsmmc_reg_get(struct omap_hsmmc_host *host) int ret; struct mmc_host *mmc = host->mmc; - if (mmc_pdata(host)->set_power) - return 0; ret = mmc_regulator_get_supply(mmc); if (ret == -EPROBE_DEFER) diff --git a/include/linux/platform_data/hsmmc-omap.h b/include/linux/platform_data/hsmmc-omap.h index 8e981be2e2c2..0ff1e0dba720 100644 --- a/include/linux/platform_data/hsmmc-omap.h +++ b/include/linux/platform_data/hsmmc-omap.h @@ -55,9 +55,6 @@ struct omap_hsmmc_platform_data { u32 caps; /* Used for the MMC driver on 2430 and later */ u32 pm_caps; /* PM capabilities of the mmc */ - /* use the internal clock */ - unsigned internal_clock:1; - /* nonremovable e.g. eMMC */ unsigned nonremovable:1; @@ -73,13 +70,6 @@ struct omap_hsmmc_platform_data { int gpio_cd; /* gpio (card detect) */ int gpio_cod; /* gpio (cover detect) */ int gpio_wp; /* gpio (write protect) */ - - int (*set_power)(struct device *dev, int power_on, int vdd); - void (*remux)(struct device *dev, int power_on); - /* Call back before enabling / disabling regulators */ - void (*before_set_reg)(struct device *dev, int power_on, int vdd); - /* Call back after enabling / disabling regulators */ - void (*after_set_reg)(struct device *dev, int power_on, int vdd); /* if we have special card, init it using this callback */ void (*init_card)(struct mmc_card *card); -- cgit v1.2.3-71-gd317 From c641e5b207ed7dfaa692820aeb5b6dde3de3e9b0 Mon Sep 17 00:00:00 2001 From: Johan Hovold Date: Wed, 12 Jul 2017 17:55:29 +0200 Subject: ASoC: fix pcm-creation regression This reverts commit 99b04f4c4051 ("ASoC: add Component level pcm_new/pcm_free"), which started calling the pcm_new callback for every component in a *card* when creating a new pcm, something which does not seem to make any sense. This specifically led to memory leaks in systems with more than one platform component and where DMA memory is allocated in the platform-driver callback. For example, when both mcasp devices are being used on an am335x board, DMA memory would be allocated twice for every DAI link during probe. When CONFIG_SND_VERBOSE_PROCFS was set this fortunately also led to warnings such as: WARNING: CPU: 0 PID: 565 at ../fs/proc/generic.c:346 proc_register+0x110/0x154 proc_dir_entry 'sub0/prealloc' already registered Since there seems to be no users of the new component callbacks, and the current implementation introduced a regression, let's revert the offending commit for now. Fixes: 99b04f4c4051 ("ASoC: add Component level pcm_new/pcm_free") Signed-off-by: Johan Hovold Reviewed-by: Linus Walleij Tested-by: Linus Walleij Signed-off-by: Mark Brown Cc: stable # 4.10 --- include/sound/soc.h | 6 ------ sound/soc/soc-core.c | 25 ------------------------- sound/soc/soc-pcm.c | 32 +++++++++----------------------- 3 files changed, 9 insertions(+), 54 deletions(-) (limited to 'include') diff --git a/include/sound/soc.h b/include/sound/soc.h index 9c94b97c17f8..c4a8b1947566 100644 --- a/include/sound/soc.h +++ b/include/sound/soc.h @@ -795,10 +795,6 @@ struct snd_soc_component_driver { int (*suspend)(struct snd_soc_component *); int (*resume)(struct snd_soc_component *); - /* pcm creation and destruction */ - int (*pcm_new)(struct snd_soc_pcm_runtime *); - void (*pcm_free)(struct snd_pcm *); - /* DT */ int (*of_xlate_dai_name)(struct snd_soc_component *component, struct of_phandle_args *args, @@ -874,8 +870,6 @@ struct snd_soc_component { void (*remove)(struct snd_soc_component *); int (*suspend)(struct snd_soc_component *); int (*resume)(struct snd_soc_component *); - int (*pcm_new)(struct snd_soc_pcm_runtime *); - void (*pcm_free)(struct snd_pcm *); /* machine specific init */ int (*init)(struct snd_soc_component *component); diff --git a/sound/soc/soc-core.c b/sound/soc/soc-core.c index 921622a01944..c240e13ba057 100644 --- a/sound/soc/soc-core.c +++ b/sound/soc/soc-core.c @@ -3171,8 +3171,6 @@ static int snd_soc_component_initialize(struct snd_soc_component *component, component->remove = component->driver->remove; component->suspend = component->driver->suspend; component->resume = component->driver->resume; - component->pcm_new = component->driver->pcm_new; - component->pcm_free = component->driver->pcm_free; dapm = &component->dapm; dapm->dev = dev; @@ -3360,25 +3358,6 @@ static void snd_soc_platform_drv_remove(struct snd_soc_component *component) platform->driver->remove(platform); } -static int snd_soc_platform_drv_pcm_new(struct snd_soc_pcm_runtime *rtd) -{ - struct snd_soc_platform *platform = rtd->platform; - - if (platform->driver->pcm_new) - return platform->driver->pcm_new(rtd); - else - return 0; -} - -static void snd_soc_platform_drv_pcm_free(struct snd_pcm *pcm) -{ - struct snd_soc_pcm_runtime *rtd = pcm->private_data; - struct snd_soc_platform *platform = rtd->platform; - - if (platform->driver->pcm_free) - platform->driver->pcm_free(pcm); -} - /** * snd_soc_add_platform - Add a platform to the ASoC core * @dev: The parent device for the platform @@ -3402,10 +3381,6 @@ int snd_soc_add_platform(struct device *dev, struct snd_soc_platform *platform, platform->component.probe = snd_soc_platform_drv_probe; if (platform_drv->remove) platform->component.remove = snd_soc_platform_drv_remove; - if (platform_drv->pcm_new) - platform->component.pcm_new = snd_soc_platform_drv_pcm_new; - if (platform_drv->pcm_free) - platform->component.pcm_free = snd_soc_platform_drv_pcm_free; #ifdef CONFIG_DEBUG_FS platform->component.debugfs_prefix = "platform"; diff --git a/sound/soc/soc-pcm.c b/sound/soc/soc-pcm.c index dcc5ece08668..553f7a76743c 100644 --- a/sound/soc/soc-pcm.c +++ b/sound/soc/soc-pcm.c @@ -2628,25 +2628,12 @@ static int dpcm_fe_dai_close(struct snd_pcm_substream *fe_substream) return ret; } -static void soc_pcm_free(struct snd_pcm *pcm) -{ - struct snd_soc_pcm_runtime *rtd = pcm->private_data; - struct snd_soc_component *component; - - list_for_each_entry(component, &rtd->card->component_dev_list, - card_list) { - if (component->pcm_free) - component->pcm_free(pcm); - } -} - /* create a new pcm */ int soc_new_pcm(struct snd_soc_pcm_runtime *rtd, int num) { struct snd_soc_platform *platform = rtd->platform; struct snd_soc_dai *codec_dai; struct snd_soc_dai *cpu_dai = rtd->cpu_dai; - struct snd_soc_component *component; struct snd_pcm *pcm; char new_name[64]; int ret = 0, playback = 0, capture = 0; @@ -2756,18 +2743,17 @@ int soc_new_pcm(struct snd_soc_pcm_runtime *rtd, int num) if (capture) snd_pcm_set_ops(pcm, SNDRV_PCM_STREAM_CAPTURE, &rtd->ops); - list_for_each_entry(component, &rtd->card->component_dev_list, card_list) { - if (component->pcm_new) { - ret = component->pcm_new(rtd); - if (ret < 0) { - dev_err(component->dev, - "ASoC: pcm constructor failed: %d\n", - ret); - return ret; - } + if (platform->driver->pcm_new) { + ret = platform->driver->pcm_new(rtd); + if (ret < 0) { + dev_err(platform->dev, + "ASoC: pcm constructor failed: %d\n", + ret); + return ret; } } - pcm->private_free = soc_pcm_free; + + pcm->private_free = platform->driver->pcm_free; out: dev_info(rtd->card->dev, "%s <-> %s mapping ok\n", (rtd->num_codecs > 1) ? "multicodec" : rtd->codec_dai->name, -- cgit v1.2.3-71-gd317 From cf56c2f892a8a1870a8358114ad896772da7543a Mon Sep 17 00:00:00 2001 From: Florian Westphal Date: Thu, 6 Jul 2017 23:17:44 +0200 Subject: netfilter: remove old pre-netns era hook api no more users in the tree, remove this. The old api is racy wrt. module removal, all users have been converted to the netns-aware api. The old api pretended we still have global hooks but that has not been true for a long time. Signed-off-by: Florian Westphal Signed-off-by: Pablo Neira Ayuso --- include/linux/netfilter.h | 9 --- net/netfilter/core.c | 143 ---------------------------------------------- 2 files changed, 152 deletions(-) (limited to 'include') diff --git a/include/linux/netfilter.h b/include/linux/netfilter.h index a4b97be30b28..22f081065d49 100644 --- a/include/linux/netfilter.h +++ b/include/linux/netfilter.h @@ -61,8 +61,6 @@ typedef unsigned int nf_hookfn(void *priv, struct sk_buff *skb, const struct nf_hook_state *state); struct nf_hook_ops { - struct list_head list; - /* User fills in from here down. */ nf_hookfn *hook; struct net_device *dev; @@ -160,13 +158,6 @@ int nf_register_net_hooks(struct net *net, const struct nf_hook_ops *reg, void nf_unregister_net_hooks(struct net *net, const struct nf_hook_ops *reg, unsigned int n); -int nf_register_hook(struct nf_hook_ops *reg); -void nf_unregister_hook(struct nf_hook_ops *reg); -int nf_register_hooks(struct nf_hook_ops *reg, unsigned int n); -void nf_unregister_hooks(struct nf_hook_ops *reg, unsigned int n); -int _nf_register_hooks(struct nf_hook_ops *reg, unsigned int n); -void _nf_unregister_hooks(struct nf_hook_ops *reg, unsigned int n); - /* Functions to register get/setsockopt ranges (non-inclusive). You need to check permissions yourself! */ int nf_register_sockopt(struct nf_sockopt_ops *reg); diff --git a/net/netfilter/core.c b/net/netfilter/core.c index 552d606e57ca..368610dbc3c0 100644 --- a/net/netfilter/core.c +++ b/net/netfilter/core.c @@ -227,114 +227,6 @@ void nf_unregister_net_hooks(struct net *net, const struct nf_hook_ops *reg, } EXPORT_SYMBOL(nf_unregister_net_hooks); -static LIST_HEAD(nf_hook_list); - -static int _nf_register_hook(struct nf_hook_ops *reg) -{ - struct net *net, *last; - int ret; - - for_each_net(net) { - ret = nf_register_net_hook(net, reg); - if (ret && ret != -ENOENT) - goto rollback; - } - list_add_tail(®->list, &nf_hook_list); - - return 0; -rollback: - last = net; - for_each_net(net) { - if (net == last) - break; - nf_unregister_net_hook(net, reg); - } - return ret; -} - -int nf_register_hook(struct nf_hook_ops *reg) -{ - int ret; - - rtnl_lock(); - ret = _nf_register_hook(reg); - rtnl_unlock(); - - return ret; -} -EXPORT_SYMBOL(nf_register_hook); - -static void _nf_unregister_hook(struct nf_hook_ops *reg) -{ - struct net *net; - - list_del(®->list); - for_each_net(net) - nf_unregister_net_hook(net, reg); -} - -void nf_unregister_hook(struct nf_hook_ops *reg) -{ - rtnl_lock(); - _nf_unregister_hook(reg); - rtnl_unlock(); -} -EXPORT_SYMBOL(nf_unregister_hook); - -int nf_register_hooks(struct nf_hook_ops *reg, unsigned int n) -{ - unsigned int i; - int err = 0; - - for (i = 0; i < n; i++) { - err = nf_register_hook(®[i]); - if (err) - goto err; - } - return err; - -err: - if (i > 0) - nf_unregister_hooks(reg, i); - return err; -} -EXPORT_SYMBOL(nf_register_hooks); - -/* Caller MUST take rtnl_lock() */ -int _nf_register_hooks(struct nf_hook_ops *reg, unsigned int n) -{ - unsigned int i; - int err = 0; - - for (i = 0; i < n; i++) { - err = _nf_register_hook(®[i]); - if (err) - goto err; - } - return err; - -err: - if (i > 0) - _nf_unregister_hooks(reg, i); - return err; -} -EXPORT_SYMBOL(_nf_register_hooks); - -void nf_unregister_hooks(struct nf_hook_ops *reg, unsigned int n) -{ - while (n-- > 0) - nf_unregister_hook(®[n]); -} -EXPORT_SYMBOL(nf_unregister_hooks); - -/* Caller MUST take rtnl_lock */ -void _nf_unregister_hooks(struct nf_hook_ops *reg, unsigned int n) -{ - while (n-- > 0) - _nf_unregister_hook(®[n]); -} -EXPORT_SYMBOL(_nf_unregister_hooks); - /* Returns 1 if okfn() needs to be executed by the caller, * -EPERM for NF_DROP, 0 otherwise. Caller must hold rcu_read_lock. */ int nf_hook_slow(struct sk_buff *skb, struct nf_hook_state *state, @@ -450,37 +342,6 @@ void (*nf_nat_decode_session_hook)(struct sk_buff *, struct flowi *); EXPORT_SYMBOL(nf_nat_decode_session_hook); #endif -static int nf_register_hook_list(struct net *net) -{ - struct nf_hook_ops *elem; - int ret; - - rtnl_lock(); - list_for_each_entry(elem, &nf_hook_list, list) { - ret = nf_register_net_hook(net, elem); - if (ret && ret != -ENOENT) - goto out_undo; - } - rtnl_unlock(); - return 0; - -out_undo: - list_for_each_entry_continue_reverse(elem, &nf_hook_list, list) - nf_unregister_net_hook(net, elem); - rtnl_unlock(); - return ret; -} - -static void nf_unregister_hook_list(struct net *net) -{ - struct nf_hook_ops *elem; - - rtnl_lock(); - list_for_each_entry(elem, &nf_hook_list, list) - nf_unregister_net_hook(net, elem); - rtnl_unlock(); -} - static int __net_init netfilter_net_init(struct net *net) { int i, h, ret; @@ -500,16 +361,12 @@ static int __net_init netfilter_net_init(struct net *net) return -ENOMEM; } #endif - ret = nf_register_hook_list(net); - if (ret) - remove_proc_entry("netfilter", net->proc_net); return ret; } static void __net_exit netfilter_net_exit(struct net *net) { - nf_unregister_hook_list(net); remove_proc_entry("netfilter", net->proc_net); } -- cgit v1.2.3-71-gd317 From c6325179238f1d4683edbec53d8322575d76d7e2 Mon Sep 17 00:00:00 2001 From: Gleb Fotengauer-Malinovskiy Date: Mon, 17 Jul 2017 16:29:46 +0300 Subject: tty: Fix TIOCGPTPEER ioctl definition This ioctl does nothing to justify an _IOC_READ or _IOC_WRITE flag because it doesn't copy anything from/to userspace to access the argument. Fixes: 54ebbfb16034 ("tty: add TIOCGPTPEER ioctl") Signed-off-by: Gleb Fotengauer-Malinovskiy Acked-by: Aleksa Sarai Acked-by: Arnd Bergmann Signed-off-by: Greg Kroah-Hartman --- arch/alpha/include/uapi/asm/ioctls.h | 2 +- arch/mips/include/uapi/asm/ioctls.h | 2 +- arch/parisc/include/uapi/asm/ioctls.h | 2 +- arch/powerpc/include/uapi/asm/ioctls.h | 2 +- arch/sh/include/uapi/asm/ioctls.h | 2 +- arch/sparc/include/uapi/asm/ioctls.h | 2 +- arch/xtensa/include/uapi/asm/ioctls.h | 2 +- include/uapi/asm-generic/ioctls.h | 2 +- 8 files changed, 8 insertions(+), 8 deletions(-) (limited to 'include') diff --git a/arch/alpha/include/uapi/asm/ioctls.h b/arch/alpha/include/uapi/asm/ioctls.h index ff67b8373bf7..1cd7dc7d4870 100644 --- a/arch/alpha/include/uapi/asm/ioctls.h +++ b/arch/alpha/include/uapi/asm/ioctls.h @@ -100,7 +100,7 @@ #define TIOCGPKT _IOR('T', 0x38, int) /* Get packet mode state */ #define TIOCGPTLCK _IOR('T', 0x39, int) /* Get Pty lock state */ #define TIOCGEXCL _IOR('T', 0x40, int) /* Get exclusive mode state */ -#define TIOCGPTPEER _IOR('T', 0x41, int) /* Safely open the slave */ +#define TIOCGPTPEER _IO('T', 0x41) /* Safely open the slave */ #define TIOCSERCONFIG 0x5453 #define TIOCSERGWILD 0x5454 diff --git a/arch/mips/include/uapi/asm/ioctls.h b/arch/mips/include/uapi/asm/ioctls.h index 68e19b689a00..1609cb0907ac 100644 --- a/arch/mips/include/uapi/asm/ioctls.h +++ b/arch/mips/include/uapi/asm/ioctls.h @@ -91,7 +91,7 @@ #define TIOCGPKT _IOR('T', 0x38, int) /* Get packet mode state */ #define TIOCGPTLCK _IOR('T', 0x39, int) /* Get Pty lock state */ #define TIOCGEXCL _IOR('T', 0x40, int) /* Get exclusive mode state */ -#define TIOCGPTPEER _IOR('T', 0x41, int) /* Safely open the slave */ +#define TIOCGPTPEER _IO('T', 0x41) /* Safely open the slave */ /* I hope the range from 0x5480 on is free ... */ #define TIOCSCTTY 0x5480 /* become controlling tty */ diff --git a/arch/parisc/include/uapi/asm/ioctls.h b/arch/parisc/include/uapi/asm/ioctls.h index 674c68a5bbd0..d0e3321403be 100644 --- a/arch/parisc/include/uapi/asm/ioctls.h +++ b/arch/parisc/include/uapi/asm/ioctls.h @@ -60,7 +60,7 @@ #define TIOCGPKT _IOR('T', 0x38, int) /* Get packet mode state */ #define TIOCGPTLCK _IOR('T', 0x39, int) /* Get Pty lock state */ #define TIOCGEXCL _IOR('T', 0x40, int) /* Get exclusive mode state */ -#define TIOCGPTPEER _IOR('T', 0x41, int) /* Safely open the slave */ +#define TIOCGPTPEER _IO('T', 0x41) /* Safely open the slave */ #define FIONCLEX 0x5450 /* these numbers need to be adjusted. */ #define FIOCLEX 0x5451 diff --git a/arch/powerpc/include/uapi/asm/ioctls.h b/arch/powerpc/include/uapi/asm/ioctls.h index bfd609a3e928..e3b10469f787 100644 --- a/arch/powerpc/include/uapi/asm/ioctls.h +++ b/arch/powerpc/include/uapi/asm/ioctls.h @@ -100,7 +100,7 @@ #define TIOCGPKT _IOR('T', 0x38, int) /* Get packet mode state */ #define TIOCGPTLCK _IOR('T', 0x39, int) /* Get Pty lock state */ #define TIOCGEXCL _IOR('T', 0x40, int) /* Get exclusive mode state */ -#define TIOCGPTPEER _IOR('T', 0x41, int) /* Safely open the slave */ +#define TIOCGPTPEER _IO('T', 0x41) /* Safely open the slave */ #define TIOCSERCONFIG 0x5453 #define TIOCSERGWILD 0x5454 diff --git a/arch/sh/include/uapi/asm/ioctls.h b/arch/sh/include/uapi/asm/ioctls.h index eec7901e9e65..787bac9f67da 100644 --- a/arch/sh/include/uapi/asm/ioctls.h +++ b/arch/sh/include/uapi/asm/ioctls.h @@ -93,7 +93,7 @@ #define TIOCGPKT _IOR('T', 0x38, int) /* Get packet mode state */ #define TIOCGPTLCK _IOR('T', 0x39, int) /* Get Pty lock state */ #define TIOCGEXCL _IOR('T', 0x40, int) /* Get exclusive mode state */ -#define TIOCGPTPEER _IOR('T', 0x41, int) /* Safely open the slave */ +#define TIOCGPTPEER _IO('T', 0x41) /* Safely open the slave */ #define TIOCSERCONFIG _IO('T', 83) /* 0x5453 */ #define TIOCSERGWILD _IOR('T', 84, int) /* 0x5454 */ diff --git a/arch/sparc/include/uapi/asm/ioctls.h b/arch/sparc/include/uapi/asm/ioctls.h index 6d27398632ea..f5df72b93bb2 100644 --- a/arch/sparc/include/uapi/asm/ioctls.h +++ b/arch/sparc/include/uapi/asm/ioctls.h @@ -88,7 +88,7 @@ #define TIOCGPTN _IOR('t', 134, unsigned int) /* Get Pty Number */ #define TIOCSPTLCK _IOW('t', 135, int) /* Lock/unlock PTY */ #define TIOCSIG _IOW('t', 136, int) /* Generate signal on Pty slave */ -#define TIOCGPTPEER _IOR('t', 137, int) /* Safely open the slave */ +#define TIOCGPTPEER _IO('t', 137) /* Safely open the slave */ /* Little f */ #define FIOCLEX _IO('f', 1) diff --git a/arch/xtensa/include/uapi/asm/ioctls.h b/arch/xtensa/include/uapi/asm/ioctls.h index 98b004e24e85..47d82c09be7b 100644 --- a/arch/xtensa/include/uapi/asm/ioctls.h +++ b/arch/xtensa/include/uapi/asm/ioctls.h @@ -105,7 +105,7 @@ #define TIOCGPKT _IOR('T', 0x38, int) /* Get packet mode state */ #define TIOCGPTLCK _IOR('T', 0x39, int) /* Get Pty lock state */ #define TIOCGEXCL _IOR('T', 0x40, int) /* Get exclusive mode state */ -#define TIOCGPTPEER _IOR('T', 0x41, int) /* Safely open the slave */ +#define TIOCGPTPEER _IO('T', 0x41) /* Safely open the slave */ #define TIOCSERCONFIG _IO('T', 83) #define TIOCSERGWILD _IOR('T', 84, int) diff --git a/include/uapi/asm-generic/ioctls.h b/include/uapi/asm-generic/ioctls.h index 06d5f7ddf84e..14baf9f23a14 100644 --- a/include/uapi/asm-generic/ioctls.h +++ b/include/uapi/asm-generic/ioctls.h @@ -77,7 +77,7 @@ #define TIOCGPKT _IOR('T', 0x38, int) /* Get packet mode state */ #define TIOCGPTLCK _IOR('T', 0x39, int) /* Get Pty lock state */ #define TIOCGEXCL _IOR('T', 0x40, int) /* Get exclusive mode state */ -#define TIOCGPTPEER _IOR('T', 0x41, int) /* Safely open the slave */ +#define TIOCGPTPEER _IO('T', 0x41) /* Safely open the slave */ #define FIONCLEX 0x5450 #define FIOCLEX 0x5451 -- cgit v1.2.3-71-gd317 From 13c401f33e19c20431d9888a91d9ea82e5133bd9 Mon Sep 17 00:00:00 2001 From: Jakub Kicinski Date: Fri, 14 Jul 2017 23:03:49 -0700 Subject: jhash: fix -Wimplicit-fallthrough warnings GCC 7 added a new -Wimplicit-fallthrough warning. It's only enabled with W=1, but since linux/jhash.h is included in over hundred places (including other global headers) it seems worthwhile fixing this warning. Signed-off-by: Jakub Kicinski Signed-off-by: David S. Miller --- include/linux/jhash.h | 29 ++++++++++++++--------------- 1 file changed, 14 insertions(+), 15 deletions(-) (limited to 'include') diff --git a/include/linux/jhash.h b/include/linux/jhash.h index 348c6f47e4cc..8037850f3104 100644 --- a/include/linux/jhash.h +++ b/include/linux/jhash.h @@ -85,19 +85,18 @@ static inline u32 jhash(const void *key, u32 length, u32 initval) k += 12; } /* Last block: affect all 32 bits of (c) */ - /* All the case statements fall through */ switch (length) { - case 12: c += (u32)k[11]<<24; - case 11: c += (u32)k[10]<<16; - case 10: c += (u32)k[9]<<8; - case 9: c += k[8]; - case 8: b += (u32)k[7]<<24; - case 7: b += (u32)k[6]<<16; - case 6: b += (u32)k[5]<<8; - case 5: b += k[4]; - case 4: a += (u32)k[3]<<24; - case 3: a += (u32)k[2]<<16; - case 2: a += (u32)k[1]<<8; + case 12: c += (u32)k[11]<<24; /* fall through */ + case 11: c += (u32)k[10]<<16; /* fall through */ + case 10: c += (u32)k[9]<<8; /* fall through */ + case 9: c += k[8]; /* fall through */ + case 8: b += (u32)k[7]<<24; /* fall through */ + case 7: b += (u32)k[6]<<16; /* fall through */ + case 6: b += (u32)k[5]<<8; /* fall through */ + case 5: b += k[4]; /* fall through */ + case 4: a += (u32)k[3]<<24; /* fall through */ + case 3: a += (u32)k[2]<<16; /* fall through */ + case 2: a += (u32)k[1]<<8; /* fall through */ case 1: a += k[0]; __jhash_final(a, b, c); case 0: /* Nothing left to add */ @@ -131,10 +130,10 @@ static inline u32 jhash2(const u32 *k, u32 length, u32 initval) k += 3; } - /* Handle the last 3 u32's: all the case statements fall through */ + /* Handle the last 3 u32's */ switch (length) { - case 3: c += k[2]; - case 2: b += k[1]; + case 3: c += k[2]; /* fall through */ + case 2: b += k[1]; /* fall through */ case 1: a += k[0]; __jhash_final(a, b, c); case 0: /* Nothing left to add */ -- cgit v1.2.3-71-gd317 From df39a9f106d53532443a804352894480ca6ca5fd Mon Sep 17 00:00:00 2001 From: WANG Cong Date: Mon, 17 Jul 2017 11:42:55 -0700 Subject: bpf: check NULL for sk_to_full_sk() return value When req->rsk_listener is NULL, sk_to_full_sk() returns NULL too, so we have to check its return value against NULL here. Fixes: 40304b2a1567 ("bpf: BPF support for sock_ops") Reported-by: David Ahern Tested-by: David Ahern Cc: Lawrence Brakmo Cc: Daniel Borkmann Signed-off-by: Cong Wang Acked-by: Daniel Borkmann Signed-off-by: David S. Miller --- include/linux/bpf-cgroup.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'include') diff --git a/include/linux/bpf-cgroup.h b/include/linux/bpf-cgroup.h index 360c082e885c..d41d40ac3efd 100644 --- a/include/linux/bpf-cgroup.h +++ b/include/linux/bpf-cgroup.h @@ -85,7 +85,7 @@ int __cgroup_bpf_run_filter_sock_ops(struct sock *sk, int __ret = 0; \ if (cgroup_bpf_enabled && (sock_ops)->sk) { \ typeof(sk) __sk = sk_to_full_sk((sock_ops)->sk); \ - if (sk_fullsock(__sk)) \ + if (__sk && sk_fullsock(__sk)) \ __ret = __cgroup_bpf_run_filter_sock_ops(__sk, \ sock_ops, \ BPF_CGROUP_SOCK_OPS); \ -- cgit v1.2.3-71-gd317 From a512c2fbef9c700ee1ee0e045b75e140fef8f5ee Mon Sep 17 00:00:00 2001 From: Parav Pandit Date: Tue, 23 May 2017 11:26:08 +0300 Subject: IB/core: Introduce modify QP operation with udata This patch adds new function ib_modify_qp_with_udata so that uverbs layer can avoid handling L2 mac address at verbs layer and depend on the core layer to resolve the mac address consistently for all required QPs. Signed-off-by: Parav Pandit Reviewed-by: Eli Cohen Reviewed-by: Daniel Jurgens Signed-off-by: Leon Romanovsky Signed-off-by: Doug Ledford --- drivers/infiniband/core/verbs.c | 32 ++++++++++++++++++++++++-------- include/rdma/ib_verbs.h | 16 ++++++++++++++++ 2 files changed, 40 insertions(+), 8 deletions(-) (limited to 'include') diff --git a/drivers/infiniband/core/verbs.c b/drivers/infiniband/core/verbs.c index 644fa0d13f02..7f8fe443df46 100644 --- a/drivers/infiniband/core/verbs.c +++ b/drivers/infiniband/core/verbs.c @@ -1276,20 +1276,36 @@ out: } EXPORT_SYMBOL(ib_resolve_eth_dmac); -int ib_modify_qp(struct ib_qp *qp, - struct ib_qp_attr *qp_attr, - int qp_attr_mask) +/** + * ib_modify_qp_with_udata - Modifies the attributes for the specified QP. + * @qp: The QP to modify. + * @attr: On input, specifies the QP attributes to modify. On output, + * the current values of selected QP attributes are returned. + * @attr_mask: A bit-mask used to specify which attributes of the QP + * are being modified. + * @udata: pointer to user's input output buffer information + * are being modified. + * It returns 0 on success and returns appropriate error code on error. + */ +int ib_modify_qp_with_udata(struct ib_qp *qp, struct ib_qp_attr *attr, + int attr_mask, struct ib_udata *udata) { + int ret; - if (qp_attr_mask & IB_QP_AV) { - int ret; - - ret = ib_resolve_eth_dmac(qp->device, &qp_attr->ah_attr); + if (attr_mask & IB_QP_AV) { + ret = ib_resolve_eth_dmac(qp->device, &attr->ah_attr); if (ret) return ret; } + return ib_security_modify_qp(qp, attr, attr_mask, udata); +} +EXPORT_SYMBOL(ib_modify_qp_with_udata); - return ib_security_modify_qp(qp->real_qp, qp_attr, qp_attr_mask, NULL); +int ib_modify_qp(struct ib_qp *qp, + struct ib_qp_attr *qp_attr, + int qp_attr_mask) +{ + return ib_modify_qp_with_udata(qp, qp_attr, qp_attr_mask, NULL); } EXPORT_SYMBOL(ib_modify_qp); diff --git a/include/rdma/ib_verbs.h b/include/rdma/ib_verbs.h index 356953d3dbd1..6e62c9e2c971 100644 --- a/include/rdma/ib_verbs.h +++ b/include/rdma/ib_verbs.h @@ -2947,6 +2947,22 @@ static inline int ib_post_srq_recv(struct ib_srq *srq, struct ib_qp *ib_create_qp(struct ib_pd *pd, struct ib_qp_init_attr *qp_init_attr); +/** + * ib_modify_qp_with_udata - Modifies the attributes for the specified QP. + * @qp: The QP to modify. + * @attr: On input, specifies the QP attributes to modify. On output, + * the current values of selected QP attributes are returned. + * @attr_mask: A bit-mask used to specify which attributes of the QP + * are being modified. + * @udata: pointer to user's input output buffer information + * are being modified. + * It returns 0 on success and returns appropriate error code on error. + */ +int ib_modify_qp_with_udata(struct ib_qp *qp, + struct ib_qp_attr *attr, + int attr_mask, + struct ib_udata *udata); + /** * ib_modify_qp - Modifies the attributes for the specified QP and then * transitions the QP to the given state. -- cgit v1.2.3-71-gd317 From 0f4d027c3b4240ecb314daa948238d459fdc3a00 Mon Sep 17 00:00:00 2001 From: Leon Romanovsky Date: Tue, 23 May 2017 14:38:14 +0300 Subject: IB/{rdmavt, qib, hfi1}: Remove gfp flags argument The caller to the driver marks GFP_NOIO allocations with help of memalloc_noio-* calls now. This makes redundant to pass down to the driver gfp flags, which can be GFP_KERNEL only. The patch removes the gfp flags argument and updates all driver paths. Signed-off-by: Leon Romanovsky Signed-off-by: Leon Romanovsky Acked-by: Dennis Dalessandro Signed-off-by: Doug Ledford --- drivers/infiniband/hw/hfi1/qp.c | 7 +++-- drivers/infiniband/hw/hfi1/qp.h | 3 +-- drivers/infiniband/hw/qib/qib_qp.c | 15 +++++------ drivers/infiniband/hw/qib/qib_verbs.h | 4 +-- drivers/infiniband/sw/rdmavt/qp.c | 48 ++++++++++------------------------- include/rdma/rdma_vt.h | 5 ++-- 6 files changed, 28 insertions(+), 54 deletions(-) (limited to 'include') diff --git a/drivers/infiniband/hw/hfi1/qp.c b/drivers/infiniband/hw/hfi1/qp.c index 650305cc0373..1a7af9f60c13 100644 --- a/drivers/infiniband/hw/hfi1/qp.c +++ b/drivers/infiniband/hw/hfi1/qp.c @@ -647,18 +647,17 @@ void qp_iter_print(struct seq_file *s, struct qp_iter *iter) qp->pid); } -void *qp_priv_alloc(struct rvt_dev_info *rdi, struct rvt_qp *qp, - gfp_t gfp) +void *qp_priv_alloc(struct rvt_dev_info *rdi, struct rvt_qp *qp) { struct hfi1_qp_priv *priv; - priv = kzalloc_node(sizeof(*priv), gfp, rdi->dparms.node); + priv = kzalloc_node(sizeof(*priv), GFP_KERNEL, rdi->dparms.node); if (!priv) return ERR_PTR(-ENOMEM); priv->owner = qp; - priv->s_ahg = kzalloc_node(sizeof(*priv->s_ahg), gfp, + priv->s_ahg = kzalloc_node(sizeof(*priv->s_ahg), GFP_KERNEL, rdi->dparms.node); if (!priv->s_ahg) { kfree(priv); diff --git a/drivers/infiniband/hw/hfi1/qp.h b/drivers/infiniband/hw/hfi1/qp.h index 1eb9cd7b8c19..6fe542b6a927 100644 --- a/drivers/infiniband/hw/hfi1/qp.h +++ b/drivers/infiniband/hw/hfi1/qp.h @@ -123,8 +123,7 @@ void hfi1_migrate_qp(struct rvt_qp *qp); /* * Functions provided by hfi1 driver for rdmavt to use */ -void *qp_priv_alloc(struct rvt_dev_info *rdi, struct rvt_qp *qp, - gfp_t gfp); +void *qp_priv_alloc(struct rvt_dev_info *rdi, struct rvt_qp *qp); void qp_priv_free(struct rvt_dev_info *rdi, struct rvt_qp *qp); unsigned free_all_qps(struct rvt_dev_info *rdi); void notify_qp_reset(struct rvt_qp *qp); diff --git a/drivers/infiniband/hw/qib/qib_qp.c b/drivers/infiniband/hw/qib/qib_qp.c index 5984981e7dd4..a343e3b5d4cb 100644 --- a/drivers/infiniband/hw/qib/qib_qp.c +++ b/drivers/infiniband/hw/qib/qib_qp.c @@ -104,10 +104,9 @@ const struct rvt_operation_params qib_post_parms[RVT_OPERATION_MAX] = { }; -static void get_map_page(struct rvt_qpn_table *qpt, struct rvt_qpn_map *map, - gfp_t gfp) +static void get_map_page(struct rvt_qpn_table *qpt, struct rvt_qpn_map *map) { - unsigned long page = get_zeroed_page(gfp); + unsigned long page = get_zeroed_page(GFP_KERNEL); /* * Free the page if someone raced with us installing it. @@ -126,7 +125,7 @@ static void get_map_page(struct rvt_qpn_table *qpt, struct rvt_qpn_map *map, * zero/one for QP type IB_QPT_SMI/IB_QPT_GSI. */ int qib_alloc_qpn(struct rvt_dev_info *rdi, struct rvt_qpn_table *qpt, - enum ib_qp_type type, u8 port, gfp_t gfp) + enum ib_qp_type type, u8 port) { u32 i, offset, max_scan, qpn; struct rvt_qpn_map *map; @@ -160,7 +159,7 @@ int qib_alloc_qpn(struct rvt_dev_info *rdi, struct rvt_qpn_table *qpt, max_scan = qpt->nmaps - !offset; for (i = 0;;) { if (unlikely(!map->page)) { - get_map_page(qpt, map, gfp); + get_map_page(qpt, map); if (unlikely(!map->page)) break; } @@ -317,16 +316,16 @@ u32 qib_mtu_from_qp(struct rvt_dev_info *rdi, struct rvt_qp *qp, u32 pmtu) return ib_mtu_enum_to_int(pmtu); } -void *qib_qp_priv_alloc(struct rvt_dev_info *rdi, struct rvt_qp *qp, gfp_t gfp) +void *qib_qp_priv_alloc(struct rvt_dev_info *rdi, struct rvt_qp *qp) { struct qib_qp_priv *priv; - priv = kzalloc(sizeof(*priv), gfp); + priv = kzalloc(sizeof(*priv), GFP_KERNEL); if (!priv) return ERR_PTR(-ENOMEM); priv->owner = qp; - priv->s_hdr = kzalloc(sizeof(*priv->s_hdr), gfp); + priv->s_hdr = kzalloc(sizeof(*priv->s_hdr), GFP_KERNEL); if (!priv->s_hdr) { kfree(priv); return ERR_PTR(-ENOMEM); diff --git a/drivers/infiniband/hw/qib/qib_verbs.h b/drivers/infiniband/hw/qib/qib_verbs.h index da0db5485ddc..a52fc67b40d7 100644 --- a/drivers/infiniband/hw/qib/qib_verbs.h +++ b/drivers/infiniband/hw/qib/qib_verbs.h @@ -274,11 +274,11 @@ int qib_get_counters(struct qib_pportdata *ppd, * Functions provided by qib driver for rdmavt to use */ unsigned qib_free_all_qps(struct rvt_dev_info *rdi); -void *qib_qp_priv_alloc(struct rvt_dev_info *rdi, struct rvt_qp *qp, gfp_t gfp); +void *qib_qp_priv_alloc(struct rvt_dev_info *rdi, struct rvt_qp *qp); void qib_qp_priv_free(struct rvt_dev_info *rdi, struct rvt_qp *qp); void qib_notify_qp_reset(struct rvt_qp *qp); int qib_alloc_qpn(struct rvt_dev_info *rdi, struct rvt_qpn_table *qpt, - enum ib_qp_type type, u8 port, gfp_t gfp); + enum ib_qp_type type, u8 port); void qib_restart_rc(struct rvt_qp *qp, u32 psn, int wait); #ifdef CONFIG_DEBUG_FS diff --git a/drivers/infiniband/sw/rdmavt/qp.c b/drivers/infiniband/sw/rdmavt/qp.c index 727e81cc2c8f..459865439a0b 100644 --- a/drivers/infiniband/sw/rdmavt/qp.c +++ b/drivers/infiniband/sw/rdmavt/qp.c @@ -118,10 +118,9 @@ const int ib_rvt_state_ops[IB_QPS_ERR + 1] = { EXPORT_SYMBOL(ib_rvt_state_ops); static void get_map_page(struct rvt_qpn_table *qpt, - struct rvt_qpn_map *map, - gfp_t gfp) + struct rvt_qpn_map *map) { - unsigned long page = get_zeroed_page(gfp); + unsigned long page = get_zeroed_page(GFP_KERNEL); /* * Free the page if someone raced with us installing it. @@ -173,7 +172,7 @@ static int init_qpn_table(struct rvt_dev_info *rdi, struct rvt_qpn_table *qpt) rdi->dparms.qpn_res_start, rdi->dparms.qpn_res_end); for (i = rdi->dparms.qpn_res_start; i <= rdi->dparms.qpn_res_end; i++) { if (!map->page) { - get_map_page(qpt, map, GFP_KERNEL); + get_map_page(qpt, map); if (!map->page) { ret = -ENOMEM; break; @@ -342,14 +341,14 @@ static inline unsigned mk_qpn(struct rvt_qpn_table *qpt, * Return: The queue pair number */ static int alloc_qpn(struct rvt_dev_info *rdi, struct rvt_qpn_table *qpt, - enum ib_qp_type type, u8 port_num, gfp_t gfp) + enum ib_qp_type type, u8 port_num) { u32 i, offset, max_scan, qpn; struct rvt_qpn_map *map; u32 ret; if (rdi->driver_f.alloc_qpn) - return rdi->driver_f.alloc_qpn(rdi, qpt, type, port_num, gfp); + return rdi->driver_f.alloc_qpn(rdi, qpt, type, port_num); if (type == IB_QPT_SMI || type == IB_QPT_GSI) { unsigned n; @@ -374,7 +373,7 @@ static int alloc_qpn(struct rvt_dev_info *rdi, struct rvt_qpn_table *qpt, max_scan = qpt->nmaps - !offset; for (i = 0;;) { if (unlikely(!map->page)) { - get_map_page(qpt, map, gfp); + get_map_page(qpt, map); if (unlikely(!map->page)) break; } @@ -672,7 +671,6 @@ struct ib_qp *rvt_create_qp(struct ib_pd *ibpd, struct ib_qp *ret = ERR_PTR(-ENOMEM); struct rvt_dev_info *rdi = ib_to_rvt(ibpd->device); void *priv = NULL; - gfp_t gfp; size_t sqsize; if (!rdi) @@ -680,18 +678,9 @@ struct ib_qp *rvt_create_qp(struct ib_pd *ibpd, if (init_attr->cap.max_send_sge > rdi->dparms.props.max_sge || init_attr->cap.max_send_wr > rdi->dparms.props.max_qp_wr || - init_attr->create_flags & ~(IB_QP_CREATE_USE_GFP_NOIO)) + init_attr->create_flags) return ERR_PTR(-EINVAL); - /* GFP_NOIO is applicable to RC QP's only */ - - if (init_attr->create_flags & IB_QP_CREATE_USE_GFP_NOIO && - init_attr->qp_type != IB_QPT_RC) - return ERR_PTR(-EINVAL); - - gfp = init_attr->create_flags & IB_QP_CREATE_USE_GFP_NOIO ? - GFP_NOIO : GFP_KERNEL; - /* Check receive queue parameters if no SRQ is specified. */ if (!init_attr->srq) { if (init_attr->cap.max_recv_sge > rdi->dparms.props.max_sge || @@ -719,14 +708,7 @@ struct ib_qp *rvt_create_qp(struct ib_pd *ibpd, sz = sizeof(struct rvt_sge) * init_attr->cap.max_send_sge + sizeof(struct rvt_swqe); - if (gfp == GFP_NOIO) - swq = __vmalloc( - sqsize * sz, - gfp | __GFP_ZERO, PAGE_KERNEL); - else - swq = vzalloc_node( - sqsize * sz, - rdi->dparms.node); + swq = vzalloc_node(sqsize * sz, rdi->dparms.node); if (!swq) return ERR_PTR(-ENOMEM); @@ -741,7 +723,8 @@ struct ib_qp *rvt_create_qp(struct ib_pd *ibpd, } else if (init_attr->cap.max_recv_sge > 1) sg_list_sz = sizeof(*qp->r_sg_list) * (init_attr->cap.max_recv_sge - 1); - qp = kzalloc_node(sz + sg_list_sz, gfp, rdi->dparms.node); + qp = kzalloc_node(sz + sg_list_sz, GFP_KERNEL, + rdi->dparms.node); if (!qp) goto bail_swq; @@ -751,7 +734,7 @@ struct ib_qp *rvt_create_qp(struct ib_pd *ibpd, kzalloc_node( sizeof(*qp->s_ack_queue) * rvt_max_atomic(rdi), - gfp, + GFP_KERNEL, rdi->dparms.node); if (!qp->s_ack_queue) goto bail_qp; @@ -766,7 +749,7 @@ struct ib_qp *rvt_create_qp(struct ib_pd *ibpd, * Driver needs to set up it's private QP structure and do any * initialization that is needed. */ - priv = rdi->driver_f.qp_priv_alloc(rdi, qp, gfp); + priv = rdi->driver_f.qp_priv_alloc(rdi, qp); if (IS_ERR(priv)) { ret = priv; goto bail_qp; @@ -786,11 +769,6 @@ struct ib_qp *rvt_create_qp(struct ib_pd *ibpd, qp->r_rq.wq = vmalloc_user( sizeof(struct rvt_rwq) + qp->r_rq.size * sz); - else if (gfp == GFP_NOIO) - qp->r_rq.wq = __vmalloc( - sizeof(struct rvt_rwq) + - qp->r_rq.size * sz, - gfp | __GFP_ZERO, PAGE_KERNEL); else qp->r_rq.wq = vzalloc_node( sizeof(struct rvt_rwq) + @@ -824,7 +802,7 @@ struct ib_qp *rvt_create_qp(struct ib_pd *ibpd, err = alloc_qpn(rdi, &rdi->qp_dev->qpn_table, init_attr->qp_type, - init_attr->port_num, gfp); + init_attr->port_num); if (err < 0) { ret = ERR_PTR(err); goto bail_rq_wq; diff --git a/include/rdma/rdma_vt.h b/include/rdma/rdma_vt.h index 4878aaf7bdff..55af69271053 100644 --- a/include/rdma/rdma_vt.h +++ b/include/rdma/rdma_vt.h @@ -229,8 +229,7 @@ struct rvt_driver_provided { * ERR_PTR(err). The driver is free to return NULL or a valid * pointer. */ - void * (*qp_priv_alloc)(struct rvt_dev_info *rdi, struct rvt_qp *qp, - gfp_t gfp); + void * (*qp_priv_alloc)(struct rvt_dev_info *rdi, struct rvt_qp *qp); /* * Free the driver's private qp structure. @@ -319,7 +318,7 @@ struct rvt_driver_provided { /* Let the driver pick the next queue pair number*/ int (*alloc_qpn)(struct rvt_dev_info *rdi, struct rvt_qpn_table *qpt, - enum ib_qp_type type, u8 port_num, gfp_t gfp); + enum ib_qp_type type, u8 port_num); /* Determine if its safe or allowed to modify the qp */ int (*check_modify_qp)(struct rvt_qp *qp, struct ib_qp_attr *attr, -- cgit v1.2.3-71-gd317 From 8900b894e769dd88b53e519e3502e0e3c349fe95 Mon Sep 17 00:00:00 2001 From: Leon Romanovsky Date: Tue, 23 May 2017 14:38:15 +0300 Subject: {net, IB}/mlx4: Remove gfp flags argument The caller to the driver marks GFP_NOIO allocations with help of memalloc_noio-* calls now. This makes redundant to pass down to the driver gfp flags, which can be GFP_KERNEL only. The patch removes the gfp flags argument and updates all driver paths. Signed-off-by: Leon Romanovsky Signed-off-by: Leon Romanovsky Signed-off-by: Doug Ledford --- drivers/infiniband/hw/mlx4/cq.c | 6 ++-- drivers/infiniband/hw/mlx4/mlx4_ib.h | 1 - drivers/infiniband/hw/mlx4/qp.c | 40 +++++++++------------- drivers/infiniband/hw/mlx4/srq.c | 8 ++--- drivers/net/ethernet/mellanox/mlx4/alloc.c | 29 ++++++++-------- drivers/net/ethernet/mellanox/mlx4/cq.c | 4 +-- drivers/net/ethernet/mellanox/mlx4/en_rx.c | 7 ++-- drivers/net/ethernet/mellanox/mlx4/en_tx.c | 2 +- drivers/net/ethernet/mellanox/mlx4/icm.c | 7 ++-- drivers/net/ethernet/mellanox/mlx4/icm.h | 3 +- drivers/net/ethernet/mellanox/mlx4/mlx4.h | 4 +-- drivers/net/ethernet/mellanox/mlx4/mr.c | 17 +++++---- drivers/net/ethernet/mellanox/mlx4/qp.c | 20 +++++------ .../net/ethernet/mellanox/mlx4/resource_tracker.c | 4 +-- drivers/net/ethernet/mellanox/mlx4/srq.c | 4 +-- include/linux/mlx4/device.h | 10 +++--- 16 files changed, 76 insertions(+), 90 deletions(-) (limited to 'include') diff --git a/drivers/infiniband/hw/mlx4/cq.c b/drivers/infiniband/hw/mlx4/cq.c index 4f5a143fc0a7..ff931c580557 100644 --- a/drivers/infiniband/hw/mlx4/cq.c +++ b/drivers/infiniband/hw/mlx4/cq.c @@ -102,7 +102,7 @@ static int mlx4_ib_alloc_cq_buf(struct mlx4_ib_dev *dev, struct mlx4_ib_cq_buf * int err; err = mlx4_buf_alloc(dev->dev, nent * dev->dev->caps.cqe_size, - PAGE_SIZE * 2, &buf->buf, GFP_KERNEL); + PAGE_SIZE * 2, &buf->buf); if (err) goto out; @@ -113,7 +113,7 @@ static int mlx4_ib_alloc_cq_buf(struct mlx4_ib_dev *dev, struct mlx4_ib_cq_buf * if (err) goto err_buf; - err = mlx4_buf_write_mtt(dev->dev, &buf->mtt, &buf->buf, GFP_KERNEL); + err = mlx4_buf_write_mtt(dev->dev, &buf->mtt, &buf->buf); if (err) goto err_mtt; @@ -219,7 +219,7 @@ struct ib_cq *mlx4_ib_create_cq(struct ib_device *ibdev, uar = &to_mucontext(context)->uar; } else { - err = mlx4_db_alloc(dev->dev, &cq->db, 1, GFP_KERNEL); + err = mlx4_db_alloc(dev->dev, &cq->db, 1); if (err) goto err_cq; diff --git a/drivers/infiniband/hw/mlx4/mlx4_ib.h b/drivers/infiniband/hw/mlx4/mlx4_ib.h index c2b9cbf4da05..9db82e67e959 100644 --- a/drivers/infiniband/hw/mlx4/mlx4_ib.h +++ b/drivers/infiniband/hw/mlx4/mlx4_ib.h @@ -185,7 +185,6 @@ enum mlx4_ib_qp_flags { MLX4_IB_QP_LSO = IB_QP_CREATE_IPOIB_UD_LSO, MLX4_IB_QP_BLOCK_MULTICAST_LOOPBACK = IB_QP_CREATE_BLOCK_MULTICAST_LOOPBACK, MLX4_IB_QP_NETIF = IB_QP_CREATE_NETIF_QP, - MLX4_IB_QP_CREATE_USE_GFP_NOIO = IB_QP_CREATE_USE_GFP_NOIO, /* Mellanox specific flags start from IB_QP_CREATE_RESERVED_START */ MLX4_IB_ROCE_V2_GSI_QP = MLX4_IB_QP_CREATE_ROCE_V2_GSI, diff --git a/drivers/infiniband/hw/mlx4/qp.c b/drivers/infiniband/hw/mlx4/qp.c index 996e9058e515..75c0e6c5dd56 100644 --- a/drivers/infiniband/hw/mlx4/qp.c +++ b/drivers/infiniband/hw/mlx4/qp.c @@ -634,8 +634,8 @@ static void mlx4_ib_free_qp_counter(struct mlx4_ib_dev *dev, static int create_qp_common(struct mlx4_ib_dev *dev, struct ib_pd *pd, struct ib_qp_init_attr *init_attr, - struct ib_udata *udata, int sqpn, struct mlx4_ib_qp **caller_qp, - gfp_t gfp) + struct ib_udata *udata, int sqpn, + struct mlx4_ib_qp **caller_qp) { int qpn; int err; @@ -691,14 +691,14 @@ static int create_qp_common(struct mlx4_ib_dev *dev, struct ib_pd *pd, if (qp_type == MLX4_IB_QPT_SMI || qp_type == MLX4_IB_QPT_GSI || (qp_type & (MLX4_IB_QPT_PROXY_SMI | MLX4_IB_QPT_PROXY_SMI_OWNER | MLX4_IB_QPT_PROXY_GSI | MLX4_IB_QPT_TUN_SMI_OWNER))) { - sqp = kzalloc(sizeof (struct mlx4_ib_sqp), gfp); + sqp = kzalloc(sizeof(struct mlx4_ib_sqp), GFP_KERNEL); if (!sqp) return -ENOMEM; qp = &sqp->qp; qp->pri.vid = 0xFFFF; qp->alt.vid = 0xFFFF; } else { - qp = kzalloc(sizeof (struct mlx4_ib_qp), gfp); + qp = kzalloc(sizeof(struct mlx4_ib_qp), GFP_KERNEL); if (!qp) return -ENOMEM; qp->pri.vid = 0xFFFF; @@ -780,7 +780,7 @@ static int create_qp_common(struct mlx4_ib_dev *dev, struct ib_pd *pd, goto err; if (qp_has_rq(init_attr)) { - err = mlx4_db_alloc(dev->dev, &qp->db, 0, gfp); + err = mlx4_db_alloc(dev->dev, &qp->db, 0); if (err) goto err; @@ -788,7 +788,7 @@ static int create_qp_common(struct mlx4_ib_dev *dev, struct ib_pd *pd, } if (mlx4_buf_alloc(dev->dev, qp->buf_size, qp->buf_size, - &qp->buf, gfp)) { + &qp->buf)) { memcpy(&init_attr->cap, &backup_cap, sizeof(backup_cap)); err = set_kernel_sq_size(dev, &init_attr->cap, qp_type, @@ -797,7 +797,7 @@ static int create_qp_common(struct mlx4_ib_dev *dev, struct ib_pd *pd, goto err_db; if (mlx4_buf_alloc(dev->dev, qp->buf_size, - PAGE_SIZE * 2, &qp->buf, gfp)) { + PAGE_SIZE * 2, &qp->buf)) { err = -ENOMEM; goto err_db; } @@ -808,20 +808,20 @@ static int create_qp_common(struct mlx4_ib_dev *dev, struct ib_pd *pd, if (err) goto err_buf; - err = mlx4_buf_write_mtt(dev->dev, &qp->mtt, &qp->buf, gfp); + err = mlx4_buf_write_mtt(dev->dev, &qp->mtt, &qp->buf); if (err) goto err_mtt; qp->sq.wrid = kmalloc_array(qp->sq.wqe_cnt, sizeof(u64), - gfp | __GFP_NOWARN); + GFP_KERNEL | __GFP_NOWARN); if (!qp->sq.wrid) qp->sq.wrid = __vmalloc(qp->sq.wqe_cnt * sizeof(u64), - gfp, PAGE_KERNEL); + GFP_KERNEL, PAGE_KERNEL); qp->rq.wrid = kmalloc_array(qp->rq.wqe_cnt, sizeof(u64), - gfp | __GFP_NOWARN); + GFP_KERNEL | __GFP_NOWARN); if (!qp->rq.wrid) qp->rq.wrid = __vmalloc(qp->rq.wqe_cnt * sizeof(u64), - gfp, PAGE_KERNEL); + GFP_KERNEL, PAGE_KERNEL); if (!qp->sq.wrid || !qp->rq.wrid) { err = -ENOMEM; goto err_wrid; @@ -859,7 +859,7 @@ static int create_qp_common(struct mlx4_ib_dev *dev, struct ib_pd *pd, if (init_attr->create_flags & IB_QP_CREATE_BLOCK_MULTICAST_LOOPBACK) qp->flags |= MLX4_IB_QP_BLOCK_MULTICAST_LOOPBACK; - err = mlx4_qp_alloc(dev->dev, qpn, &qp->mqp, gfp); + err = mlx4_qp_alloc(dev->dev, qpn, &qp->mqp); if (err) goto err_qpn; @@ -1127,10 +1127,7 @@ static struct ib_qp *_mlx4_ib_create_qp(struct ib_pd *pd, int err; int sup_u_create_flags = MLX4_IB_QP_BLOCK_MULTICAST_LOOPBACK; u16 xrcdn = 0; - gfp_t gfp; - gfp = (init_attr->create_flags & MLX4_IB_QP_CREATE_USE_GFP_NOIO) ? - GFP_NOIO : GFP_KERNEL; /* * We only support LSO, vendor flag1, and multicast loopback blocking, * and only for kernel UD QPs. @@ -1140,8 +1137,7 @@ static struct ib_qp *_mlx4_ib_create_qp(struct ib_pd *pd, MLX4_IB_SRIOV_TUNNEL_QP | MLX4_IB_SRIOV_SQP | MLX4_IB_QP_NETIF | - MLX4_IB_QP_CREATE_ROCE_V2_GSI | - MLX4_IB_QP_CREATE_USE_GFP_NOIO)) + MLX4_IB_QP_CREATE_ROCE_V2_GSI)) return ERR_PTR(-EINVAL); if (init_attr->create_flags & IB_QP_CREATE_NETIF_QP) { @@ -1154,7 +1150,6 @@ static struct ib_qp *_mlx4_ib_create_qp(struct ib_pd *pd, return ERR_PTR(-EINVAL); if ((init_attr->create_flags & ~(MLX4_IB_SRIOV_SQP | - MLX4_IB_QP_CREATE_USE_GFP_NOIO | MLX4_IB_QP_CREATE_ROCE_V2_GSI | MLX4_IB_QP_BLOCK_MULTICAST_LOOPBACK) && init_attr->qp_type != IB_QPT_UD) || @@ -1179,7 +1174,7 @@ static struct ib_qp *_mlx4_ib_create_qp(struct ib_pd *pd, case IB_QPT_RC: case IB_QPT_UC: case IB_QPT_RAW_PACKET: - qp = kzalloc(sizeof *qp, gfp); + qp = kzalloc(sizeof(*qp), GFP_KERNEL); if (!qp) return ERR_PTR(-ENOMEM); qp->pri.vid = 0xFFFF; @@ -1188,7 +1183,7 @@ static struct ib_qp *_mlx4_ib_create_qp(struct ib_pd *pd, case IB_QPT_UD: { err = create_qp_common(to_mdev(pd->device), pd, init_attr, - udata, 0, &qp, gfp); + udata, 0, &qp); if (err) { kfree(qp); return ERR_PTR(err); @@ -1217,8 +1212,7 @@ static struct ib_qp *_mlx4_ib_create_qp(struct ib_pd *pd, } err = create_qp_common(to_mdev(pd->device), pd, init_attr, udata, - sqpn, - &qp, gfp); + sqpn, &qp); if (err) return ERR_PTR(err); diff --git a/drivers/infiniband/hw/mlx4/srq.c b/drivers/infiniband/hw/mlx4/srq.c index e32dd58937a8..0facaf5f6d23 100644 --- a/drivers/infiniband/hw/mlx4/srq.c +++ b/drivers/infiniband/hw/mlx4/srq.c @@ -135,14 +135,14 @@ struct ib_srq *mlx4_ib_create_srq(struct ib_pd *pd, if (err) goto err_mtt; } else { - err = mlx4_db_alloc(dev->dev, &srq->db, 0, GFP_KERNEL); + err = mlx4_db_alloc(dev->dev, &srq->db, 0); if (err) goto err_srq; *srq->db.db = 0; - if (mlx4_buf_alloc(dev->dev, buf_size, PAGE_SIZE * 2, &srq->buf, - GFP_KERNEL)) { + if (mlx4_buf_alloc(dev->dev, buf_size, PAGE_SIZE * 2, + &srq->buf)) { err = -ENOMEM; goto err_db; } @@ -167,7 +167,7 @@ struct ib_srq *mlx4_ib_create_srq(struct ib_pd *pd, if (err) goto err_buf; - err = mlx4_buf_write_mtt(dev->dev, &srq->mtt, &srq->buf, GFP_KERNEL); + err = mlx4_buf_write_mtt(dev->dev, &srq->mtt, &srq->buf); if (err) goto err_mtt; diff --git a/drivers/net/ethernet/mellanox/mlx4/alloc.c b/drivers/net/ethernet/mellanox/mlx4/alloc.c index 249a4584401a..d94b3744a5b9 100644 --- a/drivers/net/ethernet/mellanox/mlx4/alloc.c +++ b/drivers/net/ethernet/mellanox/mlx4/alloc.c @@ -578,7 +578,7 @@ out: } static int mlx4_buf_direct_alloc(struct mlx4_dev *dev, int size, - struct mlx4_buf *buf, gfp_t gfp) + struct mlx4_buf *buf) { dma_addr_t t; @@ -587,7 +587,7 @@ static int mlx4_buf_direct_alloc(struct mlx4_dev *dev, int size, buf->page_shift = get_order(size) + PAGE_SHIFT; buf->direct.buf = dma_zalloc_coherent(&dev->persist->pdev->dev, - size, &t, gfp); + size, &t, GFP_KERNEL); if (!buf->direct.buf) return -ENOMEM; @@ -607,10 +607,10 @@ static int mlx4_buf_direct_alloc(struct mlx4_dev *dev, int size, * multiple pages, so we don't require too much contiguous memory. */ int mlx4_buf_alloc(struct mlx4_dev *dev, int size, int max_direct, - struct mlx4_buf *buf, gfp_t gfp) + struct mlx4_buf *buf) { if (size <= max_direct) { - return mlx4_buf_direct_alloc(dev, size, buf, gfp); + return mlx4_buf_direct_alloc(dev, size, buf); } else { dma_addr_t t; int i; @@ -620,14 +620,14 @@ int mlx4_buf_alloc(struct mlx4_dev *dev, int size, int max_direct, buf->npages = buf->nbufs; buf->page_shift = PAGE_SHIFT; buf->page_list = kcalloc(buf->nbufs, sizeof(*buf->page_list), - gfp); + GFP_KERNEL); if (!buf->page_list) return -ENOMEM; for (i = 0; i < buf->nbufs; ++i) { buf->page_list[i].buf = dma_zalloc_coherent(&dev->persist->pdev->dev, - PAGE_SIZE, &t, gfp); + PAGE_SIZE, &t, GFP_KERNEL); if (!buf->page_list[i].buf) goto err_free; @@ -663,12 +663,11 @@ void mlx4_buf_free(struct mlx4_dev *dev, int size, struct mlx4_buf *buf) } EXPORT_SYMBOL_GPL(mlx4_buf_free); -static struct mlx4_db_pgdir *mlx4_alloc_db_pgdir(struct device *dma_device, - gfp_t gfp) +static struct mlx4_db_pgdir *mlx4_alloc_db_pgdir(struct device *dma_device) { struct mlx4_db_pgdir *pgdir; - pgdir = kzalloc(sizeof *pgdir, gfp); + pgdir = kzalloc(sizeof(*pgdir), GFP_KERNEL); if (!pgdir) return NULL; @@ -676,7 +675,7 @@ static struct mlx4_db_pgdir *mlx4_alloc_db_pgdir(struct device *dma_device, pgdir->bits[0] = pgdir->order0; pgdir->bits[1] = pgdir->order1; pgdir->db_page = dma_alloc_coherent(dma_device, PAGE_SIZE, - &pgdir->db_dma, gfp); + &pgdir->db_dma, GFP_KERNEL); if (!pgdir->db_page) { kfree(pgdir); return NULL; @@ -716,7 +715,7 @@ found: return 0; } -int mlx4_db_alloc(struct mlx4_dev *dev, struct mlx4_db *db, int order, gfp_t gfp) +int mlx4_db_alloc(struct mlx4_dev *dev, struct mlx4_db *db, int order) { struct mlx4_priv *priv = mlx4_priv(dev); struct mlx4_db_pgdir *pgdir; @@ -728,7 +727,7 @@ int mlx4_db_alloc(struct mlx4_dev *dev, struct mlx4_db *db, int order, gfp_t gfp if (!mlx4_alloc_db_from_pgdir(pgdir, db, order)) goto out; - pgdir = mlx4_alloc_db_pgdir(&dev->persist->pdev->dev, gfp); + pgdir = mlx4_alloc_db_pgdir(&dev->persist->pdev->dev); if (!pgdir) { ret = -ENOMEM; goto out; @@ -780,13 +779,13 @@ int mlx4_alloc_hwq_res(struct mlx4_dev *dev, struct mlx4_hwq_resources *wqres, { int err; - err = mlx4_db_alloc(dev, &wqres->db, 1, GFP_KERNEL); + err = mlx4_db_alloc(dev, &wqres->db, 1); if (err) return err; *wqres->db.db = 0; - err = mlx4_buf_direct_alloc(dev, size, &wqres->buf, GFP_KERNEL); + err = mlx4_buf_direct_alloc(dev, size, &wqres->buf); if (err) goto err_db; @@ -795,7 +794,7 @@ int mlx4_alloc_hwq_res(struct mlx4_dev *dev, struct mlx4_hwq_resources *wqres, if (err) goto err_buf; - err = mlx4_buf_write_mtt(dev, &wqres->mtt, &wqres->buf, GFP_KERNEL); + err = mlx4_buf_write_mtt(dev, &wqres->mtt, &wqres->buf); if (err) goto err_mtt; diff --git a/drivers/net/ethernet/mellanox/mlx4/cq.c b/drivers/net/ethernet/mellanox/mlx4/cq.c index fa6d2354a0e9..c56a511b918e 100644 --- a/drivers/net/ethernet/mellanox/mlx4/cq.c +++ b/drivers/net/ethernet/mellanox/mlx4/cq.c @@ -224,11 +224,11 @@ int __mlx4_cq_alloc_icm(struct mlx4_dev *dev, int *cqn) if (*cqn == -1) return -ENOMEM; - err = mlx4_table_get(dev, &cq_table->table, *cqn, GFP_KERNEL); + err = mlx4_table_get(dev, &cq_table->table, *cqn); if (err) goto err_out; - err = mlx4_table_get(dev, &cq_table->cmpt_table, *cqn, GFP_KERNEL); + err = mlx4_table_get(dev, &cq_table->cmpt_table, *cqn); if (err) goto err_put; return 0; diff --git a/drivers/net/ethernet/mellanox/mlx4/en_rx.c b/drivers/net/ethernet/mellanox/mlx4/en_rx.c index e5fb89505a13..436f7689a032 100644 --- a/drivers/net/ethernet/mellanox/mlx4/en_rx.c +++ b/drivers/net/ethernet/mellanox/mlx4/en_rx.c @@ -1042,7 +1042,7 @@ static int mlx4_en_config_rss_qp(struct mlx4_en_priv *priv, int qpn, if (!context) return -ENOMEM; - err = mlx4_qp_alloc(mdev->dev, qpn, qp, GFP_KERNEL); + err = mlx4_qp_alloc(mdev->dev, qpn, qp); if (err) { en_err(priv, "Failed to allocate qp #%x\n", qpn); goto out; @@ -1086,7 +1086,7 @@ int mlx4_en_create_drop_qp(struct mlx4_en_priv *priv) en_err(priv, "Failed reserving drop qpn\n"); return err; } - err = mlx4_qp_alloc(priv->mdev->dev, qpn, &priv->drop_qp, GFP_KERNEL); + err = mlx4_qp_alloc(priv->mdev->dev, qpn, &priv->drop_qp); if (err) { en_err(priv, "Failed allocating drop qp\n"); mlx4_qp_release_range(priv->mdev->dev, qpn, 1); @@ -1158,8 +1158,7 @@ int mlx4_en_config_rss_steer(struct mlx4_en_priv *priv) } /* Configure RSS indirection qp */ - err = mlx4_qp_alloc(mdev->dev, priv->base_qpn, rss_map->indir_qp, - GFP_KERNEL); + err = mlx4_qp_alloc(mdev->dev, priv->base_qpn, rss_map->indir_qp); if (err) { en_err(priv, "Failed to allocate RSS indirection QP\n"); goto rss_err; diff --git a/drivers/net/ethernet/mellanox/mlx4/en_tx.c b/drivers/net/ethernet/mellanox/mlx4/en_tx.c index 4f3a9b27ce4a..73faa3d77921 100644 --- a/drivers/net/ethernet/mellanox/mlx4/en_tx.c +++ b/drivers/net/ethernet/mellanox/mlx4/en_tx.c @@ -111,7 +111,7 @@ int mlx4_en_create_tx_ring(struct mlx4_en_priv *priv, goto err_hwq_res; } - err = mlx4_qp_alloc(mdev->dev, ring->qpn, &ring->sp_qp, GFP_KERNEL); + err = mlx4_qp_alloc(mdev->dev, ring->qpn, &ring->sp_qp); if (err) { en_err(priv, "Failed allocating qp %d\n", ring->qpn); goto err_reserve; diff --git a/drivers/net/ethernet/mellanox/mlx4/icm.c b/drivers/net/ethernet/mellanox/mlx4/icm.c index e1f9e7cebf8f..5a7816e7c7b4 100644 --- a/drivers/net/ethernet/mellanox/mlx4/icm.c +++ b/drivers/net/ethernet/mellanox/mlx4/icm.c @@ -251,8 +251,7 @@ int mlx4_UNMAP_ICM_AUX(struct mlx4_dev *dev) MLX4_CMD_TIME_CLASS_B, MLX4_CMD_NATIVE); } -int mlx4_table_get(struct mlx4_dev *dev, struct mlx4_icm_table *table, u32 obj, - gfp_t gfp) +int mlx4_table_get(struct mlx4_dev *dev, struct mlx4_icm_table *table, u32 obj) { u32 i = (obj & (table->num_obj - 1)) / (MLX4_TABLE_CHUNK_SIZE / table->obj_size); @@ -266,7 +265,7 @@ int mlx4_table_get(struct mlx4_dev *dev, struct mlx4_icm_table *table, u32 obj, } table->icm[i] = mlx4_alloc_icm(dev, MLX4_TABLE_CHUNK_SIZE >> PAGE_SHIFT, - (table->lowmem ? gfp : GFP_HIGHUSER) | + (table->lowmem ? GFP_KERNEL : GFP_HIGHUSER) | __GFP_NOWARN, table->coherent); if (!table->icm[i]) { ret = -ENOMEM; @@ -363,7 +362,7 @@ int mlx4_table_get_range(struct mlx4_dev *dev, struct mlx4_icm_table *table, u32 i; for (i = start; i <= end; i += inc) { - err = mlx4_table_get(dev, table, i, GFP_KERNEL); + err = mlx4_table_get(dev, table, i); if (err) goto fail; } diff --git a/drivers/net/ethernet/mellanox/mlx4/icm.h b/drivers/net/ethernet/mellanox/mlx4/icm.h index 0c7364550150..dee67fa39107 100644 --- a/drivers/net/ethernet/mellanox/mlx4/icm.h +++ b/drivers/net/ethernet/mellanox/mlx4/icm.h @@ -71,8 +71,7 @@ struct mlx4_icm *mlx4_alloc_icm(struct mlx4_dev *dev, int npages, gfp_t gfp_mask, int coherent); void mlx4_free_icm(struct mlx4_dev *dev, struct mlx4_icm *icm, int coherent); -int mlx4_table_get(struct mlx4_dev *dev, struct mlx4_icm_table *table, u32 obj, - gfp_t gfp); +int mlx4_table_get(struct mlx4_dev *dev, struct mlx4_icm_table *table, u32 obj); void mlx4_table_put(struct mlx4_dev *dev, struct mlx4_icm_table *table, u32 obj); int mlx4_table_get_range(struct mlx4_dev *dev, struct mlx4_icm_table *table, u32 start, u32 end); diff --git a/drivers/net/ethernet/mellanox/mlx4/mlx4.h b/drivers/net/ethernet/mellanox/mlx4/mlx4.h index 30616cd0140d..706d7f21ac5c 100644 --- a/drivers/net/ethernet/mellanox/mlx4/mlx4.h +++ b/drivers/net/ethernet/mellanox/mlx4/mlx4.h @@ -969,7 +969,7 @@ void mlx4_cleanup_cq_table(struct mlx4_dev *dev); void mlx4_cleanup_qp_table(struct mlx4_dev *dev); void mlx4_cleanup_srq_table(struct mlx4_dev *dev); void mlx4_cleanup_mcg_table(struct mlx4_dev *dev); -int __mlx4_qp_alloc_icm(struct mlx4_dev *dev, int qpn, gfp_t gfp); +int __mlx4_qp_alloc_icm(struct mlx4_dev *dev, int qpn); void __mlx4_qp_free_icm(struct mlx4_dev *dev, int qpn); int __mlx4_cq_alloc_icm(struct mlx4_dev *dev, int *cqn); void __mlx4_cq_free_icm(struct mlx4_dev *dev, int cqn); @@ -977,7 +977,7 @@ int __mlx4_srq_alloc_icm(struct mlx4_dev *dev, int *srqn); void __mlx4_srq_free_icm(struct mlx4_dev *dev, int srqn); int __mlx4_mpt_reserve(struct mlx4_dev *dev); void __mlx4_mpt_release(struct mlx4_dev *dev, u32 index); -int __mlx4_mpt_alloc_icm(struct mlx4_dev *dev, u32 index, gfp_t gfp); +int __mlx4_mpt_alloc_icm(struct mlx4_dev *dev, u32 index); void __mlx4_mpt_free_icm(struct mlx4_dev *dev, u32 index); u32 __mlx4_alloc_mtt_range(struct mlx4_dev *dev, int order); void __mlx4_free_mtt_range(struct mlx4_dev *dev, u32 first_seg, int order); diff --git a/drivers/net/ethernet/mellanox/mlx4/mr.c b/drivers/net/ethernet/mellanox/mlx4/mr.c index ce852ca22a96..24282cd017d3 100644 --- a/drivers/net/ethernet/mellanox/mlx4/mr.c +++ b/drivers/net/ethernet/mellanox/mlx4/mr.c @@ -479,14 +479,14 @@ static void mlx4_mpt_release(struct mlx4_dev *dev, u32 index) __mlx4_mpt_release(dev, index); } -int __mlx4_mpt_alloc_icm(struct mlx4_dev *dev, u32 index, gfp_t gfp) +int __mlx4_mpt_alloc_icm(struct mlx4_dev *dev, u32 index) { struct mlx4_mr_table *mr_table = &mlx4_priv(dev)->mr_table; - return mlx4_table_get(dev, &mr_table->dmpt_table, index, gfp); + return mlx4_table_get(dev, &mr_table->dmpt_table, index); } -static int mlx4_mpt_alloc_icm(struct mlx4_dev *dev, u32 index, gfp_t gfp) +static int mlx4_mpt_alloc_icm(struct mlx4_dev *dev, u32 index) { u64 param = 0; @@ -497,7 +497,7 @@ static int mlx4_mpt_alloc_icm(struct mlx4_dev *dev, u32 index, gfp_t gfp) MLX4_CMD_TIME_CLASS_A, MLX4_CMD_WRAPPED); } - return __mlx4_mpt_alloc_icm(dev, index, gfp); + return __mlx4_mpt_alloc_icm(dev, index); } void __mlx4_mpt_free_icm(struct mlx4_dev *dev, u32 index) @@ -629,7 +629,7 @@ int mlx4_mr_enable(struct mlx4_dev *dev, struct mlx4_mr *mr) struct mlx4_mpt_entry *mpt_entry; int err; - err = mlx4_mpt_alloc_icm(dev, key_to_hw_index(mr->key), GFP_KERNEL); + err = mlx4_mpt_alloc_icm(dev, key_to_hw_index(mr->key)); if (err) return err; @@ -787,14 +787,13 @@ int mlx4_write_mtt(struct mlx4_dev *dev, struct mlx4_mtt *mtt, EXPORT_SYMBOL_GPL(mlx4_write_mtt); int mlx4_buf_write_mtt(struct mlx4_dev *dev, struct mlx4_mtt *mtt, - struct mlx4_buf *buf, gfp_t gfp) + struct mlx4_buf *buf) { u64 *page_list; int err; int i; - page_list = kmalloc(buf->npages * sizeof *page_list, - gfp); + page_list = kcalloc(buf->npages, sizeof(*page_list), GFP_KERNEL); if (!page_list) return -ENOMEM; @@ -841,7 +840,7 @@ int mlx4_mw_enable(struct mlx4_dev *dev, struct mlx4_mw *mw) struct mlx4_mpt_entry *mpt_entry; int err; - err = mlx4_mpt_alloc_icm(dev, key_to_hw_index(mw->key), GFP_KERNEL); + err = mlx4_mpt_alloc_icm(dev, key_to_hw_index(mw->key)); if (err) return err; diff --git a/drivers/net/ethernet/mellanox/mlx4/qp.c b/drivers/net/ethernet/mellanox/mlx4/qp.c index 5a310d313e94..26747212526b 100644 --- a/drivers/net/ethernet/mellanox/mlx4/qp.c +++ b/drivers/net/ethernet/mellanox/mlx4/qp.c @@ -301,29 +301,29 @@ void mlx4_qp_release_range(struct mlx4_dev *dev, int base_qpn, int cnt) } EXPORT_SYMBOL_GPL(mlx4_qp_release_range); -int __mlx4_qp_alloc_icm(struct mlx4_dev *dev, int qpn, gfp_t gfp) +int __mlx4_qp_alloc_icm(struct mlx4_dev *dev, int qpn) { struct mlx4_priv *priv = mlx4_priv(dev); struct mlx4_qp_table *qp_table = &priv->qp_table; int err; - err = mlx4_table_get(dev, &qp_table->qp_table, qpn, gfp); + err = mlx4_table_get(dev, &qp_table->qp_table, qpn); if (err) goto err_out; - err = mlx4_table_get(dev, &qp_table->auxc_table, qpn, gfp); + err = mlx4_table_get(dev, &qp_table->auxc_table, qpn); if (err) goto err_put_qp; - err = mlx4_table_get(dev, &qp_table->altc_table, qpn, gfp); + err = mlx4_table_get(dev, &qp_table->altc_table, qpn); if (err) goto err_put_auxc; - err = mlx4_table_get(dev, &qp_table->rdmarc_table, qpn, gfp); + err = mlx4_table_get(dev, &qp_table->rdmarc_table, qpn); if (err) goto err_put_altc; - err = mlx4_table_get(dev, &qp_table->cmpt_table, qpn, gfp); + err = mlx4_table_get(dev, &qp_table->cmpt_table, qpn); if (err) goto err_put_rdmarc; @@ -345,7 +345,7 @@ err_out: return err; } -static int mlx4_qp_alloc_icm(struct mlx4_dev *dev, int qpn, gfp_t gfp) +static int mlx4_qp_alloc_icm(struct mlx4_dev *dev, int qpn) { u64 param = 0; @@ -355,7 +355,7 @@ static int mlx4_qp_alloc_icm(struct mlx4_dev *dev, int qpn, gfp_t gfp) MLX4_CMD_ALLOC_RES, MLX4_CMD_TIME_CLASS_A, MLX4_CMD_WRAPPED); } - return __mlx4_qp_alloc_icm(dev, qpn, gfp); + return __mlx4_qp_alloc_icm(dev, qpn); } void __mlx4_qp_free_icm(struct mlx4_dev *dev, int qpn) @@ -397,7 +397,7 @@ struct mlx4_qp *mlx4_qp_lookup(struct mlx4_dev *dev, u32 qpn) return qp; } -int mlx4_qp_alloc(struct mlx4_dev *dev, int qpn, struct mlx4_qp *qp, gfp_t gfp) +int mlx4_qp_alloc(struct mlx4_dev *dev, int qpn, struct mlx4_qp *qp) { struct mlx4_priv *priv = mlx4_priv(dev); struct mlx4_qp_table *qp_table = &priv->qp_table; @@ -408,7 +408,7 @@ int mlx4_qp_alloc(struct mlx4_dev *dev, int qpn, struct mlx4_qp *qp, gfp_t gfp) qp->qpn = qpn; - err = mlx4_qp_alloc_icm(dev, qpn, gfp); + err = mlx4_qp_alloc_icm(dev, qpn); if (err) return err; diff --git a/drivers/net/ethernet/mellanox/mlx4/resource_tracker.c b/drivers/net/ethernet/mellanox/mlx4/resource_tracker.c index 812783865205..215e21c3dc8a 100644 --- a/drivers/net/ethernet/mellanox/mlx4/resource_tracker.c +++ b/drivers/net/ethernet/mellanox/mlx4/resource_tracker.c @@ -1822,7 +1822,7 @@ static int qp_alloc_res(struct mlx4_dev *dev, int slave, int op, int cmd, return err; if (!fw_reserved(dev, qpn)) { - err = __mlx4_qp_alloc_icm(dev, qpn, GFP_KERNEL); + err = __mlx4_qp_alloc_icm(dev, qpn); if (err) { res_abort_move(dev, slave, RES_QP, qpn); return err; @@ -1909,7 +1909,7 @@ static int mpt_alloc_res(struct mlx4_dev *dev, int slave, int op, int cmd, if (err) return err; - err = __mlx4_mpt_alloc_icm(dev, mpt->key, GFP_KERNEL); + err = __mlx4_mpt_alloc_icm(dev, mpt->key); if (err) { res_abort_move(dev, slave, RES_MPT, id); return err; diff --git a/drivers/net/ethernet/mellanox/mlx4/srq.c b/drivers/net/ethernet/mellanox/mlx4/srq.c index f44d089e2ca6..bedf52126824 100644 --- a/drivers/net/ethernet/mellanox/mlx4/srq.c +++ b/drivers/net/ethernet/mellanox/mlx4/srq.c @@ -100,11 +100,11 @@ int __mlx4_srq_alloc_icm(struct mlx4_dev *dev, int *srqn) if (*srqn == -1) return -ENOMEM; - err = mlx4_table_get(dev, &srq_table->table, *srqn, GFP_KERNEL); + err = mlx4_table_get(dev, &srq_table->table, *srqn); if (err) goto err_out; - err = mlx4_table_get(dev, &srq_table->cmpt_table, *srqn, GFP_KERNEL); + err = mlx4_table_get(dev, &srq_table->cmpt_table, *srqn); if (err) goto err_put; return 0; diff --git a/include/linux/mlx4/device.h b/include/linux/mlx4/device.h index d5bed0875d30..aad5d81dfb44 100644 --- a/include/linux/mlx4/device.h +++ b/include/linux/mlx4/device.h @@ -1068,7 +1068,7 @@ static inline int mlx4_is_eth(struct mlx4_dev *dev, int port) } int mlx4_buf_alloc(struct mlx4_dev *dev, int size, int max_direct, - struct mlx4_buf *buf, gfp_t gfp); + struct mlx4_buf *buf); void mlx4_buf_free(struct mlx4_dev *dev, int size, struct mlx4_buf *buf); static inline void *mlx4_buf_offset(struct mlx4_buf *buf, int offset) { @@ -1105,10 +1105,9 @@ int mlx4_mw_enable(struct mlx4_dev *dev, struct mlx4_mw *mw); int mlx4_write_mtt(struct mlx4_dev *dev, struct mlx4_mtt *mtt, int start_index, int npages, u64 *page_list); int mlx4_buf_write_mtt(struct mlx4_dev *dev, struct mlx4_mtt *mtt, - struct mlx4_buf *buf, gfp_t gfp); + struct mlx4_buf *buf); -int mlx4_db_alloc(struct mlx4_dev *dev, struct mlx4_db *db, int order, - gfp_t gfp); +int mlx4_db_alloc(struct mlx4_dev *dev, struct mlx4_db *db, int order); void mlx4_db_free(struct mlx4_dev *dev, struct mlx4_db *db); int mlx4_alloc_hwq_res(struct mlx4_dev *dev, struct mlx4_hwq_resources *wqres, @@ -1124,8 +1123,7 @@ int mlx4_qp_reserve_range(struct mlx4_dev *dev, int cnt, int align, int *base, u8 flags); void mlx4_qp_release_range(struct mlx4_dev *dev, int base_qpn, int cnt); -int mlx4_qp_alloc(struct mlx4_dev *dev, int qpn, struct mlx4_qp *qp, - gfp_t gfp); +int mlx4_qp_alloc(struct mlx4_dev *dev, int qpn, struct mlx4_qp *qp); void mlx4_qp_free(struct mlx4_dev *dev, struct mlx4_qp *qp); int mlx4_srq_alloc(struct mlx4_dev *dev, u32 pdn, u32 cqn, u16 xrcdn, -- cgit v1.2.3-71-gd317 From 7855f5842741e5835b9be073079780c444dca898 Mon Sep 17 00:00:00 2001 From: Leon Romanovsky Date: Tue, 23 May 2017 14:38:16 +0300 Subject: IB/core: Remove NOIO QP create flag There are no users for IB_QP_CREATE_USE_GFP_NOIO flag, so let's remove it. Signed-off-by: Leon Romanovsky Signed-off-by: Leon Romanovsky Signed-off-by: Doug Ledford --- include/rdma/ib_verbs.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'include') diff --git a/include/rdma/ib_verbs.h b/include/rdma/ib_verbs.h index 6e62c9e2c971..b5732432bb29 100644 --- a/include/rdma/ib_verbs.h +++ b/include/rdma/ib_verbs.h @@ -1056,7 +1056,7 @@ enum ib_qp_create_flags { IB_QP_CREATE_MANAGED_RECV = 1 << 4, IB_QP_CREATE_NETIF_QP = 1 << 5, IB_QP_CREATE_SIGNATURE_EN = 1 << 6, - IB_QP_CREATE_USE_GFP_NOIO = 1 << 7, + /* FREE = 1 << 7, */ IB_QP_CREATE_SCATTER_FCS = 1 << 8, IB_QP_CREATE_CVLAN_STRIPPING = 1 << 9, /* reserve bits 26-31 for low level drivers' internal use */ -- cgit v1.2.3-71-gd317 From 8bd226f9a7dc18740a916dcba3112f2bfc3ad9e8 Mon Sep 17 00:00:00 2001 From: Ruslan Bilovol Date: Sun, 25 Jun 2017 16:23:45 +0300 Subject: include: usb: audio: specify exact endiannes of descriptors USB spec says that multiple byte fields are stored in little-endian order (see chapter 8.1 of USB2.0 spec and chapter 7.1 of USB3.0 spec), thus mark such fields as LE for UAC1 and UAC2 headers Signed-off-by: Ruslan Bilovol Signed-off-by: Felipe Balbi --- include/linux/usb/audio-v2.h | 14 +++++++------- include/uapi/linux/usb/audio.h | 6 +++--- 2 files changed, 10 insertions(+), 10 deletions(-) (limited to 'include') diff --git a/include/linux/usb/audio-v2.h b/include/linux/usb/audio-v2.h index c5f2158ab00e..fd73bc0e9027 100644 --- a/include/linux/usb/audio-v2.h +++ b/include/linux/usb/audio-v2.h @@ -115,13 +115,13 @@ struct uac2_input_terminal_descriptor { __u8 bDescriptorType; __u8 bDescriptorSubtype; __u8 bTerminalID; - __u16 wTerminalType; + __le16 wTerminalType; __u8 bAssocTerminal; __u8 bCSourceID; __u8 bNrChannels; - __u32 bmChannelConfig; + __le32 bmChannelConfig; __u8 iChannelNames; - __u16 bmControls; + __le16 bmControls; __u8 iTerminal; } __attribute__((packed)); @@ -132,11 +132,11 @@ struct uac2_output_terminal_descriptor { __u8 bDescriptorType; __u8 bDescriptorSubtype; __u8 bTerminalID; - __u16 wTerminalType; + __le16 wTerminalType; __u8 bAssocTerminal; __u8 bSourceID; __u8 bCSourceID; - __u16 bmControls; + __le16 bmControls; __u8 iTerminal; } __attribute__((packed)); @@ -164,9 +164,9 @@ struct uac2_as_header_descriptor { __u8 bTerminalLink; __u8 bmControls; __u8 bFormatType; - __u32 bmFormats; + __le32 bmFormats; __u8 bNrChannels; - __u32 bmChannelConfig; + __le32 bmChannelConfig; __u8 iChannelNames; } __attribute__((packed)); diff --git a/include/uapi/linux/usb/audio.h b/include/uapi/linux/usb/audio.h index d2314be4f0c0..a4680a5bf5dd 100644 --- a/include/uapi/linux/usb/audio.h +++ b/include/uapi/linux/usb/audio.h @@ -333,7 +333,7 @@ struct uac_processing_unit_descriptor { __u8 bDescriptorType; __u8 bDescriptorSubtype; __u8 bUnitID; - __u16 wProcessType; + __le16 wProcessType; __u8 bNrInPins; __u8 baSourceID[]; } __attribute__ ((packed)); @@ -491,8 +491,8 @@ struct uac_format_type_ii_ext_descriptor { __u8 bDescriptorType; __u8 bDescriptorSubtype; __u8 bFormatType; - __u16 wMaxBitRate; - __u16 wSamplesPerFrame; + __le16 wMaxBitRate; + __le16 wSamplesPerFrame; __u8 bHeaderLength; __u8 bSideBandProtocol; } __attribute__((packed)); -- cgit v1.2.3-71-gd317 From beaec533fc2701a28a4d667f67c9f59c6e4e0d13 Mon Sep 17 00:00:00 2001 From: Alexander Potapenko Date: Wed, 19 Jul 2017 20:27:30 +0200 Subject: llist: clang: introduce member_address_is_nonnull() Currently llist_for_each_entry() and llist_for_each_entry_safe() iterate until &pos->member != NULL. But when building the kernel with Clang, the compiler assumes &pos->member cannot be NULL if the member's offset is greater than 0 (which would be equivalent to the object being non-contiguous in memory). Therefore the loop condition is always true, and the loops become infinite. To work around this, introduce the member_address_is_nonnull() macro, which casts object pointer to uintptr_t, thus letting the member pointer to be NULL. Signed-off-by: Alexander Potapenko Tested-by: Sodagudi Prasad Signed-off-by: Linus Torvalds --- include/linux/llist.h | 21 +++++++++++++++++++-- 1 file changed, 19 insertions(+), 2 deletions(-) (limited to 'include') diff --git a/include/linux/llist.h b/include/linux/llist.h index d11738110a7a..1957635e6d5f 100644 --- a/include/linux/llist.h +++ b/include/linux/llist.h @@ -92,6 +92,23 @@ static inline void init_llist_head(struct llist_head *list) #define llist_entry(ptr, type, member) \ container_of(ptr, type, member) +/** + * member_address_is_nonnull - check whether the member address is not NULL + * @ptr: the object pointer (struct type * that contains the llist_node) + * @member: the name of the llist_node within the struct. + * + * This macro is conceptually the same as + * &ptr->member != NULL + * but it works around the fact that compilers can decide that taking a member + * address is never a NULL pointer. + * + * Real objects that start at a high address and have a member at NULL are + * unlikely to exist, but such pointers may be returned e.g. by the + * container_of() macro. + */ +#define member_address_is_nonnull(ptr, member) \ + ((uintptr_t)(ptr) + offsetof(typeof(*(ptr)), member) != 0) + /** * llist_for_each - iterate over some deleted entries of a lock-less list * @pos: the &struct llist_node to use as a loop cursor @@ -145,7 +162,7 @@ static inline void init_llist_head(struct llist_head *list) */ #define llist_for_each_entry(pos, node, member) \ for ((pos) = llist_entry((node), typeof(*(pos)), member); \ - &(pos)->member != NULL; \ + member_address_is_nonnull(pos, member); \ (pos) = llist_entry((pos)->member.next, typeof(*(pos)), member)) /** @@ -167,7 +184,7 @@ static inline void init_llist_head(struct llist_head *list) */ #define llist_for_each_entry_safe(pos, n, node, member) \ for (pos = llist_entry((node), typeof(*pos), member); \ - &pos->member != NULL && \ + member_address_is_nonnull(pos, member) && \ (n = llist_entry(pos->member.next, typeof(*n), member), true); \ pos = n) -- cgit v1.2.3-71-gd317 From f86f418059b94aa01f9342611a272ca60c583e89 Mon Sep 17 00:00:00 2001 From: Chunyan Zhang Date: Wed, 7 Jun 2017 16:12:51 +0800 Subject: trace: fix the errors caused by incompatible type of RCU variables The variables which are processed by RCU functions should be annotated as RCU, otherwise sparse will report the errors like below: "error: incompatible types in comparison expression (different address spaces)" Link: http://lkml.kernel.org/r/1496823171-7758-1-git-send-email-zhang.chunyan@linaro.org Signed-off-by: Chunyan Zhang [ Updated to not be 100% 80 column strict ] Signed-off-by: Steven Rostedt (VMware) --- include/linux/ftrace.h | 6 +++--- include/linux/trace_events.h | 2 +- kernel/trace/ftrace.c | 41 +++++++++++++++++++++++++++-------------- kernel/trace/trace.h | 6 +++--- 4 files changed, 34 insertions(+), 21 deletions(-) (limited to 'include') diff --git a/include/linux/ftrace.h b/include/linux/ftrace.h index 5857390ac35a..6383115e9d2c 100644 --- a/include/linux/ftrace.h +++ b/include/linux/ftrace.h @@ -145,8 +145,8 @@ enum { #ifdef CONFIG_DYNAMIC_FTRACE /* The hash used to know what functions callbacks trace */ struct ftrace_ops_hash { - struct ftrace_hash *notrace_hash; - struct ftrace_hash *filter_hash; + struct ftrace_hash __rcu *notrace_hash; + struct ftrace_hash __rcu *filter_hash; struct mutex regex_lock; }; @@ -168,7 +168,7 @@ static inline void ftrace_free_init_mem(void) { } */ struct ftrace_ops { ftrace_func_t func; - struct ftrace_ops *next; + struct ftrace_ops __rcu *next; unsigned long flags; void *private; ftrace_func_t saved_func; diff --git a/include/linux/trace_events.h b/include/linux/trace_events.h index f73cedfa2e0b..536c80ff7ad9 100644 --- a/include/linux/trace_events.h +++ b/include/linux/trace_events.h @@ -338,7 +338,7 @@ enum { struct trace_event_file { struct list_head list; struct trace_event_call *event_call; - struct event_filter *filter; + struct event_filter __rcu *filter; struct dentry *dir; struct trace_array *tr; struct trace_subsystem_dir *system; diff --git a/kernel/trace/ftrace.c b/kernel/trace/ftrace.c index 53f6b6401cf0..02004ae91860 100644 --- a/kernel/trace/ftrace.c +++ b/kernel/trace/ftrace.c @@ -113,7 +113,7 @@ static int ftrace_disabled __read_mostly; static DEFINE_MUTEX(ftrace_lock); -static struct ftrace_ops *ftrace_ops_list __read_mostly = &ftrace_list_end; +static struct ftrace_ops __rcu *ftrace_ops_list __read_mostly = &ftrace_list_end; ftrace_func_t ftrace_trace_function __read_mostly = ftrace_stub; static struct ftrace_ops global_ops; @@ -169,8 +169,11 @@ int ftrace_nr_registered_ops(void) mutex_lock(&ftrace_lock); - for (ops = ftrace_ops_list; - ops != &ftrace_list_end; ops = ops->next) + for (ops = rcu_dereference_protected(ftrace_ops_list, + lockdep_is_held(&ftrace_lock)); + ops != &ftrace_list_end; + ops = rcu_dereference_protected(ops->next, + lockdep_is_held(&ftrace_lock))) cnt++; mutex_unlock(&ftrace_lock); @@ -275,10 +278,11 @@ static void update_ftrace_function(void) * If there's only one ftrace_ops registered, the ftrace_ops_list * will point to the ops we want. */ - set_function_trace_op = ftrace_ops_list; + set_function_trace_op = rcu_dereference_protected(ftrace_ops_list, + lockdep_is_held(&ftrace_lock)); /* If there's no ftrace_ops registered, just call the stub function */ - if (ftrace_ops_list == &ftrace_list_end) { + if (set_function_trace_op == &ftrace_list_end) { func = ftrace_stub; /* @@ -286,7 +290,8 @@ static void update_ftrace_function(void) * recursion safe and not dynamic and the arch supports passing ops, * then have the mcount trampoline call the function directly. */ - } else if (ftrace_ops_list->next == &ftrace_list_end) { + } else if (rcu_dereference_protected(ftrace_ops_list->next, + lockdep_is_held(&ftrace_lock)) == &ftrace_list_end) { func = ftrace_ops_get_list_func(ftrace_ops_list); } else { @@ -348,9 +353,11 @@ int using_ftrace_ops_list_func(void) return ftrace_trace_function == ftrace_ops_list_func; } -static void add_ftrace_ops(struct ftrace_ops **list, struct ftrace_ops *ops) +static void add_ftrace_ops(struct ftrace_ops __rcu **list, + struct ftrace_ops *ops) { - ops->next = *list; + rcu_assign_pointer(ops->next, *list); + /* * We are entering ops into the list but another * CPU might be walking that list. We need to make sure @@ -360,7 +367,8 @@ static void add_ftrace_ops(struct ftrace_ops **list, struct ftrace_ops *ops) rcu_assign_pointer(*list, ops); } -static int remove_ftrace_ops(struct ftrace_ops **list, struct ftrace_ops *ops) +static int remove_ftrace_ops(struct ftrace_ops __rcu **list, + struct ftrace_ops *ops) { struct ftrace_ops **p; @@ -368,7 +376,10 @@ static int remove_ftrace_ops(struct ftrace_ops **list, struct ftrace_ops *ops) * If we are removing the last function, then simply point * to the ftrace_stub. */ - if (*list == ops && ops->next == &ftrace_list_end) { + if (rcu_dereference_protected(*list, + lockdep_is_held(&ftrace_lock)) == ops && + rcu_dereference_protected(ops->next, + lockdep_is_held(&ftrace_lock)) == &ftrace_list_end) { *list = &ftrace_list_end; return 0; } @@ -1569,8 +1580,8 @@ ftrace_ops_test(struct ftrace_ops *ops, unsigned long ip, void *regs) return 0; #endif - hash.filter_hash = rcu_dereference_raw_notrace(ops->func_hash->filter_hash); - hash.notrace_hash = rcu_dereference_raw_notrace(ops->func_hash->notrace_hash); + rcu_assign_pointer(hash.filter_hash, ops->func_hash->filter_hash); + rcu_assign_pointer(hash.notrace_hash, ops->func_hash->notrace_hash); if (hash_contains_ip(ip, &hash)) ret = 1; @@ -2840,7 +2851,8 @@ static int ftrace_shutdown(struct ftrace_ops *ops, int command) * If there's no more ops registered with ftrace, run a * sanity check to make sure all rec flags are cleared. */ - if (ftrace_ops_list == &ftrace_list_end) { + if (rcu_dereference_protected(ftrace_ops_list, + lockdep_is_held(&ftrace_lock)) == &ftrace_list_end) { struct ftrace_page *pg; struct dyn_ftrace *rec; @@ -6453,7 +6465,8 @@ ftrace_enable_sysctl(struct ctl_table *table, int write, if (ftrace_enabled) { /* we are starting ftrace again */ - if (ftrace_ops_list != &ftrace_list_end) + if (rcu_dereference_protected(ftrace_ops_list, + lockdep_is_held(&ftrace_lock)) != &ftrace_list_end) update_ftrace_function(); ftrace_startup_sysctl(); diff --git a/kernel/trace/trace.h b/kernel/trace/trace.h index 6ade1c55cc3a..490ba229931d 100644 --- a/kernel/trace/trace.h +++ b/kernel/trace/trace.h @@ -1210,9 +1210,9 @@ struct ftrace_event_field { struct event_filter { int n_preds; /* Number assigned */ int a_preds; /* allocated */ - struct filter_pred *preds; - struct filter_pred *root; - char *filter_string; + struct filter_pred __rcu *preds; + struct filter_pred __rcu *root; + char *filter_string; }; struct event_subsystem { -- cgit v1.2.3-71-gd317 From 43fc509c3efb5c973991ee24c449ab2a0d71dd1e Mon Sep 17 00:00:00 2001 From: Vladimir Murzin Date: Thu, 20 Jul 2017 11:19:58 +0100 Subject: dma-coherent: introduce interface for default DMA pool Christoph noticed [1] that default DMA pool in current form overload the DMA coherent infrastructure. In reply, Robin suggested [2] to split the per-device vs. global pool interfaces, so allocation/release from default DMA pool is driven by dma ops implementation. This patch implements Robin's idea and provide interface to allocate/release/mmap the default (aka global) DMA pool. To make it clear that existing *_from_coherent routines work on per-device pool rename them to *_from_dev_coherent. [1] https://lkml.org/lkml/2017/7/7/370 [2] https://lkml.org/lkml/2017/7/7/431 Cc: Vineet Gupta Cc: Russell King Cc: Catalin Marinas Cc: Will Deacon Cc: Ralf Baechle Suggested-by: Robin Murphy Tested-by: Andras Szemzo Reviewed-by: Robin Murphy Signed-off-by: Vladimir Murzin Signed-off-by: Christoph Hellwig --- arch/arc/mm/dma.c | 2 +- arch/arm/mm/dma-mapping.c | 2 +- arch/arm64/mm/dma-mapping.c | 4 +- arch/mips/mm/dma-default.c | 2 +- drivers/base/dma-coherent.c | 164 ++++++++++++++++++++++++++++---------------- drivers/base/dma-mapping.c | 2 +- include/linux/dma-mapping.h | 40 ++++++++--- 7 files changed, 144 insertions(+), 72 deletions(-) (limited to 'include') diff --git a/arch/arc/mm/dma.c b/arch/arc/mm/dma.c index 2a07e6ecafbd..71d3efff99d3 100644 --- a/arch/arc/mm/dma.c +++ b/arch/arc/mm/dma.c @@ -117,7 +117,7 @@ static int arc_dma_mmap(struct device *dev, struct vm_area_struct *vma, vma->vm_page_prot = pgprot_noncached(vma->vm_page_prot); - if (dma_mmap_from_coherent(dev, vma, cpu_addr, size, &ret)) + if (dma_mmap_from_dev_coherent(dev, vma, cpu_addr, size, &ret)) return ret; if (off < count && user_count <= (count - off)) { diff --git a/arch/arm/mm/dma-mapping.c b/arch/arm/mm/dma-mapping.c index e7380bafbfa6..fcf1473d6fed 100644 --- a/arch/arm/mm/dma-mapping.c +++ b/arch/arm/mm/dma-mapping.c @@ -851,7 +851,7 @@ static int __arm_dma_mmap(struct device *dev, struct vm_area_struct *vma, unsigned long pfn = dma_to_pfn(dev, dma_addr); unsigned long off = vma->vm_pgoff; - if (dma_mmap_from_coherent(dev, vma, cpu_addr, size, &ret)) + if (dma_mmap_from_dev_coherent(dev, vma, cpu_addr, size, &ret)) return ret; if (off < nr_pages && nr_vma_pages <= (nr_pages - off)) { diff --git a/arch/arm64/mm/dma-mapping.c b/arch/arm64/mm/dma-mapping.c index e90cd1db42a8..f27d4dd04384 100644 --- a/arch/arm64/mm/dma-mapping.c +++ b/arch/arm64/mm/dma-mapping.c @@ -329,7 +329,7 @@ static int __swiotlb_mmap(struct device *dev, vma->vm_page_prot = __get_dma_pgprot(attrs, vma->vm_page_prot, is_device_dma_coherent(dev)); - if (dma_mmap_from_coherent(dev, vma, cpu_addr, size, &ret)) + if (dma_mmap_from_dev_coherent(dev, vma, cpu_addr, size, &ret)) return ret; return __swiotlb_mmap_pfn(vma, pfn, size); @@ -706,7 +706,7 @@ static int __iommu_mmap_attrs(struct device *dev, struct vm_area_struct *vma, vma->vm_page_prot = __get_dma_pgprot(attrs, vma->vm_page_prot, is_device_dma_coherent(dev)); - if (dma_mmap_from_coherent(dev, vma, cpu_addr, size, &ret)) + if (dma_mmap_from_dev_coherent(dev, vma, cpu_addr, size, &ret)) return ret; if (attrs & DMA_ATTR_FORCE_CONTIGUOUS) { diff --git a/arch/mips/mm/dma-default.c b/arch/mips/mm/dma-default.c index e08598c70b3e..8e78251eccc2 100644 --- a/arch/mips/mm/dma-default.c +++ b/arch/mips/mm/dma-default.c @@ -232,7 +232,7 @@ static int mips_dma_mmap(struct device *dev, struct vm_area_struct *vma, else vma->vm_page_prot = pgprot_noncached(vma->vm_page_prot); - if (dma_mmap_from_coherent(dev, vma, cpu_addr, size, &ret)) + if (dma_mmap_from_dev_coherent(dev, vma, cpu_addr, size, &ret)) return ret; if (off < count && user_count <= (count - off)) { diff --git a/drivers/base/dma-coherent.c b/drivers/base/dma-coherent.c index 2ae24c28e70c..1c152aed6b82 100644 --- a/drivers/base/dma-coherent.c +++ b/drivers/base/dma-coherent.c @@ -25,7 +25,7 @@ static inline struct dma_coherent_mem *dev_get_coherent_memory(struct device *de { if (dev && dev->dma_mem) return dev->dma_mem; - return dma_coherent_default_memory; + return NULL; } static inline dma_addr_t dma_get_device_base(struct device *dev, @@ -165,34 +165,15 @@ void *dma_mark_declared_memory_occupied(struct device *dev, } EXPORT_SYMBOL(dma_mark_declared_memory_occupied); -/** - * dma_alloc_from_coherent() - try to allocate memory from the per-device coherent area - * - * @dev: device from which we allocate memory - * @size: size of requested memory area - * @dma_handle: This will be filled with the correct dma handle - * @ret: This pointer will be filled with the virtual address - * to allocated area. - * - * This function should be only called from per-arch dma_alloc_coherent() - * to support allocation from per-device coherent memory pools. - * - * Returns 0 if dma_alloc_coherent should continue with allocating from - * generic memory areas, or !0 if dma_alloc_coherent should return @ret. - */ -int dma_alloc_from_coherent(struct device *dev, ssize_t size, - dma_addr_t *dma_handle, void **ret) +static void *__dma_alloc_from_coherent(struct dma_coherent_mem *mem, + ssize_t size, dma_addr_t *dma_handle) { - struct dma_coherent_mem *mem = dev_get_coherent_memory(dev); int order = get_order(size); unsigned long flags; int pageno; int dma_memory_map; + void *ret; - if (!mem) - return 0; - - *ret = NULL; spin_lock_irqsave(&mem->spinlock, flags); if (unlikely(size > (mem->size << PAGE_SHIFT))) @@ -203,21 +184,50 @@ int dma_alloc_from_coherent(struct device *dev, ssize_t size, goto err; /* - * Memory was found in the per-device area. + * Memory was found in the coherent area. */ - *dma_handle = dma_get_device_base(dev, mem) + (pageno << PAGE_SHIFT); - *ret = mem->virt_base + (pageno << PAGE_SHIFT); + *dma_handle = mem->device_base + (pageno << PAGE_SHIFT); + ret = mem->virt_base + (pageno << PAGE_SHIFT); dma_memory_map = (mem->flags & DMA_MEMORY_MAP); spin_unlock_irqrestore(&mem->spinlock, flags); if (dma_memory_map) - memset(*ret, 0, size); + memset(ret, 0, size); else - memset_io(*ret, 0, size); + memset_io(ret, 0, size); - return 1; + return ret; err: spin_unlock_irqrestore(&mem->spinlock, flags); + return NULL; +} + +/** + * dma_alloc_from_dev_coherent() - allocate memory from device coherent pool + * @dev: device from which we allocate memory + * @size: size of requested memory area + * @dma_handle: This will be filled with the correct dma handle + * @ret: This pointer will be filled with the virtual address + * to allocated area. + * + * This function should be only called from per-arch dma_alloc_coherent() + * to support allocation from per-device coherent memory pools. + * + * Returns 0 if dma_alloc_coherent should continue with allocating from + * generic memory areas, or !0 if dma_alloc_coherent should return @ret. + */ +int dma_alloc_from_dev_coherent(struct device *dev, ssize_t size, + dma_addr_t *dma_handle, void **ret) +{ + struct dma_coherent_mem *mem = dev_get_coherent_memory(dev); + + if (!mem) + return 0; + + *ret = __dma_alloc_from_coherent(mem, size, dma_handle); + if (*ret) + return 1; + /* * In the case where the allocation can not be satisfied from the * per-device area, try to fall back to generic memory if the @@ -225,25 +235,20 @@ err: */ return mem->flags & DMA_MEMORY_EXCLUSIVE; } -EXPORT_SYMBOL(dma_alloc_from_coherent); +EXPORT_SYMBOL(dma_alloc_from_dev_coherent); -/** - * dma_release_from_coherent() - try to free the memory allocated from per-device coherent memory pool - * @dev: device from which the memory was allocated - * @order: the order of pages allocated - * @vaddr: virtual address of allocated pages - * - * This checks whether the memory was allocated from the per-device - * coherent memory pool and if so, releases that memory. - * - * Returns 1 if we correctly released the memory, or 0 if - * dma_release_coherent() should proceed with releasing memory from - * generic pools. - */ -int dma_release_from_coherent(struct device *dev, int order, void *vaddr) +void *dma_alloc_from_global_coherent(ssize_t size, dma_addr_t *dma_handle) { - struct dma_coherent_mem *mem = dev_get_coherent_memory(dev); + if (!dma_coherent_default_memory) + return NULL; + + return __dma_alloc_from_coherent(dma_coherent_default_memory, size, + dma_handle); +} +static int __dma_release_from_coherent(struct dma_coherent_mem *mem, + int order, void *vaddr) +{ if (mem && vaddr >= mem->virt_base && vaddr < (mem->virt_base + (mem->size << PAGE_SHIFT))) { int page = (vaddr - mem->virt_base) >> PAGE_SHIFT; @@ -256,28 +261,39 @@ int dma_release_from_coherent(struct device *dev, int order, void *vaddr) } return 0; } -EXPORT_SYMBOL(dma_release_from_coherent); /** - * dma_mmap_from_coherent() - try to mmap the memory allocated from - * per-device coherent memory pool to userspace + * dma_release_from_dev_coherent() - free memory to device coherent memory pool * @dev: device from which the memory was allocated - * @vma: vm_area for the userspace memory - * @vaddr: cpu address returned by dma_alloc_from_coherent - * @size: size of the memory buffer allocated by dma_alloc_from_coherent - * @ret: result from remap_pfn_range() + * @order: the order of pages allocated + * @vaddr: virtual address of allocated pages * * This checks whether the memory was allocated from the per-device - * coherent memory pool and if so, maps that memory to the provided vma. + * coherent memory pool and if so, releases that memory. * - * Returns 1 if we correctly mapped the memory, or 0 if the caller should - * proceed with mapping memory from generic pools. + * Returns 1 if we correctly released the memory, or 0 if the caller should + * proceed with releasing memory from generic pools. */ -int dma_mmap_from_coherent(struct device *dev, struct vm_area_struct *vma, - void *vaddr, size_t size, int *ret) +int dma_release_from_dev_coherent(struct device *dev, int order, void *vaddr) { struct dma_coherent_mem *mem = dev_get_coherent_memory(dev); + return __dma_release_from_coherent(mem, order, vaddr); +} +EXPORT_SYMBOL(dma_release_from_dev_coherent); + +int dma_release_from_global_coherent(int order, void *vaddr) +{ + if (!dma_coherent_default_memory) + return 0; + + return __dma_release_from_coherent(dma_coherent_default_memory, order, + vaddr); +} + +static int __dma_mmap_from_coherent(struct dma_coherent_mem *mem, + struct vm_area_struct *vma, void *vaddr, size_t size, int *ret) +{ if (mem && vaddr >= mem->virt_base && vaddr + size <= (mem->virt_base + (mem->size << PAGE_SHIFT))) { unsigned long off = vma->vm_pgoff; @@ -296,7 +312,39 @@ int dma_mmap_from_coherent(struct device *dev, struct vm_area_struct *vma, } return 0; } -EXPORT_SYMBOL(dma_mmap_from_coherent); + +/** + * dma_mmap_from_dev_coherent() - mmap memory from the device coherent pool + * @dev: device from which the memory was allocated + * @vma: vm_area for the userspace memory + * @vaddr: cpu address returned by dma_alloc_from_dev_coherent + * @size: size of the memory buffer allocated + * @ret: result from remap_pfn_range() + * + * This checks whether the memory was allocated from the per-device + * coherent memory pool and if so, maps that memory to the provided vma. + * + * Returns 1 if we correctly mapped the memory, or 0 if the caller should + * proceed with mapping memory from generic pools. + */ +int dma_mmap_from_dev_coherent(struct device *dev, struct vm_area_struct *vma, + void *vaddr, size_t size, int *ret) +{ + struct dma_coherent_mem *mem = dev_get_coherent_memory(dev); + + return __dma_mmap_from_coherent(mem, vma, vaddr, size, ret); +} +EXPORT_SYMBOL(dma_mmap_from_dev_coherent); + +int dma_mmap_from_global_coherent(struct vm_area_struct *vma, void *vaddr, + size_t size, int *ret) +{ + if (!dma_coherent_default_memory) + return 0; + + return __dma_mmap_from_coherent(dma_coherent_default_memory, vma, + vaddr, size, ret); +} /* * Support for reserved memory regions defined in device tree diff --git a/drivers/base/dma-mapping.c b/drivers/base/dma-mapping.c index 5096755d185e..b555ff9dd8fc 100644 --- a/drivers/base/dma-mapping.c +++ b/drivers/base/dma-mapping.c @@ -235,7 +235,7 @@ int dma_common_mmap(struct device *dev, struct vm_area_struct *vma, vma->vm_page_prot = pgprot_noncached(vma->vm_page_prot); - if (dma_mmap_from_coherent(dev, vma, cpu_addr, size, &ret)) + if (dma_mmap_from_dev_coherent(dev, vma, cpu_addr, size, &ret)) return ret; if (off < count && user_count <= (count - off)) { diff --git a/include/linux/dma-mapping.h b/include/linux/dma-mapping.h index 843ab866e0f4..03c0196a6f24 100644 --- a/include/linux/dma-mapping.h +++ b/include/linux/dma-mapping.h @@ -157,16 +157,40 @@ static inline int is_device_dma_capable(struct device *dev) * These three functions are only for dma allocator. * Don't use them in device drivers. */ -int dma_alloc_from_coherent(struct device *dev, ssize_t size, +int dma_alloc_from_dev_coherent(struct device *dev, ssize_t size, dma_addr_t *dma_handle, void **ret); -int dma_release_from_coherent(struct device *dev, int order, void *vaddr); +int dma_release_from_dev_coherent(struct device *dev, int order, void *vaddr); -int dma_mmap_from_coherent(struct device *dev, struct vm_area_struct *vma, +int dma_mmap_from_dev_coherent(struct device *dev, struct vm_area_struct *vma, void *cpu_addr, size_t size, int *ret); + +void *dma_alloc_from_global_coherent(ssize_t size, dma_addr_t *dma_handle); +int dma_release_from_global_coherent(int order, void *vaddr); +int dma_mmap_from_global_coherent(struct vm_area_struct *vma, void *cpu_addr, + size_t size, int *ret); + #else -#define dma_alloc_from_coherent(dev, size, handle, ret) (0) -#define dma_release_from_coherent(dev, order, vaddr) (0) -#define dma_mmap_from_coherent(dev, vma, vaddr, order, ret) (0) +#define dma_alloc_from_dev_coherent(dev, size, handle, ret) (0) +#define dma_release_from_dev_coherent(dev, order, vaddr) (0) +#define dma_mmap_from_dev_coherent(dev, vma, vaddr, order, ret) (0) + +static inline void *dma_alloc_from_global_coherent(ssize_t size, + dma_addr_t *dma_handle) +{ + return NULL; +} + +static inline int dma_release_from_global_coherent(int order, void *vaddr) +{ + return 0; +} + +static inline int dma_mmap_from_global_coherent(struct vm_area_struct *vma, + void *cpu_addr, size_t size, + int *ret) +{ + return 0; +} #endif /* CONFIG_HAVE_GENERIC_DMA_COHERENT */ #ifdef CONFIG_HAS_DMA @@ -481,7 +505,7 @@ static inline void *dma_alloc_attrs(struct device *dev, size_t size, BUG_ON(!ops); - if (dma_alloc_from_coherent(dev, size, dma_handle, &cpu_addr)) + if (dma_alloc_from_dev_coherent(dev, size, dma_handle, &cpu_addr)) return cpu_addr; if (!arch_dma_alloc_attrs(&dev, &flag)) @@ -503,7 +527,7 @@ static inline void dma_free_attrs(struct device *dev, size_t size, BUG_ON(!ops); WARN_ON(irqs_disabled()); - if (dma_release_from_coherent(dev, get_order(size), cpu_addr)) + if (dma_release_from_dev_coherent(dev, get_order(size), cpu_addr)) return; if (!ops->free || !cpu_addr) -- cgit v1.2.3-71-gd317 From dc1a0afbacaeaced8f5679a99047c0467f1099e9 Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Fri, 14 Jul 2017 11:12:09 +0200 Subject: nvme: fix byte swapping in the streams code Signed-off-by: Christoph Hellwig Reviewed-by: Jens Axboe Signed-off-by: Jens Axboe --- drivers/nvme/host/core.c | 2 +- include/linux/nvme.h | 14 +++++++------- 2 files changed, 8 insertions(+), 8 deletions(-) (limited to 'include') diff --git a/drivers/nvme/host/core.c b/drivers/nvme/host/core.c index cb96f4a7ae3a..3b77cfe5aa1e 100644 --- a/drivers/nvme/host/core.c +++ b/drivers/nvme/host/core.c @@ -336,7 +336,7 @@ static int nvme_get_stream_params(struct nvme_ctrl *ctrl, c.directive.opcode = nvme_admin_directive_recv; c.directive.nsid = cpu_to_le32(nsid); - c.directive.numd = sizeof(*s); + c.directive.numd = cpu_to_le32(sizeof(*s)); c.directive.doper = NVME_DIR_RCV_ST_OP_PARAM; c.directive.dtype = NVME_DIR_STREAMS; diff --git a/include/linux/nvme.h b/include/linux/nvme.h index 6b8ee9e628e1..bc74da018bdc 100644 --- a/include/linux/nvme.h +++ b/include/linux/nvme.h @@ -963,14 +963,14 @@ struct nvme_dbbuf { }; struct streams_directive_params { - __u16 msl; - __u16 nssa; - __u16 nsso; + __le16 msl; + __le16 nssa; + __le16 nsso; __u8 rsvd[10]; - __u32 sws; - __u16 sgs; - __u16 nsa; - __u16 nso; + __le32 sws; + __le16 sgs; + __le16 nsa; + __le16 nso; __u8 rsvd2[6]; }; -- cgit v1.2.3-71-gd317 From a25ce4270bfdd522207b02f81a594c7d1746b697 Mon Sep 17 00:00:00 2001 From: Kaike Wan Date: Sat, 17 Jun 2017 10:37:26 -0700 Subject: IB/rdmavt: Setting of QP timeout can overflow jiffies computation Current computation of qp->timeout_jiffies in rvt_modify_qp() will cause overflow due to the fact that the input to the function usecs_to_jiffies is only 32-bit ( unsigned int). Overflow will occur when attr->timeout is equal to or greater than 30. The consequence is unnecessarily excessive retry and thus degradation of the system performance. This patch fixes the problem by limiting the input to 5-bit and calling usecs_to_jiffies() before multiplying the scaling factor. Reviewed-by: Mike Marciniszyn Signed-off-by: Kaike Wan Signed-off-by: Dennis Dalessandro Signed-off-by: Doug Ledford --- drivers/infiniband/sw/rdmavt/qp.c | 4 +--- include/rdma/rdmavt_qp.h | 14 ++++++++++++++ 2 files changed, 15 insertions(+), 3 deletions(-) (limited to 'include') diff --git a/drivers/infiniband/sw/rdmavt/qp.c b/drivers/infiniband/sw/rdmavt/qp.c index 459865439a0b..8876ee7bc326 100644 --- a/drivers/infiniband/sw/rdmavt/qp.c +++ b/drivers/infiniband/sw/rdmavt/qp.c @@ -1258,9 +1258,7 @@ int rvt_modify_qp(struct ib_qp *ibqp, struct ib_qp_attr *attr, if (attr_mask & IB_QP_TIMEOUT) { qp->timeout = attr->timeout; - qp->timeout_jiffies = - usecs_to_jiffies((4096UL * (1UL << qp->timeout)) / - 1000UL); + qp->timeout_jiffies = rvt_timeout_to_jiffies(qp->timeout); } if (attr_mask & IB_QP_QKEY) diff --git a/include/rdma/rdmavt_qp.h b/include/rdma/rdmavt_qp.h index be6472e5b06b..d664d2e76280 100644 --- a/include/rdma/rdmavt_qp.h +++ b/include/rdma/rdmavt_qp.h @@ -647,6 +647,20 @@ static inline u32 rvt_div_mtu(struct rvt_qp *qp, u32 len) return len >> qp->log_pmtu; } +/** + * rvt_timeout_to_jiffies - Convert a ULP timeout input into jiffies + * @timeout - timeout input(0 - 31). + * + * Return a timeout value in jiffies. + */ +static inline unsigned long rvt_timeout_to_jiffies(u8 timeout) +{ + if (timeout > 31) + timeout = 31; + + return usecs_to_jiffies(1U << timeout) * 4096UL / 1000UL; +} + extern const int ib_rvt_state_ops[]; struct rvt_dev_info; -- cgit v1.2.3-71-gd317 From 963916fdb3e5ad4af57ac959b5a03bf23f7568ca Mon Sep 17 00:00:00 2001 From: "Kalderon, Michal" Date: Thu, 6 Jul 2017 23:22:11 +0300 Subject: IB/cma: Fix reference count leak when no ipv4 addresses are set Once in_dev_get is called to receive in_device pointer, the in_device reference counter is increased, but if there are no ipv4 addresses configured on the net-device the ifa_list will be null, resulting in a flow that doesn't call in_dev_put to decrease the ref_cnt. This was exposed when running RoCE over ipv6 without any ipv4 addresses configured Fixes: commit 8e3867310c90 ("IB/cma: Fix a race condition in iboe_addr_get_sgid()") Signed-off-by: Michal Kalderon Signed-off-by: Ariel Elior Signed-off-by: Doug Ledford --- include/rdma/ib_addr.h | 6 ++++-- 1 file changed, 4 insertions(+), 2 deletions(-) (limited to 'include') diff --git a/include/rdma/ib_addr.h b/include/rdma/ib_addr.h index 4b34c51f859e..b73a14edc85e 100644 --- a/include/rdma/ib_addr.h +++ b/include/rdma/ib_addr.h @@ -205,11 +205,13 @@ static inline void iboe_addr_get_sgid(struct rdma_dev_addr *dev_addr, dev = dev_get_by_index(&init_net, dev_addr->bound_dev_if); if (dev) { ip4 = in_dev_get(dev); - if (ip4 && ip4->ifa_list && ip4->ifa_list->ifa_address) { + if (ip4 && ip4->ifa_list && ip4->ifa_list->ifa_address) ipv6_addr_set_v4mapped(ip4->ifa_list->ifa_address, (struct in6_addr *)gid); + + if (ip4) in_dev_put(ip4); - } + dev_put(dev); } } -- cgit v1.2.3-71-gd317 From 4cabc5b186b5427b9ee5a7495172542af105f02b Mon Sep 17 00:00:00 2001 From: Daniel Borkmann Date: Fri, 21 Jul 2017 00:00:21 +0200 Subject: bpf: fix mixed signed/unsigned derived min/max value bounds Edward reported that there's an issue in min/max value bounds tracking when signed and unsigned compares both provide hints on limits when having unknown variables. E.g. a program such as the following should have been rejected: 0: (7a) *(u64 *)(r10 -8) = 0 1: (bf) r2 = r10 2: (07) r2 += -8 3: (18) r1 = 0xffff8a94cda93400 5: (85) call bpf_map_lookup_elem#1 6: (15) if r0 == 0x0 goto pc+7 R0=map_value(ks=8,vs=8,id=0),min_value=0,max_value=0 R10=fp 7: (7a) *(u64 *)(r10 -16) = -8 8: (79) r1 = *(u64 *)(r10 -16) 9: (b7) r2 = -1 10: (2d) if r1 > r2 goto pc+3 R0=map_value(ks=8,vs=8,id=0),min_value=0,max_value=0 R1=inv,min_value=0 R2=imm-1,max_value=18446744073709551615,min_align=1 R10=fp 11: (65) if r1 s> 0x1 goto pc+2 R0=map_value(ks=8,vs=8,id=0),min_value=0,max_value=0 R1=inv,min_value=0,max_value=1 R2=imm-1,max_value=18446744073709551615,min_align=1 R10=fp 12: (0f) r0 += r1 13: (72) *(u8 *)(r0 +0) = 0 R0=map_value_adj(ks=8,vs=8,id=0),min_value=0,max_value=1 R1=inv,min_value=0,max_value=1 R2=imm-1,max_value=18446744073709551615,min_align=1 R10=fp 14: (b7) r0 = 0 15: (95) exit What happens is that in the first part ... 8: (79) r1 = *(u64 *)(r10 -16) 9: (b7) r2 = -1 10: (2d) if r1 > r2 goto pc+3 ... r1 carries an unsigned value, and is compared as unsigned against a register carrying an immediate. Verifier deduces in reg_set_min_max() that since the compare is unsigned and operation is greater than (>), that in the fall-through/false case, r1's minimum bound must be 0 and maximum bound must be r2. Latter is larger than the bound and thus max value is reset back to being 'invalid' aka BPF_REGISTER_MAX_RANGE. Thus, r1 state is now 'R1=inv,min_value=0'. The subsequent test ... 11: (65) if r1 s> 0x1 goto pc+2 ... is a signed compare of r1 with immediate value 1. Here, verifier deduces in reg_set_min_max() that since the compare is signed this time and operation is greater than (>), that in the fall-through/false case, we can deduce that r1's maximum bound must be 1, meaning with prior test, we result in r1 having the following state: R1=inv,min_value=0,max_value=1. Given that the actual value this holds is -8, the bounds are wrongly deduced. When this is being added to r0 which holds the map_value(_adj) type, then subsequent store access in above case will go through check_mem_access() which invokes check_map_access_adj(), that will then probe whether the map memory is in bounds based on the min_value and max_value as well as access size since the actual unknown value is min_value <= x <= max_value; commit fce366a9dd0d ("bpf, verifier: fix alu ops against map_value{, _adj} register types") provides some more explanation on the semantics. It's worth to note in this context that in the current code, min_value and max_value tracking are used for two things, i) dynamic map value access via check_map_access_adj() and since commit 06c1c049721a ("bpf: allow helpers access to variable memory") ii) also enforced at check_helper_mem_access() when passing a memory address (pointer to packet, map value, stack) and length pair to a helper and the length in this case is an unknown value defining an access range through min_value/max_value in that case. The min_value/max_value tracking is /not/ used in the direct packet access case to track ranges. However, the issue also affects case ii), for example, the following crafted program based on the same principle must be rejected as well: 0: (b7) r2 = 0 1: (bf) r3 = r10 2: (07) r3 += -512 3: (7a) *(u64 *)(r10 -16) = -8 4: (79) r4 = *(u64 *)(r10 -16) 5: (b7) r6 = -1 6: (2d) if r4 > r6 goto pc+5 R1=ctx R2=imm0,min_value=0,max_value=0,min_align=2147483648 R3=fp-512 R4=inv,min_value=0 R6=imm-1,max_value=18446744073709551615,min_align=1 R10=fp 7: (65) if r4 s> 0x1 goto pc+4 R1=ctx R2=imm0,min_value=0,max_value=0,min_align=2147483648 R3=fp-512 R4=inv,min_value=0,max_value=1 R6=imm-1,max_value=18446744073709551615,min_align=1 R10=fp 8: (07) r4 += 1 9: (b7) r5 = 0 10: (6a) *(u16 *)(r10 -512) = 0 11: (85) call bpf_skb_load_bytes#26 12: (b7) r0 = 0 13: (95) exit Meaning, while we initialize the max_value stack slot that the verifier thinks we access in the [1,2] range, in reality we pass -7 as length which is interpreted as u32 in the helper. Thus, this issue is relevant also for the case of helper ranges. Resetting both bounds in check_reg_overflow() in case only one of them exceeds limits is also not enough as similar test can be created that uses values which are within range, thus also here learned min value in r1 is incorrect when mixed with later signed test to create a range: 0: (7a) *(u64 *)(r10 -8) = 0 1: (bf) r2 = r10 2: (07) r2 += -8 3: (18) r1 = 0xffff880ad081fa00 5: (85) call bpf_map_lookup_elem#1 6: (15) if r0 == 0x0 goto pc+7 R0=map_value(ks=8,vs=8,id=0),min_value=0,max_value=0 R10=fp 7: (7a) *(u64 *)(r10 -16) = -8 8: (79) r1 = *(u64 *)(r10 -16) 9: (b7) r2 = 2 10: (3d) if r2 >= r1 goto pc+3 R0=map_value(ks=8,vs=8,id=0),min_value=0,max_value=0 R1=inv,min_value=3 R2=imm2,min_value=2,max_value=2,min_align=2 R10=fp 11: (65) if r1 s> 0x4 goto pc+2 R0=map_value(ks=8,vs=8,id=0),min_value=0,max_value=0 R1=inv,min_value=3,max_value=4 R2=imm2,min_value=2,max_value=2,min_align=2 R10=fp 12: (0f) r0 += r1 13: (72) *(u8 *)(r0 +0) = 0 R0=map_value_adj(ks=8,vs=8,id=0),min_value=3,max_value=4 R1=inv,min_value=3,max_value=4 R2=imm2,min_value=2,max_value=2,min_align=2 R10=fp 14: (b7) r0 = 0 15: (95) exit This leaves us with two options for fixing this: i) to invalidate all prior learned information once we switch signed context, ii) to track min/max signed and unsigned boundaries separately as done in [0]. (Given latter introduces major changes throughout the whole verifier, it's rather net-next material, thus this patch follows option i), meaning we can derive bounds either from only signed tests or only unsigned tests.) There is still the case of adjust_reg_min_max_vals(), where we adjust bounds on ALU operations, meaning programs like the following where boundaries on the reg get mixed in context later on when bounds are merged on the dst reg must get rejected, too: 0: (7a) *(u64 *)(r10 -8) = 0 1: (bf) r2 = r10 2: (07) r2 += -8 3: (18) r1 = 0xffff89b2bf87ce00 5: (85) call bpf_map_lookup_elem#1 6: (15) if r0 == 0x0 goto pc+6 R0=map_value(ks=8,vs=8,id=0),min_value=0,max_value=0 R10=fp 7: (7a) *(u64 *)(r10 -16) = -8 8: (79) r1 = *(u64 *)(r10 -16) 9: (b7) r2 = 2 10: (3d) if r2 >= r1 goto pc+2 R0=map_value(ks=8,vs=8,id=0),min_value=0,max_value=0 R1=inv,min_value=3 R2=imm2,min_value=2,max_value=2,min_align=2 R10=fp 11: (b7) r7 = 1 12: (65) if r7 s> 0x0 goto pc+2 R0=map_value(ks=8,vs=8,id=0),min_value=0,max_value=0 R1=inv,min_value=3 R2=imm2,min_value=2,max_value=2,min_align=2 R7=imm1,max_value=0 R10=fp 13: (b7) r0 = 0 14: (95) exit from 12 to 15: R0=map_value(ks=8,vs=8,id=0),min_value=0,max_value=0 R1=inv,min_value=3 R2=imm2,min_value=2,max_value=2,min_align=2 R7=imm1,min_value=1 R10=fp 15: (0f) r7 += r1 16: (65) if r7 s> 0x4 goto pc+2 R0=map_value(ks=8,vs=8,id=0),min_value=0,max_value=0 R1=inv,min_value=3 R2=imm2,min_value=2,max_value=2,min_align=2 R7=inv,min_value=4,max_value=4 R10=fp 17: (0f) r0 += r7 18: (72) *(u8 *)(r0 +0) = 0 R0=map_value_adj(ks=8,vs=8,id=0),min_value=4,max_value=4 R1=inv,min_value=3 R2=imm2,min_value=2,max_value=2,min_align=2 R7=inv,min_value=4,max_value=4 R10=fp 19: (b7) r0 = 0 20: (95) exit Meaning, in adjust_reg_min_max_vals() we must also reset range values on the dst when src/dst registers have mixed signed/ unsigned derived min/max value bounds with one unbounded value as otherwise they can be added together deducing false boundaries. Once both boundaries are established from either ALU ops or compare operations w/o mixing signed/unsigned insns, then they can safely be added to other regs also having both boundaries established. Adding regs with one unbounded side to a map value where the bounded side has been learned w/o mixing ops is possible, but the resulting map value won't recover from that, meaning such op is considered invalid on the time of actual access. Invalid bounds are set on the dst reg in case i) src reg, or ii) in case dst reg already had them. The only way to recover would be to perform i) ALU ops but only 'add' is allowed on map value types or ii) comparisons, but these are disallowed on pointers in case they span a range. This is fine as only BPF_JEQ and BPF_JNE may be performed on PTR_TO_MAP_VALUE_OR_NULL registers which potentially turn them into PTR_TO_MAP_VALUE type depending on the branch, so only here min/max value cannot be invalidated for them. In terms of state pruning, value_from_signed is considered as well in states_equal() when dealing with adjusted map values. With regards to breaking existing programs, there is a small risk, but use-cases are rather quite narrow where this could occur and mixing compares probably unlikely. Joint work with Josef and Edward. [0] https://lists.iovisor.org/pipermail/iovisor-dev/2017-June/000822.html Fixes: 484611357c19 ("bpf: allow access into map value arrays") Reported-by: Edward Cree Signed-off-by: Daniel Borkmann Signed-off-by: Edward Cree Signed-off-by: Josef Bacik Signed-off-by: David S. Miller --- include/linux/bpf_verifier.h | 1 + kernel/bpf/verifier.c | 108 +++++++++++++++++++++++++++++++++++++------ 2 files changed, 95 insertions(+), 14 deletions(-) (limited to 'include') diff --git a/include/linux/bpf_verifier.h b/include/linux/bpf_verifier.h index 621076f56251..8e5d31f6faef 100644 --- a/include/linux/bpf_verifier.h +++ b/include/linux/bpf_verifier.h @@ -43,6 +43,7 @@ struct bpf_reg_state { u32 min_align; u32 aux_off; u32 aux_off_align; + bool value_from_signed; }; enum bpf_stack_slot_type { diff --git a/kernel/bpf/verifier.c b/kernel/bpf/verifier.c index 6a86723c5b64..af9e84a4944e 100644 --- a/kernel/bpf/verifier.c +++ b/kernel/bpf/verifier.c @@ -504,6 +504,7 @@ static void reset_reg_range_values(struct bpf_reg_state *regs, u32 regno) { regs[regno].min_value = BPF_REGISTER_MIN_RANGE; regs[regno].max_value = BPF_REGISTER_MAX_RANGE; + regs[regno].value_from_signed = false; regs[regno].min_align = 0; } @@ -777,12 +778,13 @@ static int check_ctx_access(struct bpf_verifier_env *env, int insn_idx, int off, return -EACCES; } -static bool is_pointer_value(struct bpf_verifier_env *env, int regno) +static bool __is_pointer_value(bool allow_ptr_leaks, + const struct bpf_reg_state *reg) { - if (env->allow_ptr_leaks) + if (allow_ptr_leaks) return false; - switch (env->cur_state.regs[regno].type) { + switch (reg->type) { case UNKNOWN_VALUE: case CONST_IMM: return false; @@ -791,6 +793,11 @@ static bool is_pointer_value(struct bpf_verifier_env *env, int regno) } } +static bool is_pointer_value(struct bpf_verifier_env *env, int regno) +{ + return __is_pointer_value(env->allow_ptr_leaks, &env->cur_state.regs[regno]); +} + static int check_pkt_ptr_alignment(const struct bpf_reg_state *reg, int off, int size, bool strict) { @@ -1832,10 +1839,24 @@ static void adjust_reg_min_max_vals(struct bpf_verifier_env *env, dst_align = dst_reg->min_align; /* We don't know anything about what was done to this register, mark it - * as unknown. + * as unknown. Also, if both derived bounds came from signed/unsigned + * mixed compares and one side is unbounded, we cannot really do anything + * with them as boundaries cannot be trusted. Thus, arithmetic of two + * regs of such kind will get invalidated bounds on the dst side. */ - if (min_val == BPF_REGISTER_MIN_RANGE && - max_val == BPF_REGISTER_MAX_RANGE) { + if ((min_val == BPF_REGISTER_MIN_RANGE && + max_val == BPF_REGISTER_MAX_RANGE) || + (BPF_SRC(insn->code) == BPF_X && + ((min_val != BPF_REGISTER_MIN_RANGE && + max_val == BPF_REGISTER_MAX_RANGE) || + (min_val == BPF_REGISTER_MIN_RANGE && + max_val != BPF_REGISTER_MAX_RANGE) || + (dst_reg->min_value != BPF_REGISTER_MIN_RANGE && + dst_reg->max_value == BPF_REGISTER_MAX_RANGE) || + (dst_reg->min_value == BPF_REGISTER_MIN_RANGE && + dst_reg->max_value != BPF_REGISTER_MAX_RANGE)) && + regs[insn->dst_reg].value_from_signed != + regs[insn->src_reg].value_from_signed)) { reset_reg_range_values(regs, insn->dst_reg); return; } @@ -2023,6 +2044,7 @@ static int check_alu_op(struct bpf_verifier_env *env, struct bpf_insn *insn) regs[insn->dst_reg].max_value = insn->imm; regs[insn->dst_reg].min_value = insn->imm; regs[insn->dst_reg].min_align = calc_align(insn->imm); + regs[insn->dst_reg].value_from_signed = false; } } else if (opcode > BPF_END) { @@ -2198,40 +2220,63 @@ static void reg_set_min_max(struct bpf_reg_state *true_reg, struct bpf_reg_state *false_reg, u64 val, u8 opcode) { + bool value_from_signed = true; + bool is_range = true; + switch (opcode) { case BPF_JEQ: /* If this is false then we know nothing Jon Snow, but if it is * true then we know for sure. */ true_reg->max_value = true_reg->min_value = val; + is_range = false; break; case BPF_JNE: /* If this is true we know nothing Jon Snow, but if it is false * we know the value for sure; */ false_reg->max_value = false_reg->min_value = val; + is_range = false; break; case BPF_JGT: - /* Unsigned comparison, the minimum value is 0. */ - false_reg->min_value = 0; + value_from_signed = false; /* fallthrough */ case BPF_JSGT: + if (true_reg->value_from_signed != value_from_signed) + reset_reg_range_values(true_reg, 0); + if (false_reg->value_from_signed != value_from_signed) + reset_reg_range_values(false_reg, 0); + if (opcode == BPF_JGT) { + /* Unsigned comparison, the minimum value is 0. */ + false_reg->min_value = 0; + } /* If this is false then we know the maximum val is val, * otherwise we know the min val is val+1. */ false_reg->max_value = val; + false_reg->value_from_signed = value_from_signed; true_reg->min_value = val + 1; + true_reg->value_from_signed = value_from_signed; break; case BPF_JGE: - /* Unsigned comparison, the minimum value is 0. */ - false_reg->min_value = 0; + value_from_signed = false; /* fallthrough */ case BPF_JSGE: + if (true_reg->value_from_signed != value_from_signed) + reset_reg_range_values(true_reg, 0); + if (false_reg->value_from_signed != value_from_signed) + reset_reg_range_values(false_reg, 0); + if (opcode == BPF_JGE) { + /* Unsigned comparison, the minimum value is 0. */ + false_reg->min_value = 0; + } /* If this is false then we know the maximum value is val - 1, * otherwise we know the mimimum value is val. */ false_reg->max_value = val - 1; + false_reg->value_from_signed = value_from_signed; true_reg->min_value = val; + true_reg->value_from_signed = value_from_signed; break; default: break; @@ -2239,6 +2284,12 @@ static void reg_set_min_max(struct bpf_reg_state *true_reg, check_reg_overflow(false_reg); check_reg_overflow(true_reg); + if (is_range) { + if (__is_pointer_value(false, false_reg)) + reset_reg_range_values(false_reg, 0); + if (__is_pointer_value(false, true_reg)) + reset_reg_range_values(true_reg, 0); + } } /* Same as above, but for the case that dst_reg is a CONST_IMM reg and src_reg @@ -2248,41 +2299,64 @@ static void reg_set_min_max_inv(struct bpf_reg_state *true_reg, struct bpf_reg_state *false_reg, u64 val, u8 opcode) { + bool value_from_signed = true; + bool is_range = true; + switch (opcode) { case BPF_JEQ: /* If this is false then we know nothing Jon Snow, but if it is * true then we know for sure. */ true_reg->max_value = true_reg->min_value = val; + is_range = false; break; case BPF_JNE: /* If this is true we know nothing Jon Snow, but if it is false * we know the value for sure; */ false_reg->max_value = false_reg->min_value = val; + is_range = false; break; case BPF_JGT: - /* Unsigned comparison, the minimum value is 0. */ - true_reg->min_value = 0; + value_from_signed = false; /* fallthrough */ case BPF_JSGT: + if (true_reg->value_from_signed != value_from_signed) + reset_reg_range_values(true_reg, 0); + if (false_reg->value_from_signed != value_from_signed) + reset_reg_range_values(false_reg, 0); + if (opcode == BPF_JGT) { + /* Unsigned comparison, the minimum value is 0. */ + true_reg->min_value = 0; + } /* * If this is false, then the val is <= the register, if it is * true the register <= to the val. */ false_reg->min_value = val; + false_reg->value_from_signed = value_from_signed; true_reg->max_value = val - 1; + true_reg->value_from_signed = value_from_signed; break; case BPF_JGE: - /* Unsigned comparison, the minimum value is 0. */ - true_reg->min_value = 0; + value_from_signed = false; /* fallthrough */ case BPF_JSGE: + if (true_reg->value_from_signed != value_from_signed) + reset_reg_range_values(true_reg, 0); + if (false_reg->value_from_signed != value_from_signed) + reset_reg_range_values(false_reg, 0); + if (opcode == BPF_JGE) { + /* Unsigned comparison, the minimum value is 0. */ + true_reg->min_value = 0; + } /* If this is false then constant < register, if it is true then * the register < constant. */ false_reg->min_value = val + 1; + false_reg->value_from_signed = value_from_signed; true_reg->max_value = val; + true_reg->value_from_signed = value_from_signed; break; default: break; @@ -2290,6 +2364,12 @@ static void reg_set_min_max_inv(struct bpf_reg_state *true_reg, check_reg_overflow(false_reg); check_reg_overflow(true_reg); + if (is_range) { + if (__is_pointer_value(false, false_reg)) + reset_reg_range_values(false_reg, 0); + if (__is_pointer_value(false, true_reg)) + reset_reg_range_values(true_reg, 0); + } } static void mark_map_reg(struct bpf_reg_state *regs, u32 regno, u32 id, -- cgit v1.2.3-71-gd317 From bd8b2441742b49c76bec707757bd9c028ea9838e Mon Sep 17 00:00:00 2001 From: Trond Myklebust Date: Tue, 11 Jul 2017 17:54:34 -0400 Subject: NFS: Store the raw NFS access mask in the inode's access cache Signed-off-by: Trond Myklebust Signed-off-by: Anna Schumaker --- fs/nfs/dir.c | 9 ++++++--- include/linux/nfs_fs.h | 2 +- 2 files changed, 7 insertions(+), 4 deletions(-) (limited to 'include') diff --git a/fs/nfs/dir.c b/fs/nfs/dir.c index 24b3a6748062..8fae8b00b8f5 100644 --- a/fs/nfs/dir.c +++ b/fs/nfs/dir.c @@ -2399,7 +2399,7 @@ nfs_access_calc_mask(u32 access_result) void nfs_access_set_mask(struct nfs_access_entry *entry, u32 access_result) { - entry->mask = nfs_access_calc_mask(access_result); + entry->mask = access_result; } EXPORT_SYMBOL_GPL(nfs_access_set_mask); @@ -2407,6 +2407,7 @@ static int nfs_do_access(struct inode *inode, struct rpc_cred *cred, int mask) { struct nfs_access_entry cache; bool may_block = (mask & MAY_NOT_BLOCK) == 0; + int cache_mask; int status; trace_nfs_access_enter(inode); @@ -2422,7 +2423,8 @@ static int nfs_do_access(struct inode *inode, struct rpc_cred *cred, int mask) goto out; /* Be clever: ask server to check for all possible rights */ - cache.mask = MAY_EXEC | MAY_WRITE | MAY_READ; + cache.mask = NFS_MAY_LOOKUP | NFS_MAY_EXECUTE + | NFS_MAY_WRITE | NFS_MAY_READ; cache.cred = cred; cache.jiffies = jiffies; status = NFS_PROTO(inode)->access(inode, &cache); @@ -2436,7 +2438,8 @@ static int nfs_do_access(struct inode *inode, struct rpc_cred *cred, int mask) } nfs_access_add_cache(inode, &cache); out_cached: - if ((mask & ~cache.mask & (MAY_READ | MAY_WRITE | MAY_EXEC)) != 0) + cache_mask = nfs_access_calc_mask(cache.mask); + if ((mask & ~cache_mask & (MAY_READ | MAY_WRITE | MAY_EXEC)) != 0) status = -EACCES; out: trace_nfs_access_exit(inode, status); diff --git a/include/linux/nfs_fs.h b/include/linux/nfs_fs.h index e52cc55ac300..5cc91d6381a3 100644 --- a/include/linux/nfs_fs.h +++ b/include/linux/nfs_fs.h @@ -51,7 +51,7 @@ struct nfs_access_entry { struct list_head lru; unsigned long jiffies; struct rpc_cred * cred; - int mask; + __u32 mask; struct rcu_head rcu_head; }; -- cgit v1.2.3-71-gd317 From 96edd61dcf44362d3ef0bed1a5361e0ac7886a63 Mon Sep 17 00:00:00 2001 From: Juergen Gross Date: Mon, 10 Jul 2017 10:10:45 +0200 Subject: xen/balloon: don't online new memory initially When setting up the Xenstore watch for the memory target size the new watch will fire at once. Don't try to reach the configured target size by onlining new memory in this case, as the current memory size will be smaller in almost all cases due to e.g. BIOS reserved pages. Onlining new memory will lead to more problems e.g. undesired conflicts with NVMe devices meant to be operated as block devices. Instead remember the difference between target size and current size when the watch fires for the first time and apply it to any further size changes, too. In order to avoid races between balloon.c and xen-balloon.c init calls do the xen-balloon.c initialization from balloon.c. Signed-off-by: Juergen Gross Reviewed-by: Boris Ostrovsky Signed-off-by: Juergen Gross --- drivers/xen/balloon.c | 3 +++ drivers/xen/xen-balloon.c | 22 ++++++++++++---------- include/xen/balloon.h | 8 ++++++++ 3 files changed, 23 insertions(+), 10 deletions(-) (limited to 'include') diff --git a/drivers/xen/balloon.c b/drivers/xen/balloon.c index 50dcb68d8070..ab609255a0f3 100644 --- a/drivers/xen/balloon.c +++ b/drivers/xen/balloon.c @@ -780,6 +780,9 @@ static int __init balloon_init(void) } #endif + /* Init the xen-balloon driver. */ + xen_balloon_init(); + return 0; } subsys_initcall(balloon_init); diff --git a/drivers/xen/xen-balloon.c b/drivers/xen/xen-balloon.c index e7715cb62eef..e89136ab851e 100644 --- a/drivers/xen/xen-balloon.c +++ b/drivers/xen/xen-balloon.c @@ -59,6 +59,8 @@ static void watch_target(struct xenbus_watch *watch, { unsigned long long new_target; int err; + static bool watch_fired; + static long target_diff; err = xenbus_scanf(XBT_NIL, "memory", "target", "%llu", &new_target); if (err != 1) { @@ -69,7 +71,14 @@ static void watch_target(struct xenbus_watch *watch, /* The given memory/target value is in KiB, so it needs converting to * pages. PAGE_SHIFT converts bytes to pages, hence PAGE_SHIFT - 10. */ - balloon_set_new_target(new_target >> (PAGE_SHIFT - 10)); + new_target >>= PAGE_SHIFT - 10; + if (watch_fired) { + balloon_set_new_target(new_target - target_diff); + return; + } + + watch_fired = true; + target_diff = new_target - balloon_stats.target_pages; } static struct xenbus_watch target_watch = { .node = "memory/target", @@ -94,22 +103,15 @@ static struct notifier_block xenstore_notifier = { .notifier_call = balloon_init_watcher, }; -static int __init balloon_init(void) +void xen_balloon_init(void) { - if (!xen_domain()) - return -ENODEV; - - pr_info("Initialising balloon driver\n"); - register_balloon(&balloon_dev); register_xen_selfballooning(&balloon_dev); register_xenstore_notifier(&xenstore_notifier); - - return 0; } -subsys_initcall(balloon_init); +EXPORT_SYMBOL_GPL(xen_balloon_init); #define BALLOON_SHOW(name, format, args...) \ static ssize_t show_##name(struct device *dev, \ diff --git a/include/xen/balloon.h b/include/xen/balloon.h index d1767dfb0d95..8906361bb50c 100644 --- a/include/xen/balloon.h +++ b/include/xen/balloon.h @@ -35,3 +35,11 @@ static inline int register_xen_selfballooning(struct device *dev) return -ENOSYS; } #endif + +#ifdef CONFIG_XEN_BALLOON +void xen_balloon_init(void); +#else +static inline void xen_balloon_init(void) +{ +} +#endif -- cgit v1.2.3-71-gd317 From 832e4c83abc5ec25af77db6c8a0f36d78f1cf825 Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Thu, 11 May 2017 09:16:24 +0200 Subject: uuid: remove uuid_be Everything uses uuid_t now. Signed-off-by: Christoph Hellwig Reviewed-by: Amir Goldstein Reviewed-by: Andy Shevchenko --- include/linux/uuid.h | 14 -------------- 1 file changed, 14 deletions(-) (limited to 'include') diff --git a/include/linux/uuid.h b/include/linux/uuid.h index 2251e1925ea4..33b0bdbb613c 100644 --- a/include/linux/uuid.h +++ b/include/linux/uuid.h @@ -84,26 +84,12 @@ int guid_parse(const char *uuid, guid_t *u); int uuid_parse(const char *uuid, uuid_t *u); /* backwards compatibility, don't use in new code */ -typedef uuid_t uuid_be; -#define UUID_BE(a, _b, c, d0, d1, d2, d3, d4, d5, d6, d7) \ - UUID_INIT(a, _b, c, d0, d1, d2, d3, d4, d5, d6, d7) -#define NULL_UUID_BE \ - UUID_BE(0x00000000, 0x0000, 0x0000, 0x00, 0x00, 0x00, 0x00, \ - 0x00, 0x00, 0x00, 0x00) - #define uuid_le_gen(u) guid_gen(u) -#define uuid_be_gen(u) uuid_gen(u) #define uuid_le_to_bin(guid, u) guid_parse(guid, u) -#define uuid_be_to_bin(uuid, u) uuid_parse(uuid, u) static inline int uuid_le_cmp(const guid_t u1, const guid_t u2) { return memcmp(&u1, &u2, sizeof(guid_t)); } -static inline int uuid_be_cmp(const uuid_t u1, const uuid_t u2) -{ - return memcmp(&u1, &u2, sizeof(uuid_t)); -} - #endif -- cgit v1.2.3-71-gd317 From a0c2d9c1de0f6be33fe817069b06663277153c20 Mon Sep 17 00:00:00 2001 From: Ross Zwisler Date: Fri, 21 Jul 2017 16:51:23 -0600 Subject: ACPI: NUMA: add missing include in acpi_numa.h Right now if a file includes acpi_numa.h and they don't happen to include linux/numa.h before it, they get the following warning: ./include/acpi/acpi_numa.h:9:5: warning: "MAX_NUMNODES" is not defined [-Wundef] #if MAX_NUMNODES > 256 ^~~~~~~~~~~~ Signed-off-by: Ross Zwisler Signed-off-by: Rafael J. Wysocki --- include/acpi/acpi_numa.h | 1 + 1 file changed, 1 insertion(+) (limited to 'include') diff --git a/include/acpi/acpi_numa.h b/include/acpi/acpi_numa.h index d4b72944ccda..1e3a74f94131 100644 --- a/include/acpi/acpi_numa.h +++ b/include/acpi/acpi_numa.h @@ -3,6 +3,7 @@ #ifdef CONFIG_ACPI_NUMA #include +#include /* Proximity bitmap length */ #if MAX_NUMNODES > 256 -- cgit v1.2.3-71-gd317 From 6c423f5751b9f68bfe7c7545519d4c7159f93e1b Mon Sep 17 00:00:00 2001 From: Jonathan Corbet Date: Mon, 24 Jul 2017 13:58:00 -0600 Subject: sched/wait: Clean up some documentation warnings A couple of kerneldoc comments in had incorrect names for macro parameters, with this unsightly result: ./include/linux/wait.h:555: warning: No description found for parameter 'wq' ./include/linux/wait.h:555: warning: Excess function parameter 'wq_head' description in 'wait_event_interruptible_hrtimeout' ./include/linux/wait.h:759: warning: No description found for parameter 'wq_head' ./include/linux/wait.h:759: warning: Excess function parameter 'wq' description in 'wait_event_killable' Correct the comments and kill the warnings. Signed-off-by: Jonathan Corbet Cc: Linus Torvalds Cc: Peter Zijlstra Cc: Thomas Gleixner Cc: linux-doc@vger.kernel.org Link: http://lkml.kernel.org/r/20170724135800.769c4042@lwn.net Signed-off-by: Ingo Molnar --- include/linux/wait.h | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) (limited to 'include') diff --git a/include/linux/wait.h b/include/linux/wait.h index b289c96151ee..5b74e36c0ca8 100644 --- a/include/linux/wait.h +++ b/include/linux/wait.h @@ -529,13 +529,13 @@ do { \ /** * wait_event_interruptible_hrtimeout - sleep until a condition gets true or a timeout elapses - * @wq_head: the waitqueue to wait on + * @wq: the waitqueue to wait on * @condition: a C expression for the event to wait for * @timeout: timeout, as a ktime_t * * The process is put to sleep (TASK_INTERRUPTIBLE) until the * @condition evaluates to true or a signal is received. - * The @condition is checked each time the waitqueue @wq_head is woken up. + * The @condition is checked each time the waitqueue @wq is woken up. * * wake_up() has to be called after changing any variable that could * change the result of the wait condition. @@ -735,12 +735,12 @@ extern int do_wait_intr_irq(wait_queue_head_t *, wait_queue_entry_t *); /** * wait_event_killable - sleep until a condition gets true - * @wq: the waitqueue to wait on + * @wq_head: the waitqueue to wait on * @condition: a C expression for the event to wait for * * The process is put to sleep (TASK_KILLABLE) until the * @condition evaluates to true or a signal is received. - * The @condition is checked each time the waitqueue @wq is woken up. + * The @condition is checked each time the waitqueue @wq_head is woken up. * * wake_up() has to be called after changing any variable that could * change the result of the wait condition. -- cgit v1.2.3-71-gd317 From d9f89b4e9290e46cd9b273e9ad0bff0f93e86fae Mon Sep 17 00:00:00 2001 From: Andrew Jones Date: Sat, 1 Jul 2017 18:26:54 +0200 Subject: KVM: arm/arm64: PMU: Fix overflow interrupt injection kvm_pmu_overflow_set() is called from perf's interrupt handler, making the call of kvm_vgic_inject_irq() from it introduced with "KVM: arm/arm64: PMU: remove request-less vcpu kick" a really bad idea, as it's quite easy to try and retake a lock that the interrupted context is already holding. The fix is to use a vcpu kick, leaving the interrupt injection to kvm_pmu_sync_hwstate(), like it was doing before the refactoring. We don't just revert, though, because before the kick was request-less, leaving the vcpu exposed to the request-less vcpu kick race, and also because the kick was used unnecessarily from register access handlers. Reviewed-by: Christoffer Dall Signed-off-by: Andrew Jones Signed-off-by: Marc Zyngier --- arch/arm64/kvm/sys_regs.c | 2 +- include/kvm/arm_pmu.h | 2 -- virt/kvm/arm/pmu.c | 43 +++++++++++++++---------------------------- 3 files changed, 16 insertions(+), 31 deletions(-) (limited to 'include') diff --git a/arch/arm64/kvm/sys_regs.c b/arch/arm64/kvm/sys_regs.c index 77862881ae86..2e070d3baf9f 100644 --- a/arch/arm64/kvm/sys_regs.c +++ b/arch/arm64/kvm/sys_regs.c @@ -764,7 +764,7 @@ static bool access_pmovs(struct kvm_vcpu *vcpu, struct sys_reg_params *p, if (p->is_write) { if (r->CRm & 0x2) /* accessing PMOVSSET_EL0 */ - kvm_pmu_overflow_set(vcpu, p->regval & mask); + vcpu_sys_reg(vcpu, PMOVSSET_EL0) |= (p->regval & mask); else /* accessing PMOVSCLR_EL0 */ vcpu_sys_reg(vcpu, PMOVSSET_EL0) &= ~(p->regval & mask); diff --git a/include/kvm/arm_pmu.h b/include/kvm/arm_pmu.h index f6e030617467..f87fe20fcb05 100644 --- a/include/kvm/arm_pmu.h +++ b/include/kvm/arm_pmu.h @@ -48,7 +48,6 @@ void kvm_pmu_vcpu_reset(struct kvm_vcpu *vcpu); void kvm_pmu_vcpu_destroy(struct kvm_vcpu *vcpu); void kvm_pmu_disable_counter(struct kvm_vcpu *vcpu, u64 val); void kvm_pmu_enable_counter(struct kvm_vcpu *vcpu, u64 val); -void kvm_pmu_overflow_set(struct kvm_vcpu *vcpu, u64 val); void kvm_pmu_flush_hwstate(struct kvm_vcpu *vcpu); void kvm_pmu_sync_hwstate(struct kvm_vcpu *vcpu); bool kvm_pmu_should_notify_user(struct kvm_vcpu *vcpu); @@ -86,7 +85,6 @@ static inline void kvm_pmu_vcpu_reset(struct kvm_vcpu *vcpu) {} static inline void kvm_pmu_vcpu_destroy(struct kvm_vcpu *vcpu) {} static inline void kvm_pmu_disable_counter(struct kvm_vcpu *vcpu, u64 val) {} static inline void kvm_pmu_enable_counter(struct kvm_vcpu *vcpu, u64 val) {} -static inline void kvm_pmu_overflow_set(struct kvm_vcpu *vcpu, u64 val) {} static inline void kvm_pmu_flush_hwstate(struct kvm_vcpu *vcpu) {} static inline void kvm_pmu_sync_hwstate(struct kvm_vcpu *vcpu) {} static inline bool kvm_pmu_should_notify_user(struct kvm_vcpu *vcpu) diff --git a/virt/kvm/arm/pmu.c b/virt/kvm/arm/pmu.c index fc8a723ff387..8a9c42366db7 100644 --- a/virt/kvm/arm/pmu.c +++ b/virt/kvm/arm/pmu.c @@ -203,11 +203,15 @@ static u64 kvm_pmu_overflow_status(struct kvm_vcpu *vcpu) return reg; } -static void kvm_pmu_check_overflow(struct kvm_vcpu *vcpu) +static void kvm_pmu_update_state(struct kvm_vcpu *vcpu) { struct kvm_pmu *pmu = &vcpu->arch.pmu; - bool overflow = !!kvm_pmu_overflow_status(vcpu); + bool overflow; + + if (!kvm_arm_pmu_v3_ready(vcpu)) + return; + overflow = !!kvm_pmu_overflow_status(vcpu); if (pmu->irq_level == overflow) return; @@ -215,33 +219,11 @@ static void kvm_pmu_check_overflow(struct kvm_vcpu *vcpu) if (likely(irqchip_in_kernel(vcpu->kvm))) { int ret = kvm_vgic_inject_irq(vcpu->kvm, vcpu->vcpu_id, - pmu->irq_num, overflow, - &vcpu->arch.pmu); + pmu->irq_num, overflow, pmu); WARN_ON(ret); } } -/** - * kvm_pmu_overflow_set - set PMU overflow interrupt - * @vcpu: The vcpu pointer - * @val: the value guest writes to PMOVSSET register - */ -void kvm_pmu_overflow_set(struct kvm_vcpu *vcpu, u64 val) -{ - if (val == 0) - return; - - vcpu_sys_reg(vcpu, PMOVSSET_EL0) |= val; - kvm_pmu_check_overflow(vcpu); -} - -static void kvm_pmu_update_state(struct kvm_vcpu *vcpu) -{ - if (!kvm_arm_pmu_v3_ready(vcpu)) - return; - kvm_pmu_check_overflow(vcpu); -} - bool kvm_pmu_should_notify_user(struct kvm_vcpu *vcpu) { struct kvm_pmu *pmu = &vcpu->arch.pmu; @@ -303,7 +285,7 @@ static inline struct kvm_vcpu *kvm_pmc_to_vcpu(struct kvm_pmc *pmc) } /** - * When perf event overflows, call kvm_pmu_overflow_set to set overflow status. + * When the perf event overflows, set the overflow status and inform the vcpu. */ static void kvm_pmu_perf_overflow(struct perf_event *perf_event, struct perf_sample_data *data, @@ -313,7 +295,12 @@ static void kvm_pmu_perf_overflow(struct perf_event *perf_event, struct kvm_vcpu *vcpu = kvm_pmc_to_vcpu(pmc); int idx = pmc->idx; - kvm_pmu_overflow_set(vcpu, BIT(idx)); + vcpu_sys_reg(vcpu, PMOVSSET_EL0) |= BIT(idx); + + if (kvm_pmu_overflow_status(vcpu)) { + kvm_make_request(KVM_REQ_IRQ_PENDING, vcpu); + kvm_vcpu_kick(vcpu); + } } /** @@ -341,7 +328,7 @@ void kvm_pmu_software_increment(struct kvm_vcpu *vcpu, u64 val) reg = lower_32_bits(reg); vcpu_sys_reg(vcpu, PMEVCNTR0_EL0 + i) = reg; if (!reg) - kvm_pmu_overflow_set(vcpu, BIT(i)); + vcpu_sys_reg(vcpu, PMOVSSET_EL0) |= BIT(i); } } } -- cgit v1.2.3-71-gd317 From 2fd4167fadd1360ab015e4f0e88e51843e49556c Mon Sep 17 00:00:00 2001 From: Jon Derrick Date: Wed, 12 Jul 2017 10:58:19 -0600 Subject: nvme: fabrics commands should use the fctype field for data direction Fabrics commands with opcode 0x7F use the fctype field to indicate data direction. Signed-off-by: Jon Derrick Reviewed-by: Sagi Grimberg Signed-off-by: Christoph Hellwig Fixes: eb793e2c ("nvme.h: add NVMe over Fabrics definitions") --- include/linux/nvme.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'include') diff --git a/include/linux/nvme.h b/include/linux/nvme.h index bc74da018bdc..25d8225dbd04 100644 --- a/include/linux/nvme.h +++ b/include/linux/nvme.h @@ -1006,7 +1006,7 @@ static inline bool nvme_is_write(struct nvme_command *cmd) * Why can't we simply have a Fabrics In and Fabrics out command? */ if (unlikely(cmd->common.opcode == nvme_fabrics_command)) - return cmd->fabrics.opcode & 1; + return cmd->fabrics.fctype & 1; return cmd->common.opcode & 1; } -- cgit v1.2.3-71-gd317 From 9c5358e15ca12ed3dc3b1e51671dee5d155de8e0 Mon Sep 17 00:00:00 2001 From: James Smart Date: Mon, 17 Jul 2017 13:59:39 -0700 Subject: nvme-fc: revise TRADDR parsing The FC-NVME spec hasn't locked down on the format string for TRADDR. Currently the spec is lobbying for "nn-<16hexdigits>:pn-<16hexdigits>" where the wwn's are hex values but not prefixed by 0x. Most implementations so far expect a string format of "nn-0x<16hexdigits>:pn-0x<16hexdigits>" to be used. The transport uses the match_u64 parser which requires a leading 0x prefix to set the base properly. If it's not there, a match will either fail or return a base 10 value. The resolution in T11 is pushing out. Therefore, to fix things now and to cover any eventuality and any implementations already in the field, this patch adds support for both formats. The change consists of replacing the token matching routine with a routine that validates the fixed string format, and then builds a local copy of the hex name with a 0x prefix before calling the system parser. Note: the same parser routine exists in both the initiator and target transports. Given this is about the only "shared" item, we chose to replicate rather than create an interdendency on some shared code. Signed-off-by: James Smart Signed-off-by: Christoph Hellwig --- drivers/nvme/host/fc.c | 102 ++++++++++++++++++++++++----------------------- drivers/nvme/target/fc.c | 101 ++++++++++++++++++++++++---------------------- include/linux/nvme-fc.h | 19 +++++++++ 3 files changed, 125 insertions(+), 97 deletions(-) (limited to 'include') diff --git a/drivers/nvme/host/fc.c b/drivers/nvme/host/fc.c index 5630ca46c3b5..5c2a08ef08ba 100644 --- a/drivers/nvme/host/fc.c +++ b/drivers/nvme/host/fc.c @@ -2805,66 +2805,70 @@ out_fail: return ERR_PTR(ret); } -enum { - FCT_TRADDR_ERR = 0, - FCT_TRADDR_WWNN = 1 << 0, - FCT_TRADDR_WWPN = 1 << 1, -}; struct nvmet_fc_traddr { u64 nn; u64 pn; }; -static const match_table_t traddr_opt_tokens = { - { FCT_TRADDR_WWNN, "nn-%s" }, - { FCT_TRADDR_WWPN, "pn-%s" }, - { FCT_TRADDR_ERR, NULL } -}; - static int -nvme_fc_parse_address(struct nvmet_fc_traddr *traddr, char *buf) +__nvme_fc_parse_u64(substring_t *sstr, u64 *val) { - substring_t args[MAX_OPT_ARGS]; - char *options, *o, *p; - int token, ret = 0; u64 token64; - options = o = kstrdup(buf, GFP_KERNEL); - if (!options) - return -ENOMEM; + if (match_u64(sstr, &token64)) + return -EINVAL; + *val = token64; - while ((p = strsep(&o, ":\n")) != NULL) { - if (!*p) - continue; + return 0; +} - token = match_token(p, traddr_opt_tokens, args); - switch (token) { - case FCT_TRADDR_WWNN: - if (match_u64(args, &token64)) { - ret = -EINVAL; - goto out; - } - traddr->nn = token64; - break; - case FCT_TRADDR_WWPN: - if (match_u64(args, &token64)) { - ret = -EINVAL; - goto out; - } - traddr->pn = token64; - break; - default: - pr_warn("unknown traddr token or missing value '%s'\n", - p); - ret = -EINVAL; - goto out; - } - } +/* + * This routine validates and extracts the WWN's from the TRADDR string. + * As kernel parsers need the 0x to determine number base, universally + * build string to parse with 0x prefix before parsing name strings. + */ +static int +nvme_fc_parse_traddr(struct nvmet_fc_traddr *traddr, char *buf, size_t blen) +{ + char name[2 + NVME_FC_TRADDR_HEXNAMELEN + 1]; + substring_t wwn = { name, &name[sizeof(name)-1] }; + int nnoffset, pnoffset; + + /* validate it string one of the 2 allowed formats */ + if (strnlen(buf, blen) == NVME_FC_TRADDR_MAXLENGTH && + !strncmp(buf, "nn-0x", NVME_FC_TRADDR_OXNNLEN) && + !strncmp(&buf[NVME_FC_TRADDR_MAX_PN_OFFSET], + "pn-0x", NVME_FC_TRADDR_OXNNLEN)) { + nnoffset = NVME_FC_TRADDR_OXNNLEN; + pnoffset = NVME_FC_TRADDR_MAX_PN_OFFSET + + NVME_FC_TRADDR_OXNNLEN; + } else if ((strnlen(buf, blen) == NVME_FC_TRADDR_MINLENGTH && + !strncmp(buf, "nn-", NVME_FC_TRADDR_NNLEN) && + !strncmp(&buf[NVME_FC_TRADDR_MIN_PN_OFFSET], + "pn-", NVME_FC_TRADDR_NNLEN))) { + nnoffset = NVME_FC_TRADDR_NNLEN; + pnoffset = NVME_FC_TRADDR_MIN_PN_OFFSET + NVME_FC_TRADDR_NNLEN; + } else + goto out_einval; -out: - kfree(options); - return ret; + name[0] = '0'; + name[1] = 'x'; + name[2 + NVME_FC_TRADDR_HEXNAMELEN] = 0; + + memcpy(&name[2], &buf[nnoffset], NVME_FC_TRADDR_HEXNAMELEN); + if (__nvme_fc_parse_u64(&wwn, &traddr->nn)) + goto out_einval; + + memcpy(&name[2], &buf[pnoffset], NVME_FC_TRADDR_HEXNAMELEN); + if (__nvme_fc_parse_u64(&wwn, &traddr->pn)) + goto out_einval; + + return 0; + +out_einval: + pr_warn("%s: bad traddr string\n", __func__); + return -EINVAL; } static struct nvme_ctrl * @@ -2878,11 +2882,11 @@ nvme_fc_create_ctrl(struct device *dev, struct nvmf_ctrl_options *opts) unsigned long flags; int ret; - ret = nvme_fc_parse_address(&raddr, opts->traddr); + ret = nvme_fc_parse_traddr(&raddr, opts->traddr, NVMF_TRADDR_SIZE); if (ret || !raddr.nn || !raddr.pn) return ERR_PTR(-EINVAL); - ret = nvme_fc_parse_address(&laddr, opts->host_traddr); + ret = nvme_fc_parse_traddr(&laddr, opts->host_traddr, NVMF_TRADDR_SIZE); if (ret || !laddr.nn || !laddr.pn) return ERR_PTR(-EINVAL); diff --git a/drivers/nvme/target/fc.c b/drivers/nvme/target/fc.c index d5801c150b1c..31ca55dfcb1d 100644 --- a/drivers/nvme/target/fc.c +++ b/drivers/nvme/target/fc.c @@ -2293,66 +2293,70 @@ nvmet_fc_rcv_fcp_abort(struct nvmet_fc_target_port *target_port, } EXPORT_SYMBOL_GPL(nvmet_fc_rcv_fcp_abort); -enum { - FCT_TRADDR_ERR = 0, - FCT_TRADDR_WWNN = 1 << 0, - FCT_TRADDR_WWPN = 1 << 1, -}; struct nvmet_fc_traddr { u64 nn; u64 pn; }; -static const match_table_t traddr_opt_tokens = { - { FCT_TRADDR_WWNN, "nn-%s" }, - { FCT_TRADDR_WWPN, "pn-%s" }, - { FCT_TRADDR_ERR, NULL } -}; - static int -nvmet_fc_parse_traddr(struct nvmet_fc_traddr *traddr, char *buf) +__nvme_fc_parse_u64(substring_t *sstr, u64 *val) { - substring_t args[MAX_OPT_ARGS]; - char *options, *o, *p; - int token, ret = 0; u64 token64; - options = o = kstrdup(buf, GFP_KERNEL); - if (!options) - return -ENOMEM; + if (match_u64(sstr, &token64)) + return -EINVAL; + *val = token64; - while ((p = strsep(&o, ":\n")) != NULL) { - if (!*p) - continue; + return 0; +} - token = match_token(p, traddr_opt_tokens, args); - switch (token) { - case FCT_TRADDR_WWNN: - if (match_u64(args, &token64)) { - ret = -EINVAL; - goto out; - } - traddr->nn = token64; - break; - case FCT_TRADDR_WWPN: - if (match_u64(args, &token64)) { - ret = -EINVAL; - goto out; - } - traddr->pn = token64; - break; - default: - pr_warn("unknown traddr token or missing value '%s'\n", - p); - ret = -EINVAL; - goto out; - } - } +/* + * This routine validates and extracts the WWN's from the TRADDR string. + * As kernel parsers need the 0x to determine number base, universally + * build string to parse with 0x prefix before parsing name strings. + */ +static int +nvme_fc_parse_traddr(struct nvmet_fc_traddr *traddr, char *buf, size_t blen) +{ + char name[2 + NVME_FC_TRADDR_HEXNAMELEN + 1]; + substring_t wwn = { name, &name[sizeof(name)-1] }; + int nnoffset, pnoffset; + + /* validate it string one of the 2 allowed formats */ + if (strnlen(buf, blen) == NVME_FC_TRADDR_MAXLENGTH && + !strncmp(buf, "nn-0x", NVME_FC_TRADDR_OXNNLEN) && + !strncmp(&buf[NVME_FC_TRADDR_MAX_PN_OFFSET], + "pn-0x", NVME_FC_TRADDR_OXNNLEN)) { + nnoffset = NVME_FC_TRADDR_OXNNLEN; + pnoffset = NVME_FC_TRADDR_MAX_PN_OFFSET + + NVME_FC_TRADDR_OXNNLEN; + } else if ((strnlen(buf, blen) == NVME_FC_TRADDR_MINLENGTH && + !strncmp(buf, "nn-", NVME_FC_TRADDR_NNLEN) && + !strncmp(&buf[NVME_FC_TRADDR_MIN_PN_OFFSET], + "pn-", NVME_FC_TRADDR_NNLEN))) { + nnoffset = NVME_FC_TRADDR_NNLEN; + pnoffset = NVME_FC_TRADDR_MIN_PN_OFFSET + NVME_FC_TRADDR_NNLEN; + } else + goto out_einval; + + name[0] = '0'; + name[1] = 'x'; + name[2 + NVME_FC_TRADDR_HEXNAMELEN] = 0; + + memcpy(&name[2], &buf[nnoffset], NVME_FC_TRADDR_HEXNAMELEN); + if (__nvme_fc_parse_u64(&wwn, &traddr->nn)) + goto out_einval; + + memcpy(&name[2], &buf[pnoffset], NVME_FC_TRADDR_HEXNAMELEN); + if (__nvme_fc_parse_u64(&wwn, &traddr->pn)) + goto out_einval; -out: - kfree(options); - return ret; + return 0; + +out_einval: + pr_warn("%s: bad traddr string\n", __func__); + return -EINVAL; } static int @@ -2370,7 +2374,8 @@ nvmet_fc_add_port(struct nvmet_port *port) /* map the traddr address info to a target port */ - ret = nvmet_fc_parse_traddr(&traddr, port->disc_addr.traddr); + ret = nvme_fc_parse_traddr(&traddr, port->disc_addr.traddr, + sizeof(port->disc_addr.traddr)); if (ret) return ret; diff --git a/include/linux/nvme-fc.h b/include/linux/nvme-fc.h index 21c37e39e41a..36cca93a5ff2 100644 --- a/include/linux/nvme-fc.h +++ b/include/linux/nvme-fc.h @@ -334,5 +334,24 @@ struct fcnvme_ls_disconnect_acc { #define NVME_FC_LS_TIMEOUT_SEC 2 /* 2 seconds */ #define NVME_FC_TGTOP_TIMEOUT_SEC 2 /* 2 seconds */ +/* + * TRADDR string must be of form "nn-<16hexdigits>:pn-<16hexdigits>" + * the string is allowed to be specified with or without a "0x" prefix + * infront of the <16hexdigits>. Without is considered the "min" string + * and with is considered the "max" string. The hexdigits may be upper + * or lower case. + */ +#define NVME_FC_TRADDR_NNLEN 3 /* "?n-" */ +#define NVME_FC_TRADDR_OXNNLEN 5 /* "?n-0x" */ +#define NVME_FC_TRADDR_HEXNAMELEN 16 +#define NVME_FC_TRADDR_MINLENGTH \ + (2 * (NVME_FC_TRADDR_NNLEN + NVME_FC_TRADDR_HEXNAMELEN) + 1) +#define NVME_FC_TRADDR_MAXLENGTH \ + (2 * (NVME_FC_TRADDR_OXNNLEN + NVME_FC_TRADDR_HEXNAMELEN) + 1) +#define NVME_FC_TRADDR_MIN_PN_OFFSET \ + (NVME_FC_TRADDR_NNLEN + NVME_FC_TRADDR_HEXNAMELEN + 1) +#define NVME_FC_TRADDR_MAX_PN_OFFSET \ + (NVME_FC_TRADDR_OXNNLEN + NVME_FC_TRADDR_HEXNAMELEN + 1) + #endif /* _NVME_FC_H */ -- cgit v1.2.3-71-gd317 From dce4551cb2adb1ac9a30f8ab5299d614392b3cff Mon Sep 17 00:00:00 2001 From: Paolo Abeni Date: Tue, 25 Jul 2017 17:57:47 +0200 Subject: udp: preserve head state for IP_CMSG_PASSSEC Paul Moore reported a SELinux/IP_PASSSEC regression caused by missing skb->sp at recvmsg() time. We need to preserve the skb head state to process the IP_CMSG_PASSSEC cmsg. With this commit we avoid releasing the skb head state in the BH even if a secpath is attached to the current skb, and stores the skb status (with/without head states) in the scratch area, so that we can access it at skb deallocation time, without incurring in cache-miss penalties. This also avoids misusing the skb CB for ipv6 packets, as introduced by the commit 0ddf3fb2c43d ("udp: preserve skb->dst if required for IP options processing"). Clean a bit the scratch area helpers implementation, to reduce the code differences between 32 and 64 bits build. Reported-by: Paul Moore Fixes: 0a463c78d25b ("udp: avoid a cache miss on dequeue") Fixes: 0ddf3fb2c43d ("udp: preserve skb->dst if required for IP options processing") Signed-off-by: Paolo Abeni Tested-by: Paul Moore Signed-off-by: David S. Miller --- include/net/udp.h | 33 ++++++++++++++++++++++----------- net/ipv4/udp.c | 38 ++++++++++++++++++-------------------- 2 files changed, 40 insertions(+), 31 deletions(-) (limited to 'include') diff --git a/include/net/udp.h b/include/net/udp.h index 972ce4baab6b..56ce2d2a612d 100644 --- a/include/net/udp.h +++ b/include/net/udp.h @@ -305,33 +305,44 @@ struct sock *udp6_lib_lookup_skb(struct sk_buff *skb, /* UDP uses skb->dev_scratch to cache as much information as possible and avoid * possibly multiple cache miss on dequeue() */ -#if BITS_PER_LONG == 64 - -/* truesize, len and the bit needed to compute skb_csum_unnecessary will be on - * cold cache lines at recvmsg time. - * skb->len can be stored on 16 bits since the udp header has been already - * validated and pulled. - */ struct udp_dev_scratch { - u32 truesize; + /* skb->truesize and the stateless bit are embedded in a single field; + * do not use a bitfield since the compiler emits better/smaller code + * this way + */ + u32 _tsize_state; + +#if BITS_PER_LONG == 64 + /* len and the bit needed to compute skb_csum_unnecessary + * will be on cold cache lines at recvmsg time. + * skb->len can be stored on 16 bits since the udp header has been + * already validated and pulled. + */ u16 len; bool is_linear; bool csum_unnecessary; +#endif }; +static inline struct udp_dev_scratch *udp_skb_scratch(struct sk_buff *skb) +{ + return (struct udp_dev_scratch *)&skb->dev_scratch; +} + +#if BITS_PER_LONG == 64 static inline unsigned int udp_skb_len(struct sk_buff *skb) { - return ((struct udp_dev_scratch *)&skb->dev_scratch)->len; + return udp_skb_scratch(skb)->len; } static inline bool udp_skb_csum_unnecessary(struct sk_buff *skb) { - return ((struct udp_dev_scratch *)&skb->dev_scratch)->csum_unnecessary; + return udp_skb_scratch(skb)->csum_unnecessary; } static inline bool udp_skb_is_linear(struct sk_buff *skb) { - return ((struct udp_dev_scratch *)&skb->dev_scratch)->is_linear; + return udp_skb_scratch(skb)->is_linear; } #else diff --git a/net/ipv4/udp.c b/net/ipv4/udp.c index b057653ceca9..d243772f6efc 100644 --- a/net/ipv4/udp.c +++ b/net/ipv4/udp.c @@ -1163,34 +1163,32 @@ out: return ret; } -#if BITS_PER_LONG == 64 +#define UDP_SKB_IS_STATELESS 0x80000000 + static void udp_set_dev_scratch(struct sk_buff *skb) { - struct udp_dev_scratch *scratch; + struct udp_dev_scratch *scratch = udp_skb_scratch(skb); BUILD_BUG_ON(sizeof(struct udp_dev_scratch) > sizeof(long)); - scratch = (struct udp_dev_scratch *)&skb->dev_scratch; - scratch->truesize = skb->truesize; + scratch->_tsize_state = skb->truesize; +#if BITS_PER_LONG == 64 scratch->len = skb->len; scratch->csum_unnecessary = !!skb_csum_unnecessary(skb); scratch->is_linear = !skb_is_nonlinear(skb); +#endif + if (likely(!skb->_skb_refdst)) + scratch->_tsize_state |= UDP_SKB_IS_STATELESS; } static int udp_skb_truesize(struct sk_buff *skb) { - return ((struct udp_dev_scratch *)&skb->dev_scratch)->truesize; -} -#else -static void udp_set_dev_scratch(struct sk_buff *skb) -{ - skb->dev_scratch = skb->truesize; + return udp_skb_scratch(skb)->_tsize_state & ~UDP_SKB_IS_STATELESS; } -static int udp_skb_truesize(struct sk_buff *skb) +static bool udp_skb_has_head_state(struct sk_buff *skb) { - return skb->dev_scratch; + return !(udp_skb_scratch(skb)->_tsize_state & UDP_SKB_IS_STATELESS); } -#endif /* fully reclaim rmem/fwd memory allocated for skb */ static void udp_rmem_release(struct sock *sk, int size, int partial, @@ -1388,10 +1386,10 @@ void skb_consume_udp(struct sock *sk, struct sk_buff *skb, int len) unlock_sock_fast(sk, slow); } - /* we cleared the head states previously only if the skb lacks any IP - * options, see __udp_queue_rcv_skb(). + /* In the more common cases we cleared the head states previously, + * see __udp_queue_rcv_skb(). */ - if (unlikely(IPCB(skb)->opt.optlen > 0)) + if (unlikely(udp_skb_has_head_state(skb))) skb_release_head_state(skb); consume_stateless_skb(skb); } @@ -1784,11 +1782,11 @@ static int __udp_queue_rcv_skb(struct sock *sk, struct sk_buff *skb) sk_mark_napi_id_once(sk, skb); } - /* At recvmsg() time we need skb->dst to process IP options-related - * cmsg, elsewhere can we clear all pending head states while they are - * hot in the cache + /* At recvmsg() time we may access skb->dst or skb->sp depending on + * the IP options and the cmsg flags, elsewhere can we clear all + * pending head states while they are hot in the cache */ - if (likely(IPCB(skb)->opt.optlen == 0)) + if (likely(IPCB(skb)->opt.optlen == 0 && !skb->sp)) skb_release_head_state(skb); rc = __udp_enqueue_schedule_skb(sk, skb); -- cgit v1.2.3-71-gd317 From 0a94efb5acbb6980d7c9ab604372d93cd507e4d8 Mon Sep 17 00:00:00 2001 From: Tejun Heo Date: Sun, 23 Jul 2017 08:36:15 -0400 Subject: workqueue: implicit ordered attribute should be overridable 5c0338c68706 ("workqueue: restore WQ_UNBOUND/max_active==1 to be ordered") automatically enabled ordered attribute for unbound workqueues w/ max_active == 1. Because ordered workqueues reject max_active and some attribute changes, this implicit ordered mode broke cases where the user creates an unbound workqueue w/ max_active == 1 and later explicitly changes the related attributes. This patch distinguishes explicit and implicit ordered setting and overrides from attribute changes if implict. Signed-off-by: Tejun Heo Fixes: 5c0338c68706 ("workqueue: restore WQ_UNBOUND/max_active==1 to be ordered") --- include/linux/workqueue.h | 4 +++- kernel/workqueue.c | 13 +++++++++---- 2 files changed, 12 insertions(+), 5 deletions(-) (limited to 'include') diff --git a/include/linux/workqueue.h b/include/linux/workqueue.h index c102ef65cb64..db6dc9dc0482 100644 --- a/include/linux/workqueue.h +++ b/include/linux/workqueue.h @@ -323,6 +323,7 @@ enum { __WQ_DRAINING = 1 << 16, /* internal: workqueue is draining */ __WQ_ORDERED = 1 << 17, /* internal: workqueue is ordered */ + __WQ_ORDERED_EXPLICIT = 1 << 18, /* internal: alloc_ordered_workqueue() */ __WQ_LEGACY = 1 << 18, /* internal: create*_workqueue() */ WQ_MAX_ACTIVE = 512, /* I like 512, better ideas? */ @@ -422,7 +423,8 @@ __alloc_workqueue_key(const char *fmt, unsigned int flags, int max_active, * Pointer to the allocated workqueue on success, %NULL on failure. */ #define alloc_ordered_workqueue(fmt, flags, args...) \ - alloc_workqueue(fmt, WQ_UNBOUND | __WQ_ORDERED | (flags), 1, ##args) + alloc_workqueue(fmt, WQ_UNBOUND | __WQ_ORDERED | \ + __WQ_ORDERED_EXPLICIT | (flags), 1, ##args) #define create_workqueue(name) \ alloc_workqueue("%s", __WQ_LEGACY | WQ_MEM_RECLAIM, 1, (name)) diff --git a/kernel/workqueue.c b/kernel/workqueue.c index abe4a4971c24..7146ea70a62d 100644 --- a/kernel/workqueue.c +++ b/kernel/workqueue.c @@ -3744,8 +3744,12 @@ static int apply_workqueue_attrs_locked(struct workqueue_struct *wq, return -EINVAL; /* creating multiple pwqs breaks ordering guarantee */ - if (WARN_ON((wq->flags & __WQ_ORDERED) && !list_empty(&wq->pwqs))) - return -EINVAL; + if (!list_empty(&wq->pwqs)) { + if (WARN_ON(wq->flags & __WQ_ORDERED_EXPLICIT)) + return -EINVAL; + + wq->flags &= ~__WQ_ORDERED; + } ctx = apply_wqattrs_prepare(wq, attrs); if (!ctx) @@ -4129,13 +4133,14 @@ void workqueue_set_max_active(struct workqueue_struct *wq, int max_active) struct pool_workqueue *pwq; /* disallow meddling with max_active for ordered workqueues */ - if (WARN_ON(wq->flags & __WQ_ORDERED)) + if (WARN_ON(wq->flags & __WQ_ORDERED_EXPLICIT)) return; max_active = wq_clamp_max_active(max_active, wq->flags, wq->name); mutex_lock(&wq->mutex); + wq->flags &= ~__WQ_ORDERED; wq->saved_max_active = max_active; for_each_pwq(pwq, wq) @@ -5263,7 +5268,7 @@ int workqueue_sysfs_register(struct workqueue_struct *wq) * attributes breaks ordering guarantee. Disallow exposing ordered * workqueues. */ - if (WARN_ON(wq->flags & __WQ_ORDERED)) + if (WARN_ON(wq->flags & __WQ_ORDERED_EXPLICIT)) return -EINVAL; wq->wq_dev = wq_dev = kzalloc(sizeof(*wq_dev), GFP_KERNEL); -- cgit v1.2.3-71-gd317 From 2eaa38d9fcba5294182268b8d11770cf3fdc9bc9 Mon Sep 17 00:00:00 2001 From: Marc Gonzalez Date: Tue, 25 Jul 2017 11:08:15 +0200 Subject: net: phy: Remove trailing semicolon in macro definition Commit e5a03bfd873c2 ("phy: Add an mdio_device structure") introduced a spurious trailing semicolon. Remove it. Signed-off-by: Marc Gonzalez Signed-off-by: David S. Miller --- include/linux/phy.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'include') diff --git a/include/linux/phy.h b/include/linux/phy.h index 2a9567bb8186..0bb5b212ab42 100644 --- a/include/linux/phy.h +++ b/include/linux/phy.h @@ -830,7 +830,7 @@ static inline int phy_read_status(struct phy_device *phydev) dev_err(&_phydev->mdio.dev, format, ##args) #define phydev_dbg(_phydev, format, args...) \ - dev_dbg(&_phydev->mdio.dev, format, ##args); + dev_dbg(&_phydev->mdio.dev, format, ##args) static inline const char *phydev_name(const struct phy_device *phydev) { -- cgit v1.2.3-71-gd317 From fc1ff45a07abf240aa0c6586c11465c86c8cab8d Mon Sep 17 00:00:00 2001 From: Hans Verkuil Date: Sat, 15 Jul 2017 09:32:56 -0300 Subject: media: cec-notifier: small improvements Allow calling cec_notifier_set_phys_addr and cec_notifier_set_phys_addr_from_edid with a NULL notifier, in which case these functions do nothing. Add a cec_notifier_phys_addr_invalidate helper function (the notifier equivalent of cec_phys_addr_invalidate). These changes simplify drm CEC driver support. Signed-off-by: Hans Verkuil Signed-off-by: Mauro Carvalho Chehab --- drivers/media/cec/cec-notifier.c | 6 ++++++ include/media/cec-notifier.h | 15 +++++++++++++++ 2 files changed, 21 insertions(+) (limited to 'include') diff --git a/drivers/media/cec/cec-notifier.c b/drivers/media/cec/cec-notifier.c index 74dc1c32080e..08b619d0ea1e 100644 --- a/drivers/media/cec/cec-notifier.c +++ b/drivers/media/cec/cec-notifier.c @@ -87,6 +87,9 @@ EXPORT_SYMBOL_GPL(cec_notifier_put); void cec_notifier_set_phys_addr(struct cec_notifier *n, u16 pa) { + if (n == NULL) + return; + mutex_lock(&n->lock); n->phys_addr = pa; if (n->callback) @@ -100,6 +103,9 @@ void cec_notifier_set_phys_addr_from_edid(struct cec_notifier *n, { u16 pa = CEC_PHYS_ADDR_INVALID; + if (n == NULL) + return; + if (edid && edid->extensions) pa = cec_get_edid_phys_addr((const u8 *)edid, EDID_LENGTH * (edid->extensions + 1), NULL); diff --git a/include/media/cec-notifier.h b/include/media/cec-notifier.h index 298f996969df..a4f7429c4ae5 100644 --- a/include/media/cec-notifier.h +++ b/include/media/cec-notifier.h @@ -57,6 +57,7 @@ void cec_notifier_put(struct cec_notifier *n); * @pa: the CEC physical address * * Set a new CEC physical address. + * Does nothing if @n == NULL. */ void cec_notifier_set_phys_addr(struct cec_notifier *n, u16 pa); @@ -66,6 +67,7 @@ void cec_notifier_set_phys_addr(struct cec_notifier *n, u16 pa); * @edid: the struct edid pointer * * Parses the EDID to obtain the new CEC physical address and set it. + * Does nothing if @n == NULL. */ void cec_notifier_set_phys_addr_from_edid(struct cec_notifier *n, const struct edid *edid); @@ -118,4 +120,17 @@ static inline void cec_notifier_unregister(struct cec_notifier *n) #endif +/** + * cec_notifier_phys_addr_invalidate() - set the physical address to INVALID + * + * @n: the CEC notifier + * + * This is a simple helper function to invalidate the physical + * address. Does nothing if @n == NULL. + */ +static inline void cec_notifier_phys_addr_invalidate(struct cec_notifier *n) +{ + cec_notifier_set_phys_addr(n, CEC_PHYS_ADDR_INVALID); +} + #endif -- cgit v1.2.3-71-gd317 From b25db383928cecba356835583b16fa7008f97b3a Mon Sep 17 00:00:00 2001 From: Prabhakar Lad Date: Thu, 20 Jul 2017 04:56:31 -0400 Subject: media: platform: davinci: drop VPFE_CMD_S_CCDC_RAW_PARAMS drop VPFE_CMD_S_CCDC_RAW_PARAMS ioctl from dm355/dm644x following reasons: - This ioctl was never in public api and was only defined in kernel header. - The function set_params constantly mixes up pointers and phys_addr_t numbers. - This is part of a 'VPFE_CMD_S_CCDC_RAW_PARAMS' ioctl command that is described as an 'experimental ioctl that will change in future kernels'. - The code to allocate the table never gets called after we copy_from_user the user input over the kernel settings, and then compare them for inequality. - We then go on to use an address provided by user space as both the __user pointer for input and pass it through phys_to_virt to come up with a kernel pointer to copy the data to. This looks like a trivially exploitable root hole. Signed-off-by: Lad, Prabhakar Signed-off-by: Hans Verkuil Signed-off-by: Mauro Carvalho Chehab --- drivers/media/platform/davinci/ccdc_hw_device.h | 10 -- drivers/media/platform/davinci/dm355_ccdc.c | 92 +-------------- drivers/media/platform/davinci/dm644x_ccdc.c | 151 +----------------------- drivers/media/platform/davinci/vpfe_capture.c | 75 ------------ include/media/davinci/dm644x_ccdc.h | 12 -- include/media/davinci/vpfe_capture.h | 10 -- 6 files changed, 4 insertions(+), 346 deletions(-) (limited to 'include') diff --git a/drivers/media/platform/davinci/ccdc_hw_device.h b/drivers/media/platform/davinci/ccdc_hw_device.h index 8f6688a7a111..f1b521045d64 100644 --- a/drivers/media/platform/davinci/ccdc_hw_device.h +++ b/drivers/media/platform/davinci/ccdc_hw_device.h @@ -42,16 +42,6 @@ struct ccdc_hw_ops { int (*set_hw_if_params) (struct vpfe_hw_if_param *param); /* get interface parameters */ int (*get_hw_if_params) (struct vpfe_hw_if_param *param); - /* - * Pointer to function to set parameters. Used - * for implementing VPFE_S_CCDC_PARAMS - */ - int (*set_params) (void *params); - /* - * Pointer to function to get parameter. Used - * for implementing VPFE_G_CCDC_PARAMS - */ - int (*get_params) (void *params); /* Pointer to function to configure ccdc */ int (*configure) (void); diff --git a/drivers/media/platform/davinci/dm355_ccdc.c b/drivers/media/platform/davinci/dm355_ccdc.c index 73db166dc338..6d492dc4c3a9 100644 --- a/drivers/media/platform/davinci/dm355_ccdc.c +++ b/drivers/media/platform/davinci/dm355_ccdc.c @@ -17,12 +17,7 @@ * This module is for configuring DM355 CCD controller of VPFE to capture * Raw yuv or Bayer RGB data from a decoder. CCDC has several modules * such as Defect Pixel Correction, Color Space Conversion etc to - * pre-process the Bayer RGB data, before writing it to SDRAM. This - * module also allows application to configure individual - * module parameters through VPFE_CMD_S_CCDC_RAW_PARAMS IOCTL. - * To do so, application include dm355_ccdc.h and vpfe_capture.h header - * files. The setparams() API is called by vpfe_capture driver - * to configure module parameters + * pre-process the Bayer RGB data, before writing it to SDRAM. * * TODO: 1) Raw bayer parameter settings and bayer capture * 2) Split module parameter structure to module specific ioctl structs @@ -260,90 +255,6 @@ static void ccdc_setwin(struct v4l2_rect *image_win, dev_dbg(ccdc_cfg.dev, "\nEnd of ccdc_setwin..."); } -static int validate_ccdc_param(struct ccdc_config_params_raw *ccdcparam) -{ - if (ccdcparam->datasft < CCDC_DATA_NO_SHIFT || - ccdcparam->datasft > CCDC_DATA_SHIFT_6BIT) { - dev_dbg(ccdc_cfg.dev, "Invalid value of data shift\n"); - return -EINVAL; - } - - if (ccdcparam->mfilt1 < CCDC_NO_MEDIAN_FILTER1 || - ccdcparam->mfilt1 > CCDC_MEDIAN_FILTER1) { - dev_dbg(ccdc_cfg.dev, "Invalid value of median filter1\n"); - return -EINVAL; - } - - if (ccdcparam->mfilt2 < CCDC_NO_MEDIAN_FILTER2 || - ccdcparam->mfilt2 > CCDC_MEDIAN_FILTER2) { - dev_dbg(ccdc_cfg.dev, "Invalid value of median filter2\n"); - return -EINVAL; - } - - if ((ccdcparam->med_filt_thres < 0) || - (ccdcparam->med_filt_thres > CCDC_MED_FILT_THRESH)) { - dev_dbg(ccdc_cfg.dev, - "Invalid value of median filter threshold\n"); - return -EINVAL; - } - - if (ccdcparam->data_sz < CCDC_DATA_16BITS || - ccdcparam->data_sz > CCDC_DATA_8BITS) { - dev_dbg(ccdc_cfg.dev, "Invalid value of data size\n"); - return -EINVAL; - } - - if (ccdcparam->alaw.enable) { - if (ccdcparam->alaw.gamma_wd < CCDC_GAMMA_BITS_13_4 || - ccdcparam->alaw.gamma_wd > CCDC_GAMMA_BITS_09_0) { - dev_dbg(ccdc_cfg.dev, "Invalid value of ALAW\n"); - return -EINVAL; - } - } - - if (ccdcparam->blk_clamp.b_clamp_enable) { - if (ccdcparam->blk_clamp.sample_pixel < CCDC_SAMPLE_1PIXELS || - ccdcparam->blk_clamp.sample_pixel > CCDC_SAMPLE_16PIXELS) { - dev_dbg(ccdc_cfg.dev, - "Invalid value of sample pixel\n"); - return -EINVAL; - } - if (ccdcparam->blk_clamp.sample_ln < CCDC_SAMPLE_1LINES || - ccdcparam->blk_clamp.sample_ln > CCDC_SAMPLE_16LINES) { - dev_dbg(ccdc_cfg.dev, - "Invalid value of sample lines\n"); - return -EINVAL; - } - } - return 0; -} - -/* Parameter operations */ -static int ccdc_set_params(void __user *params) -{ - struct ccdc_config_params_raw ccdc_raw_params; - int x; - - /* only raw module parameters can be set through the IOCTL */ - if (ccdc_cfg.if_type != VPFE_RAW_BAYER) - return -EINVAL; - - x = copy_from_user(&ccdc_raw_params, params, sizeof(ccdc_raw_params)); - if (x) { - dev_dbg(ccdc_cfg.dev, "ccdc_set_params: error in copying ccdcparams, %d\n", - x); - return -EFAULT; - } - - if (!validate_ccdc_param(&ccdc_raw_params)) { - memcpy(&ccdc_cfg.bayer.config_params, - &ccdc_raw_params, - sizeof(ccdc_raw_params)); - return 0; - } - return -EINVAL; -} - /* This function will configure CCDC for YCbCr video capture */ static void ccdc_config_ycbcr(void) { @@ -939,7 +850,6 @@ static struct ccdc_hw_device ccdc_hw_dev = { .enable = ccdc_enable, .enable_out_to_sdram = ccdc_enable_output_to_sdram, .set_hw_if_params = ccdc_set_hw_if_params, - .set_params = ccdc_set_params, .configure = ccdc_configure, .set_buftype = ccdc_set_buftype, .get_buftype = ccdc_get_buftype, diff --git a/drivers/media/platform/davinci/dm644x_ccdc.c b/drivers/media/platform/davinci/dm644x_ccdc.c index 740fbc7a8c14..3b2d8a9317b8 100644 --- a/drivers/media/platform/davinci/dm644x_ccdc.c +++ b/drivers/media/platform/davinci/dm644x_ccdc.c @@ -17,13 +17,9 @@ * This module is for configuring CCD controller of DM6446 VPFE to capture * Raw yuv or Bayer RGB data from a decoder. CCDC has several modules * such as Defect Pixel Correction, Color Space Conversion etc to - * pre-process the Raw Bayer RGB data, before writing it to SDRAM. This - * module also allows application to configure individual - * module parameters through VPFE_CMD_S_CCDC_RAW_PARAMS IOCTL. - * To do so, application includes dm644x_ccdc.h and vpfe_capture.h header - * files. The setparams() API is called by vpfe_capture driver - * to configure module parameters. This file is named DM644x so that other - * variants such DM6443 may be supported using the same module. + * pre-process the Raw Bayer RGB data, before writing it to SDRAM. + * This file is named DM644x so that other variants such DM6443 + * may be supported using the same module. * * TODO: Test Raw bayer parameter settings and bayer capture * Split module parameter structure to module specific ioctl structs @@ -216,96 +212,8 @@ static void ccdc_readregs(void) dev_notice(ccdc_cfg.dev, "\nReading 0x%x to VERT_LINES...\n", val); } -static int validate_ccdc_param(struct ccdc_config_params_raw *ccdcparam) -{ - if (ccdcparam->alaw.enable) { - u8 max_gamma = ccdc_gamma_width_max_bit(ccdcparam->alaw.gamma_wd); - u8 max_data = ccdc_data_size_max_bit(ccdcparam->data_sz); - - if ((ccdcparam->alaw.gamma_wd > CCDC_GAMMA_BITS_09_0) || - (ccdcparam->alaw.gamma_wd < CCDC_GAMMA_BITS_15_6) || - (max_gamma > max_data)) { - dev_dbg(ccdc_cfg.dev, "\nInvalid data line select"); - return -1; - } - } - return 0; -} - -static int ccdc_update_raw_params(struct ccdc_config_params_raw *raw_params) -{ - struct ccdc_config_params_raw *config_params = - &ccdc_cfg.bayer.config_params; - unsigned int *fpc_virtaddr = NULL; - unsigned int *fpc_physaddr = NULL; - - memcpy(config_params, raw_params, sizeof(*raw_params)); - /* - * allocate memory for fault pixel table and copy the user - * values to the table - */ - if (!config_params->fault_pxl.enable) - return 0; - - fpc_physaddr = (unsigned int *)config_params->fault_pxl.fpc_table_addr; - fpc_virtaddr = (unsigned int *)phys_to_virt( - (unsigned long)fpc_physaddr); - /* - * Allocate memory for FPC table if current - * FPC table buffer is not big enough to - * accommodate FPC Number requested - */ - if (raw_params->fault_pxl.fp_num != config_params->fault_pxl.fp_num) { - if (fpc_physaddr != NULL) { - free_pages((unsigned long)fpc_virtaddr, - get_order - (config_params->fault_pxl.fp_num * - FP_NUM_BYTES)); - } - - /* Allocate memory for FPC table */ - fpc_virtaddr = - (unsigned int *)__get_free_pages(GFP_KERNEL | GFP_DMA, - get_order(raw_params-> - fault_pxl.fp_num * - FP_NUM_BYTES)); - - if (fpc_virtaddr == NULL) { - dev_dbg(ccdc_cfg.dev, - "\nUnable to allocate memory for FPC"); - return -EFAULT; - } - fpc_physaddr = - (unsigned int *)virt_to_phys((void *)fpc_virtaddr); - } - - /* Copy number of fault pixels and FPC table */ - config_params->fault_pxl.fp_num = raw_params->fault_pxl.fp_num; - if (copy_from_user(fpc_virtaddr, - (void __user *)raw_params->fault_pxl.fpc_table_addr, - config_params->fault_pxl.fp_num * FP_NUM_BYTES)) { - dev_dbg(ccdc_cfg.dev, "\n copy_from_user failed"); - return -EFAULT; - } - config_params->fault_pxl.fpc_table_addr = (unsigned long)fpc_physaddr; - return 0; -} - static int ccdc_close(struct device *dev) { - struct ccdc_config_params_raw *config_params = - &ccdc_cfg.bayer.config_params; - unsigned int *fpc_physaddr = NULL, *fpc_virtaddr = NULL; - - fpc_physaddr = (unsigned int *)config_params->fault_pxl.fpc_table_addr; - - if (fpc_physaddr != NULL) { - fpc_virtaddr = (unsigned int *) - phys_to_virt((unsigned long)fpc_physaddr); - free_pages((unsigned long)fpc_virtaddr, - get_order(config_params->fault_pxl.fp_num * - FP_NUM_BYTES)); - } return 0; } @@ -339,29 +247,6 @@ static void ccdc_sbl_reset(void) vpss_clear_wbl_overflow(VPSS_PCR_CCDC_WBL_O); } -/* Parameter operations */ -static int ccdc_set_params(void __user *params) -{ - struct ccdc_config_params_raw ccdc_raw_params; - int x; - - if (ccdc_cfg.if_type != VPFE_RAW_BAYER) - return -EINVAL; - - x = copy_from_user(&ccdc_raw_params, params, sizeof(ccdc_raw_params)); - if (x) { - dev_dbg(ccdc_cfg.dev, "ccdc_set_params: error in copyingccdc params, %d\n", - x); - return -EFAULT; - } - - if (!validate_ccdc_param(&ccdc_raw_params)) { - if (!ccdc_update_raw_params(&ccdc_raw_params)) - return 0; - } - return -EINVAL; -} - /* * ccdc_config_ycbcr() * This function will configure CCDC for YCbCr video capture @@ -489,32 +374,6 @@ static void ccdc_config_black_compense(struct ccdc_black_compensation *bcomp) regw(val, CCDC_BLKCMP); } -static void ccdc_config_fpc(struct ccdc_fault_pixel *fpc) -{ - u32 val; - - /* Initially disable FPC */ - val = CCDC_FPC_DISABLE; - regw(val, CCDC_FPC); - - if (!fpc->enable) - return; - - /* Configure Fault pixel if needed */ - regw(fpc->fpc_table_addr, CCDC_FPC_ADDR); - dev_dbg(ccdc_cfg.dev, "\nWriting 0x%lx to FPC_ADDR...\n", - (fpc->fpc_table_addr)); - /* Write the FPC params with FPC disable */ - val = fpc->fp_num & CCDC_FPC_FPC_NUM_MASK; - regw(val, CCDC_FPC); - - dev_dbg(ccdc_cfg.dev, "\nWriting 0x%x to FPC...\n", val); - /* read the FPC register */ - val = regr(CCDC_FPC) | CCDC_FPC_ENABLE; - regw(val, CCDC_FPC); - dev_dbg(ccdc_cfg.dev, "\nWriting 0x%x to FPC...\n", val); -} - /* * ccdc_config_raw() * This function will configure CCDC for Raw capture mode @@ -569,9 +428,6 @@ static void ccdc_config_raw(void) /* Configure Black level compensation */ ccdc_config_black_compense(&config_params->blk_comp); - /* Configure Fault Pixel Correction */ - ccdc_config_fpc(&config_params->fault_pxl); - /* If data size is 8 bit then pack the data */ if ((config_params->data_sz == CCDC_DATA_8BITS) || config_params->alaw.enable) @@ -929,7 +785,6 @@ static struct ccdc_hw_device ccdc_hw_dev = { .reset = ccdc_sbl_reset, .enable = ccdc_enable, .set_hw_if_params = ccdc_set_hw_if_params, - .set_params = ccdc_set_params, .configure = ccdc_configure, .set_buftype = ccdc_set_buftype, .get_buftype = ccdc_get_buftype, diff --git a/drivers/media/platform/davinci/vpfe_capture.c b/drivers/media/platform/davinci/vpfe_capture.c index 1831bf5ccca5..b1bf4a7e8eb7 100644 --- a/drivers/media/platform/davinci/vpfe_capture.c +++ b/drivers/media/platform/davinci/vpfe_capture.c @@ -280,45 +280,6 @@ void vpfe_unregister_ccdc_device(struct ccdc_hw_device *dev) } EXPORT_SYMBOL(vpfe_unregister_ccdc_device); -/* - * vpfe_get_ccdc_image_format - Get image parameters based on CCDC settings - */ -static int vpfe_get_ccdc_image_format(struct vpfe_device *vpfe_dev, - struct v4l2_format *f) -{ - struct v4l2_rect image_win; - enum ccdc_buftype buf_type; - enum ccdc_frmfmt frm_fmt; - - memset(f, 0, sizeof(*f)); - f->type = V4L2_BUF_TYPE_VIDEO_OUTPUT; - ccdc_dev->hw_ops.get_image_window(&image_win); - f->fmt.pix.width = image_win.width; - f->fmt.pix.height = image_win.height; - f->fmt.pix.bytesperline = ccdc_dev->hw_ops.get_line_length(); - f->fmt.pix.sizeimage = f->fmt.pix.bytesperline * - f->fmt.pix.height; - buf_type = ccdc_dev->hw_ops.get_buftype(); - f->fmt.pix.pixelformat = ccdc_dev->hw_ops.get_pixel_format(); - frm_fmt = ccdc_dev->hw_ops.get_frame_format(); - if (frm_fmt == CCDC_FRMFMT_PROGRESSIVE) - f->fmt.pix.field = V4L2_FIELD_NONE; - else if (frm_fmt == CCDC_FRMFMT_INTERLACED) { - if (buf_type == CCDC_BUFTYPE_FLD_INTERLEAVED) - f->fmt.pix.field = V4L2_FIELD_INTERLACED; - else if (buf_type == CCDC_BUFTYPE_FLD_SEPARATED) - f->fmt.pix.field = V4L2_FIELD_SEQ_TB; - else { - v4l2_err(&vpfe_dev->v4l2_dev, "Invalid buf_type\n"); - return -EINVAL; - } - } else { - v4l2_err(&vpfe_dev->v4l2_dev, "Invalid frm_fmt\n"); - return -EINVAL; - } - return 0; -} - /* * vpfe_config_ccdc_image_format() * For a pix format, configure ccdc to setup the capture @@ -1697,41 +1658,6 @@ unlock_out: return ret; } - -static long vpfe_param_handler(struct file *file, void *priv, - bool valid_prio, unsigned int cmd, void *param) -{ - struct vpfe_device *vpfe_dev = video_drvdata(file); - int ret; - - v4l2_dbg(2, debug, &vpfe_dev->v4l2_dev, "vpfe_param_handler\n"); - - if (vpfe_dev->started) { - /* only allowed if streaming is not started */ - v4l2_dbg(1, debug, &vpfe_dev->v4l2_dev, - "device already started\n"); - return -EBUSY; - } - - ret = mutex_lock_interruptible(&vpfe_dev->lock); - if (ret) - return ret; - - switch (cmd) { - case VPFE_CMD_S_CCDC_RAW_PARAMS: - ret = -EINVAL; - v4l2_warn(&vpfe_dev->v4l2_dev, - "VPFE_CMD_S_CCDC_RAW_PARAMS not supported\n"); - break; - default: - ret = -ENOTTY; - } -unlock_out: - mutex_unlock(&vpfe_dev->lock); - return ret; -} - - /* vpfe capture ioctl operations */ static const struct v4l2_ioctl_ops vpfe_ioctl_ops = { .vidioc_querycap = vpfe_querycap, @@ -1754,7 +1680,6 @@ static const struct v4l2_ioctl_ops vpfe_ioctl_ops = { .vidioc_cropcap = vpfe_cropcap, .vidioc_g_selection = vpfe_g_selection, .vidioc_s_selection = vpfe_s_selection, - .vidioc_default = vpfe_param_handler, }; static struct vpfe_device *vpfe_initialize(void) diff --git a/include/media/davinci/dm644x_ccdc.h b/include/media/davinci/dm644x_ccdc.h index 7c909da29d43..6ea2ce241851 100644 --- a/include/media/davinci/dm644x_ccdc.h +++ b/include/media/davinci/dm644x_ccdc.h @@ -103,16 +103,6 @@ struct ccdc_black_compensation { char gb; }; -/* structure for fault pixel correction */ -struct ccdc_fault_pixel { - /* Enable or Disable fault pixel correction */ - unsigned char enable; - /* Number of fault pixel */ - unsigned short fp_num; - /* Address of fault pixel table */ - unsigned long fpc_table_addr; -}; - /* Structure for CCDC configuration parameters for raw capture mode passed * by application */ @@ -125,8 +115,6 @@ struct ccdc_config_params_raw { struct ccdc_black_clamp blk_clamp; /* Structure for Black Compensation */ struct ccdc_black_compensation blk_comp; - /* Structure for Fault Pixel Module Configuration */ - struct ccdc_fault_pixel fault_pxl; }; diff --git a/include/media/davinci/vpfe_capture.h b/include/media/davinci/vpfe_capture.h index 8e1a4d88daa0..f003533602d0 100644 --- a/include/media/davinci/vpfe_capture.h +++ b/include/media/davinci/vpfe_capture.h @@ -183,14 +183,4 @@ struct vpfe_config_params { }; #endif /* End of __KERNEL__ */ -/** - * VPFE_CMD_S_CCDC_RAW_PARAMS - EXPERIMENTAL IOCTL to set raw capture params - * This can be used to configure modules such as defect pixel correction, - * color space conversion, culling etc. This is an experimental ioctl that - * will change in future kernels. So use this ioctl with care ! - * TODO: This is to be split into multiple ioctls and also explore the - * possibility of extending the v4l2 api to include this - **/ -#define VPFE_CMD_S_CCDC_RAW_PARAMS _IOW('V', BASE_VIDIOC_PRIVATE + 1, \ - void *) #endif /* _DAVINCI_VPFE_H */ -- cgit v1.2.3-71-gd317 From fdeaf7e3eb37c6dbc4b4ac97dbe1945d239eb788 Mon Sep 17 00:00:00 2001 From: Claudio Imbrenda Date: Mon, 24 Jul 2017 13:40:03 +0200 Subject: KVM: make pid available for uevents without debugfs Simplify and improve the code so that the PID is always available in the uevent even when debugfs is not available. This adds a userspace_pid field to struct kvm, as per Radim's suggestion, so that the PID can be retrieved on destruction too. Acked-by: Janosch Frank Fixes: 286de8f6ac9202 ("KVM: trigger uevents when creating or destroying a VM") Signed-off-by: Claudio Imbrenda Signed-off-by: Paolo Bonzini --- include/linux/kvm_host.h | 1 + virt/kvm/kvm_main.c | 35 ++++++++++++----------------------- 2 files changed, 13 insertions(+), 23 deletions(-) (limited to 'include') diff --git a/include/linux/kvm_host.h b/include/linux/kvm_host.h index 648b34cabb38..890b706d1943 100644 --- a/include/linux/kvm_host.h +++ b/include/linux/kvm_host.h @@ -445,6 +445,7 @@ struct kvm { struct kvm_stat_data **debugfs_stat_data; struct srcu_struct srcu; struct srcu_struct irq_srcu; + pid_t userspace_pid; }; #define kvm_err(fmt, ...) \ diff --git a/virt/kvm/kvm_main.c b/virt/kvm/kvm_main.c index 82987d457b8b..f3f74271f1a9 100644 --- a/virt/kvm/kvm_main.c +++ b/virt/kvm/kvm_main.c @@ -3883,7 +3883,6 @@ static const struct file_operations *stat_fops[] = { static void kvm_uevent_notify_change(unsigned int type, struct kvm *kvm) { struct kobj_uevent_env *env; - char *tmp, *pathbuf = NULL; unsigned long long created, active; if (!kvm_dev.this_device || !kvm) @@ -3907,38 +3906,28 @@ static void kvm_uevent_notify_change(unsigned int type, struct kvm *kvm) add_uevent_var(env, "CREATED=%llu", created); add_uevent_var(env, "COUNT=%llu", active); - if (type == KVM_EVENT_CREATE_VM) + if (type == KVM_EVENT_CREATE_VM) { add_uevent_var(env, "EVENT=create"); - else if (type == KVM_EVENT_DESTROY_VM) + kvm->userspace_pid = task_pid_nr(current); + } else if (type == KVM_EVENT_DESTROY_VM) { add_uevent_var(env, "EVENT=destroy"); + } + add_uevent_var(env, "PID=%d", kvm->userspace_pid); if (kvm->debugfs_dentry) { - char p[ITOA_MAX_LEN]; - - snprintf(p, sizeof(p), "%s", kvm->debugfs_dentry->d_name.name); - tmp = strchrnul(p + 1, '-'); - *tmp = '\0'; - add_uevent_var(env, "PID=%s", p); - pathbuf = kmalloc(PATH_MAX, GFP_KERNEL); - if (pathbuf) { - /* sizeof counts the final '\0' */ - int len = sizeof("STATS_PATH=") - 1; - const char *pvar = "STATS_PATH="; - - tmp = dentry_path_raw(kvm->debugfs_dentry, - pathbuf + len, - PATH_MAX - len); - if (!IS_ERR(tmp)) { - memcpy(tmp - len, pvar, len); - env->envp[env->envp_idx++] = tmp - len; - } + char *tmp, *p = kmalloc(PATH_MAX, GFP_KERNEL); + + if (p) { + tmp = dentry_path_raw(kvm->debugfs_dentry, p, PATH_MAX); + if (!IS_ERR(tmp)) + add_uevent_var(env, "STATS_PATH=%s", tmp); + kfree(p); } } /* no need for checks, since we are adding at most only 5 keys */ env->envp[env->envp_idx++] = NULL; kobject_uevent_env(&kvm_dev.this_device->kobj, KOBJ_CHANGE, env->envp); kfree(env); - kfree(pathbuf); } static int kvm_init_debug(void) -- cgit v1.2.3-71-gd317 From bb67b496c338e15813f075f482067da930f52e39 Mon Sep 17 00:00:00 2001 From: Murilo Opsfelder Araujo Date: Tue, 18 Jul 2017 14:22:20 -0300 Subject: include/linux/vfio.h: Guard powerpc-specific functions with CONFIG_VFIO_SPAPR_EEH When CONFIG_EEH=y and CONFIG_VFIO_SPAPR_EEH=n, build fails with the following: drivers/vfio/pci/vfio_pci.o: In function `.vfio_pci_release': vfio_pci.c:(.text+0xa98): undefined reference to `.vfio_spapr_pci_eeh_release' drivers/vfio/pci/vfio_pci.o: In function `.vfio_pci_open': vfio_pci.c:(.text+0x1420): undefined reference to `.vfio_spapr_pci_eeh_open' In this case, vfio_pci.c should use the empty definitions of vfio_spapr_pci_eeh_open and vfio_spapr_pci_eeh_release functions. This patch fixes it by guarding these function definitions with CONFIG_VFIO_SPAPR_EEH, the symbol that controls whether vfio_spapr_eeh.c is built, which is where the non-empty versions of these functions are. We need to make use of IS_ENABLED() macro because CONFIG_VFIO_SPAPR_EEH is a tristate option. This issue was found during a randconfig build. Logs are here: http://kisskb.ellerman.id.au/kisskb/buildresult/12982362/ Signed-off-by: Murilo Opsfelder Araujo Reviewed-by: Alexey Kardashevskiy Reviewed-by: David Gibson Signed-off-by: Alex Williamson --- include/linux/vfio.h | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) (limited to 'include') diff --git a/include/linux/vfio.h b/include/linux/vfio.h index 586809abb273..a47b985341d1 100644 --- a/include/linux/vfio.h +++ b/include/linux/vfio.h @@ -152,7 +152,7 @@ extern int vfio_set_irqs_validate_and_prepare(struct vfio_irq_set *hdr, size_t *data_size); struct pci_dev; -#ifdef CONFIG_EEH +#if IS_ENABLED(CONFIG_VFIO_SPAPR_EEH) extern void vfio_spapr_pci_eeh_open(struct pci_dev *pdev); extern void vfio_spapr_pci_eeh_release(struct pci_dev *pdev); extern long vfio_spapr_iommu_eeh_ioctl(struct iommu_group *group, @@ -173,7 +173,7 @@ static inline long vfio_spapr_iommu_eeh_ioctl(struct iommu_group *group, { return -ENOTTY; } -#endif /* CONFIG_EEH */ +#endif /* CONFIG_VFIO_SPAPR_EEH */ /* * IRQfd - generic -- cgit v1.2.3-71-gd317 From 273752c9ff03eb83856601b2a3458218bb949e46 Mon Sep 17 00:00:00 2001 From: Vivek Goyal Date: Wed, 26 Jul 2017 09:35:09 -0400 Subject: dm, dax: Make sure dm_dax_flush() is called if device supports it Currently dm_dax_flush() is not being called, even if underlying dax device supports write cache, because DAXDEV_WRITE_CACHE is not being propagated up to the DM dax device. If the underlying dax device supports write cache, set DAXDEV_WRITE_CACHE on the DM dax device. This will cause dm_dax_flush() to be called. Fixes: abebfbe2f7 ("dm: add ->flush() dax operation support") Signed-off-by: Vivek Goyal Acked-by: Dan Williams Signed-off-by: Mike Snitzer --- drivers/dax/super.c | 6 ++++++ drivers/md/dm-table.c | 35 +++++++++++++++++++++++++++++++++++ include/linux/dax.h | 1 + 3 files changed, 42 insertions(+) (limited to 'include') diff --git a/drivers/dax/super.c b/drivers/dax/super.c index ce9e563e6e1d..938eb4868f7f 100644 --- a/drivers/dax/super.c +++ b/drivers/dax/super.c @@ -278,6 +278,12 @@ void dax_write_cache(struct dax_device *dax_dev, bool wc) } EXPORT_SYMBOL_GPL(dax_write_cache); +bool dax_write_cache_enabled(struct dax_device *dax_dev) +{ + return test_bit(DAXDEV_WRITE_CACHE, &dax_dev->flags); +} +EXPORT_SYMBOL_GPL(dax_write_cache_enabled); + bool dax_alive(struct dax_device *dax_dev) { lockdep_assert_held(&dax_srcu); diff --git a/drivers/md/dm-table.c b/drivers/md/dm-table.c index a39bcd9b982a..28a4071cdf85 100644 --- a/drivers/md/dm-table.c +++ b/drivers/md/dm-table.c @@ -20,6 +20,7 @@ #include #include #include +#include #define DM_MSG_PREFIX "table" @@ -1630,6 +1631,37 @@ static bool dm_table_supports_flush(struct dm_table *t, unsigned long flush) return false; } +static int device_dax_write_cache_enabled(struct dm_target *ti, + struct dm_dev *dev, sector_t start, + sector_t len, void *data) +{ + struct dax_device *dax_dev = dev->dax_dev; + + if (!dax_dev) + return false; + + if (dax_write_cache_enabled(dax_dev)) + return true; + return false; +} + +static int dm_table_supports_dax_write_cache(struct dm_table *t) +{ + struct dm_target *ti; + unsigned i; + + for (i = 0; i < dm_table_get_num_targets(t); i++) { + ti = dm_table_get_target(t, i); + + if (ti->type->iterate_devices && + ti->type->iterate_devices(ti, + device_dax_write_cache_enabled, NULL)) + return true; + } + + return false; +} + static int device_is_nonrot(struct dm_target *ti, struct dm_dev *dev, sector_t start, sector_t len, void *data) { @@ -1785,6 +1817,9 @@ void dm_table_set_restrictions(struct dm_table *t, struct request_queue *q, } blk_queue_write_cache(q, wc, fua); + if (dm_table_supports_dax_write_cache(t)) + dax_write_cache(t->md->dax_dev, true); + /* Ensure that all underlying devices are non-rotational. */ if (dm_table_all_devices_attribute(t, device_is_nonrot)) queue_flag_set_unlocked(QUEUE_FLAG_NONROT, q); diff --git a/include/linux/dax.h b/include/linux/dax.h index 794811875732..df97b7af7e2c 100644 --- a/include/linux/dax.h +++ b/include/linux/dax.h @@ -87,6 +87,7 @@ size_t dax_copy_from_iter(struct dax_device *dax_dev, pgoff_t pgoff, void *addr, void dax_flush(struct dax_device *dax_dev, pgoff_t pgoff, void *addr, size_t size); void dax_write_cache(struct dax_device *dax_dev, bool wc); +bool dax_write_cache_enabled(struct dax_device *dax_dev); /* * We use lowest available bit in exceptional entry for locking, one bit for -- cgit v1.2.3-71-gd317 From 1937f8a29f4a650bc27e0311b43b53509a34fd22 Mon Sep 17 00:00:00 2001 From: Thomas Gleixner Date: Mon, 24 Jul 2017 12:52:59 +0200 Subject: scsi: bnx2fc: Simplify CPU hotplug code The CPU hotplug related code of this driver can be simplified by: 1) Consolidating the callbacks into a single state. The CPU thread can be torn down on the CPU which goes offline. There is no point in delaying that to the CPU dead state 2) Let the core code invoke the online/offline callbacks and remove the extra for_each_online_cpu() loops. Signed-off-by: Thomas Gleixner Signed-off-by: Martin K. Petersen --- drivers/scsi/bnx2fc/bnx2fc_fcoe.c | 69 +++++++++------------------------------ include/linux/cpuhotplug.h | 1 - 2 files changed, 15 insertions(+), 55 deletions(-) (limited to 'include') diff --git a/drivers/scsi/bnx2fc/bnx2fc_fcoe.c b/drivers/scsi/bnx2fc/bnx2fc_fcoe.c index c4ebf8c5d884..6844ba361616 100644 --- a/drivers/scsi/bnx2fc/bnx2fc_fcoe.c +++ b/drivers/scsi/bnx2fc/bnx2fc_fcoe.c @@ -2624,12 +2624,11 @@ static struct fcoe_transport bnx2fc_transport = { }; /** - * bnx2fc_percpu_thread_create - Create a receive thread for an - * online CPU + * bnx2fc_cpu_online - Create a receive thread for an online CPU * * @cpu: cpu index for the online cpu */ -static void bnx2fc_percpu_thread_create(unsigned int cpu) +static int bnx2fc_cpu_online(unsigned int cpu) { struct bnx2fc_percpu_s *p; struct task_struct *thread; @@ -2639,15 +2638,17 @@ static void bnx2fc_percpu_thread_create(unsigned int cpu) thread = kthread_create_on_node(bnx2fc_percpu_io_thread, (void *)p, cpu_to_node(cpu), "bnx2fc_thread/%d", cpu); + if (IS_ERR(thread)) + return PTR_ERR(thread); + /* bind thread to the cpu */ - if (likely(!IS_ERR(thread))) { - kthread_bind(thread, cpu); - p->iothread = thread; - wake_up_process(thread); - } + kthread_bind(thread, cpu); + p->iothread = thread; + wake_up_process(thread); + return 0; } -static void bnx2fc_percpu_thread_destroy(unsigned int cpu) +static int bnx2fc_cpu_offline(unsigned int cpu) { struct bnx2fc_percpu_s *p; struct task_struct *thread; @@ -2661,7 +2662,6 @@ static void bnx2fc_percpu_thread_destroy(unsigned int cpu) thread = p->iothread; p->iothread = NULL; - /* Free all work in the list */ list_for_each_entry_safe(work, tmp, &p->work_list, list) { list_del_init(&work->list); @@ -2673,20 +2673,6 @@ static void bnx2fc_percpu_thread_destroy(unsigned int cpu) if (thread) kthread_stop(thread); -} - - -static int bnx2fc_cpu_online(unsigned int cpu) -{ - printk(PFX "CPU %x online: Create Rx thread\n", cpu); - bnx2fc_percpu_thread_create(cpu); - return 0; -} - -static int bnx2fc_cpu_dead(unsigned int cpu) -{ - printk(PFX "CPU %x offline: Remove Rx thread\n", cpu); - bnx2fc_percpu_thread_destroy(cpu); return 0; } @@ -2761,31 +2747,16 @@ static int __init bnx2fc_mod_init(void) spin_lock_init(&p->fp_work_lock); } - get_online_cpus(); - - for_each_online_cpu(cpu) - bnx2fc_percpu_thread_create(cpu); - - rc = cpuhp_setup_state_nocalls_cpuslocked(CPUHP_AP_ONLINE_DYN, - "scsi/bnx2fc:online", - bnx2fc_cpu_online, NULL); + rc = cpuhp_setup_state(CPUHP_AP_ONLINE_DYN, "scsi/bnx2fc:online", + bnx2fc_cpu_online, bnx2fc_cpu_offline); if (rc < 0) - goto stop_threads; + goto stop_thread; bnx2fc_online_state = rc; - cpuhp_setup_state_nocalls_cpuslocked(CPUHP_SCSI_BNX2FC_DEAD, - "scsi/bnx2fc:dead", - NULL, bnx2fc_cpu_dead); - put_online_cpus(); - cnic_register_driver(CNIC_ULP_FCOE, &bnx2fc_cnic_cb); - return 0; -stop_threads: - for_each_online_cpu(cpu) - bnx2fc_percpu_thread_destroy(cpu); - put_online_cpus(); +stop_thread: kthread_stop(l2_thread); free_wq: destroy_workqueue(bnx2fc_wq); @@ -2804,7 +2775,6 @@ static void __exit bnx2fc_mod_exit(void) struct fcoe_percpu_s *bg; struct task_struct *l2_thread; struct sk_buff *skb; - unsigned int cpu = 0; /* * NOTE: Since cnic calls register_driver routine rtnl_lock, @@ -2845,16 +2815,7 @@ static void __exit bnx2fc_mod_exit(void) if (l2_thread) kthread_stop(l2_thread); - get_online_cpus(); - /* Destroy per cpu threads */ - for_each_online_cpu(cpu) { - bnx2fc_percpu_thread_destroy(cpu); - } - - cpuhp_remove_state_nocalls_cpuslocked(bnx2fc_online_state); - cpuhp_remove_state_nocalls_cpuslocked(CPUHP_SCSI_BNX2FC_DEAD); - - put_online_cpus(); + cpuhp_remove_state(bnx2fc_online_state); destroy_workqueue(bnx2fc_wq); /* diff --git a/include/linux/cpuhotplug.h b/include/linux/cpuhotplug.h index b56573bf440d..2e7b1731ad12 100644 --- a/include/linux/cpuhotplug.h +++ b/include/linux/cpuhotplug.h @@ -39,7 +39,6 @@ enum cpuhp_state { CPUHP_PCI_XGENE_DEAD, CPUHP_IOMMU_INTEL_DEAD, CPUHP_LUSTRE_CFS_DEAD, - CPUHP_SCSI_BNX2FC_DEAD, CPUHP_SCSI_BNX2I_DEAD, CPUHP_WORKQUEUE_PREP, CPUHP_POWER_NUMA_PREPARE, -- cgit v1.2.3-71-gd317 From f9f22a86912f9d36b50e9b3b383fabfb9f22dd46 Mon Sep 17 00:00:00 2001 From: Thomas Gleixner Date: Mon, 24 Jul 2017 12:53:00 +0200 Subject: scsi: bnx2i: Simplify cpu hotplug code The CPU hotplug related code of this driver can be simplified by: 1) Consolidating the callbacks into a single state. The CPU thread can be torn down on the CPU which goes offline. There is no point in delaying that to the CPU dead state 2) Let the core code invoke the online/offline callbacks and remove the extra for_each_online_cpu() loops. Signed-off-by: Thomas Gleixner Acked-by: Chad Dupuis Signed-off-by: Martin K. Petersen --- drivers/scsi/bnx2i/bnx2i_init.c | 65 ++++++++++------------------------------- include/linux/cpuhotplug.h | 1 - 2 files changed, 15 insertions(+), 51 deletions(-) (limited to 'include') diff --git a/drivers/scsi/bnx2i/bnx2i_init.c b/drivers/scsi/bnx2i/bnx2i_init.c index 7487b653e799..4ebcda8d9500 100644 --- a/drivers/scsi/bnx2i/bnx2i_init.c +++ b/drivers/scsi/bnx2i/bnx2i_init.c @@ -404,12 +404,11 @@ int bnx2i_get_stats(void *handle) /** - * bnx2i_percpu_thread_create - Create a receive thread for an - * online CPU + * bnx2i_cpu_online - Create a receive thread for an online CPU * * @cpu: cpu index for the online cpu */ -static void bnx2i_percpu_thread_create(unsigned int cpu) +static int bnx2i_cpu_online(unsigned int cpu) { struct bnx2i_percpu_s *p; struct task_struct *thread; @@ -419,16 +418,17 @@ static void bnx2i_percpu_thread_create(unsigned int cpu) thread = kthread_create_on_node(bnx2i_percpu_io_thread, (void *)p, cpu_to_node(cpu), "bnx2i_thread/%d", cpu); + if (IS_ERR(thread)) + return PTR_ERR(thread); + /* bind thread to the cpu */ - if (likely(!IS_ERR(thread))) { - kthread_bind(thread, cpu); - p->iothread = thread; - wake_up_process(thread); - } + kthread_bind(thread, cpu); + p->iothread = thread; + wake_up_process(thread); + return 0; } - -static void bnx2i_percpu_thread_destroy(unsigned int cpu) +static int bnx2i_cpu_offline(unsigned int cpu) { struct bnx2i_percpu_s *p; struct task_struct *thread; @@ -451,19 +451,6 @@ static void bnx2i_percpu_thread_destroy(unsigned int cpu) spin_unlock_bh(&p->p_work_lock); if (thread) kthread_stop(thread); -} - -static int bnx2i_cpu_online(unsigned int cpu) -{ - pr_info("bnx2i: CPU %x online: Create Rx thread\n", cpu); - bnx2i_percpu_thread_create(cpu); - return 0; -} - -static int bnx2i_cpu_dead(unsigned int cpu) -{ - pr_info("CPU %x offline: Remove Rx thread\n", cpu); - bnx2i_percpu_thread_destroy(cpu); return 0; } @@ -511,28 +498,14 @@ static int __init bnx2i_mod_init(void) p->iothread = NULL; } - get_online_cpus(); - - for_each_online_cpu(cpu) - bnx2i_percpu_thread_create(cpu); - - err = cpuhp_setup_state_nocalls_cpuslocked(CPUHP_AP_ONLINE_DYN, - "scsi/bnx2i:online", - bnx2i_cpu_online, NULL); + err = cpuhp_setup_state(CPUHP_AP_ONLINE_DYN, "scsi/bnx2i:online", + bnx2i_cpu_online, bnx2i_cpu_offline); if (err < 0) - goto remove_threads; + goto unreg_driver; bnx2i_online_state = err; - - cpuhp_setup_state_nocalls_cpuslocked(CPUHP_SCSI_BNX2I_DEAD, - "scsi/bnx2i:dead", - NULL, bnx2i_cpu_dead); - put_online_cpus(); return 0; -remove_threads: - for_each_online_cpu(cpu) - bnx2i_percpu_thread_destroy(cpu); - put_online_cpus(); +unreg_driver: cnic_unregister_driver(CNIC_ULP_ISCSI); unreg_xport: iscsi_unregister_transport(&bnx2i_iscsi_transport); @@ -552,7 +525,6 @@ out: static void __exit bnx2i_mod_exit(void) { struct bnx2i_hba *hba; - unsigned cpu = 0; mutex_lock(&bnx2i_dev_lock); while (!list_empty(&adapter_list)) { @@ -570,14 +542,7 @@ static void __exit bnx2i_mod_exit(void) } mutex_unlock(&bnx2i_dev_lock); - get_online_cpus(); - - for_each_online_cpu(cpu) - bnx2i_percpu_thread_destroy(cpu); - - cpuhp_remove_state_nocalls_cpuslocked(bnx2i_online_state); - cpuhp_remove_state_nocalls_cpuslocked(CPUHP_SCSI_BNX2I_DEAD); - put_online_cpus(); + cpuhp_remove_state(bnx2i_online_state); iscsi_unregister_transport(&bnx2i_iscsi_transport); cnic_unregister_driver(CNIC_ULP_ISCSI); diff --git a/include/linux/cpuhotplug.h b/include/linux/cpuhotplug.h index 2e7b1731ad12..82b30e638430 100644 --- a/include/linux/cpuhotplug.h +++ b/include/linux/cpuhotplug.h @@ -39,7 +39,6 @@ enum cpuhp_state { CPUHP_PCI_XGENE_DEAD, CPUHP_IOMMU_INTEL_DEAD, CPUHP_LUSTRE_CFS_DEAD, - CPUHP_SCSI_BNX2I_DEAD, CPUHP_WORKQUEUE_PREP, CPUHP_POWER_NUMA_PREPARE, CPUHP_HRTIMERS_PREPARE, -- cgit v1.2.3-71-gd317 From 6b84202c946cd3da3a8daa92c682510e9ed80321 Mon Sep 17 00:00:00 2001 From: Xin Long Date: Wed, 26 Jul 2017 16:24:59 +0800 Subject: sctp: fix the check for _sctp_walk_params and _sctp_walk_errors Commit b1f5bfc27a19 ("sctp: don't dereference ptr before leaving _sctp_walk_{params, errors}()") tried to fix the issue that it may overstep the chunk end for _sctp_walk_{params, errors} with 'chunk_end > offset(length) + sizeof(length)'. But it introduced a side effect: When processing INIT, it verifies the chunks with 'param.v == chunk_end' after iterating all params by sctp_walk_params(). With the check 'chunk_end > offset(length) + sizeof(length)', it would return when the last param is not yet accessed. Because the last param usually is fwdtsn supported param whose size is 4 and 'chunk_end == offset(length) + sizeof(length)' This is a badly issue even causing sctp couldn't process 4-shakes. Client would always get abort when connecting to server, due to the failure of INIT chunk verification on server. The patch is to use 'chunk_end <= offset(length) + sizeof(length)' instead of 'chunk_end < offset(length) + sizeof(length)' for both _sctp_walk_params and _sctp_walk_errors. Fixes: b1f5bfc27a19 ("sctp: don't dereference ptr before leaving _sctp_walk_{params, errors}()") Signed-off-by: Xin Long Acked-by: Neil Horman Signed-off-by: David S. Miller --- include/net/sctp/sctp.h | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) (limited to 'include') diff --git a/include/net/sctp/sctp.h b/include/net/sctp/sctp.h index 980807d7506f..45fd4c6056b5 100644 --- a/include/net/sctp/sctp.h +++ b/include/net/sctp/sctp.h @@ -469,7 +469,7 @@ _sctp_walk_params((pos), (chunk), ntohs((chunk)->chunk_hdr.length), member) #define _sctp_walk_params(pos, chunk, end, member)\ for (pos.v = chunk->member;\ - (pos.v + offsetof(struct sctp_paramhdr, length) + sizeof(pos.p->length) <\ + (pos.v + offsetof(struct sctp_paramhdr, length) + sizeof(pos.p->length) <=\ (void *)chunk + end) &&\ pos.v <= (void *)chunk + end - ntohs(pos.p->length) &&\ ntohs(pos.p->length) >= sizeof(struct sctp_paramhdr);\ @@ -481,7 +481,7 @@ _sctp_walk_errors((err), (chunk_hdr), ntohs((chunk_hdr)->length)) #define _sctp_walk_errors(err, chunk_hdr, end)\ for (err = (sctp_errhdr_t *)((void *)chunk_hdr + \ sizeof(struct sctp_chunkhdr));\ - ((void *)err + offsetof(sctp_errhdr_t, length) + sizeof(err->length) <\ + ((void *)err + offsetof(sctp_errhdr_t, length) + sizeof(err->length) <=\ (void *)chunk_hdr + end) &&\ (void *)err <= (void *)chunk_hdr + end - ntohs(err->length) &&\ ntohs(err->length) >= sizeof(sctp_errhdr_t); \ -- cgit v1.2.3-71-gd317 From a3287c41ff405025bc57b165a0f6cd698bbbc1be Mon Sep 17 00:00:00 2001 From: Will Deacon Date: Tue, 25 Jul 2017 16:30:34 +0100 Subject: drivers/perf: arm_pmu: Request PMU SPIs with IRQF_PER_CPU Since the PMU register interface is banked per CPU, CPU PMU interrrupts cannot be handled by a CPU other than the one with the PMU asserting the interrupt. This means that migrating PMU SPIs, as we do during a CPU hotplug operation doesn't make any sense and can lead to the IRQ being disabled entirely if we route a spurious IRQ to the new affinity target. This has been observed in practice on AMD Seattle, where CPUs on the non-boot cluster appear to take a spurious PMU IRQ when coming online, which is routed to CPU0 where it cannot be handled. This patch passes IRQF_PERCPU for PMU SPIs and forcefully sets their affinity prior to requesting them, ensuring that they cannot be migrated during hotplug events. This interacts badly with the DB8500 erratum workaround that ping-pongs the interrupt affinity from the handler, so we avoid passing IRQF_PERCPU in that case by allowing the IRQ flags to be overridden in the platdata. Fixes: 3cf7ee98b848 ("drivers/perf: arm_pmu: move irq request/free into probe") Cc: Mark Rutland Cc: Linus Walleij Signed-off-by: Will Deacon --- arch/arm/mach-ux500/cpu-db8500.c | 1 + drivers/perf/arm_pmu.c | 41 ++++++++++++++++++++++++++-------------- include/linux/perf/arm_pmu.h | 4 ++++ 3 files changed, 32 insertions(+), 14 deletions(-) (limited to 'include') diff --git a/arch/arm/mach-ux500/cpu-db8500.c b/arch/arm/mach-ux500/cpu-db8500.c index 28083ef72819..71a34e8c345a 100644 --- a/arch/arm/mach-ux500/cpu-db8500.c +++ b/arch/arm/mach-ux500/cpu-db8500.c @@ -133,6 +133,7 @@ static irqreturn_t db8500_pmu_handler(int irq, void *dev, irq_handler_t handler) static struct arm_pmu_platdata db8500_pmu_platdata = { .handle_irq = db8500_pmu_handler, + .irq_flags = IRQF_NOBALANCING | IRQF_NO_THREAD, }; static struct of_dev_auxdata u8500_auxdata_lookup[] __initdata = { diff --git a/drivers/perf/arm_pmu.c b/drivers/perf/arm_pmu.c index dc459eb1246b..1c5e0f333779 100644 --- a/drivers/perf/arm_pmu.c +++ b/drivers/perf/arm_pmu.c @@ -569,22 +569,41 @@ int armpmu_request_irq(struct arm_pmu *armpmu, int cpu) if (irq != other_irq) { pr_warn("mismatched PPIs detected.\n"); err = -EINVAL; + goto err_out; } } else { - err = request_irq(irq, handler, - IRQF_NOBALANCING | IRQF_NO_THREAD, "arm-pmu", + struct arm_pmu_platdata *platdata = armpmu_get_platdata(armpmu); + unsigned long irq_flags; + + err = irq_force_affinity(irq, cpumask_of(cpu)); + + if (err && num_possible_cpus() > 1) { + pr_warn("unable to set irq affinity (irq=%d, cpu=%u)\n", + irq, cpu); + goto err_out; + } + + if (platdata && platdata->irq_flags) { + irq_flags = platdata->irq_flags; + } else { + irq_flags = IRQF_PERCPU | + IRQF_NOBALANCING | + IRQF_NO_THREAD; + } + + err = request_irq(irq, handler, irq_flags, "arm-pmu", per_cpu_ptr(&hw_events->percpu_pmu, cpu)); } - if (err) { - pr_err("unable to request IRQ%d for ARM PMU counters\n", - irq); - return err; - } + if (err) + goto err_out; cpumask_set_cpu(cpu, &armpmu->active_irqs); - return 0; + +err_out: + pr_err("unable to request IRQ%d for ARM PMU counters\n", irq); + return err; } int armpmu_request_irqs(struct arm_pmu *armpmu) @@ -628,12 +647,6 @@ static int arm_perf_starting_cpu(unsigned int cpu, struct hlist_node *node) enable_percpu_irq(irq, IRQ_TYPE_NONE); return 0; } - - if (irq_force_affinity(irq, cpumask_of(cpu)) && - num_possible_cpus() > 1) { - pr_warn("unable to set irq affinity (irq=%d, cpu=%u)\n", - irq, cpu); - } } return 0; diff --git a/include/linux/perf/arm_pmu.h b/include/linux/perf/arm_pmu.h index 1360dd6d5e61..af0f44effd44 100644 --- a/include/linux/perf/arm_pmu.h +++ b/include/linux/perf/arm_pmu.h @@ -24,10 +24,14 @@ * interrupt and passed the address of the low level handler, * and can be used to implement any platform specific handling * before or after calling it. + * + * @irq_flags: if non-zero, these flags will be passed to request_irq + * when requesting interrupts for this PMU device. */ struct arm_pmu_platdata { irqreturn_t (*handle_irq)(int irq, void *dev, irq_handler_t pmu_handler); + unsigned long irq_flags; }; #ifdef CONFIG_ARM_PMU -- cgit v1.2.3-71-gd317 From 8397913303abc9333f376a518a8368fa22ca5e6e Mon Sep 17 00:00:00 2001 From: Thomas Gleixner Date: Thu, 27 Jul 2017 12:21:11 +0200 Subject: genirq/cpuhotplug: Revert "Set force affinity flag on hotplug migration" That commit was part of the changes moving x86 to the generic CPU hotplug interrupt migration code. The force flag was required on x86 before the hierarchical irqdomain rework, but invoking set_affinity() with force=true stayed and had no side effects. At some point in the past, the force flag got repurposed to support the exynos timer interrupt affinity setting to a not yet online CPU, so the interrupt controller callback does not verify the supplied affinity mask against cpu_online_mask. Setting the flag in the CPU hotplug code causes the cpu online masking to be blocked on these irq controllers and results in potentially affining an interrupt to the CPU which is unplugged, i.e. instead of moving it away, it's just reassigned to it. As the force flags is not longer needed on x86, it's safe to revert that patch so the ARM irqchips which use the force flag work again. Add comments to that effect, so this won't happen again. Note: The online mask handling should be done in the generic code and the force flag and the masking in the irq chips removed all together, but that's not a change possible for 4.13. Fixes: 77f85e66aa8b ("genirq/cpuhotplug: Set force affinity flag on hotplug migration") Reported-by: Will Deacon Signed-off-by: Thomas Gleixner Acked-by: Will Deacon Cc: Marc Zyngier Cc: Russell King Cc: LAK Link: http://lkml.kernel.org/r/alpine.DEB.2.20.1707271217590.3109@nanos Signed-off-by: Thomas Gleixner --- include/linux/irq.h | 7 ++++++- kernel/irq/cpuhotplug.c | 9 +++++++-- 2 files changed, 13 insertions(+), 3 deletions(-) (limited to 'include') diff --git a/include/linux/irq.h b/include/linux/irq.h index 00db35b61e9e..d2d543794093 100644 --- a/include/linux/irq.h +++ b/include/linux/irq.h @@ -388,7 +388,12 @@ static inline irq_hw_number_t irqd_to_hwirq(struct irq_data *d) * @irq_mask_ack: ack and mask an interrupt source * @irq_unmask: unmask an interrupt source * @irq_eoi: end of interrupt - * @irq_set_affinity: set the CPU affinity on SMP machines + * @irq_set_affinity: Set the CPU affinity on SMP machines. If the force + * argument is true, it tells the driver to + * unconditionally apply the affinity setting. Sanity + * checks against the supplied affinity mask are not + * required. This is used for CPU hotplug where the + * target CPU is not yet set in the cpu_online_mask. * @irq_retrigger: resend an IRQ to the CPU * @irq_set_type: set the flow type (IRQ_TYPE_LEVEL/etc.) of an IRQ * @irq_set_wake: enable/disable power-management wake-on of an IRQ diff --git a/kernel/irq/cpuhotplug.c b/kernel/irq/cpuhotplug.c index aee8f7ec40af..638eb9c83d9f 100644 --- a/kernel/irq/cpuhotplug.c +++ b/kernel/irq/cpuhotplug.c @@ -95,8 +95,13 @@ static bool migrate_one_irq(struct irq_desc *desc) affinity = cpu_online_mask; brokeaff = true; } - - err = irq_do_set_affinity(d, affinity, true); + /* + * Do not set the force argument of irq_do_set_affinity() as this + * disables the masking of offline CPUs from the supplied affinity + * mask and therefore might keep/reassign the irq to the outgoing + * CPU. + */ + err = irq_do_set_affinity(d, affinity, false); if (err) { pr_warn_ratelimited("IRQ%u: set affinity failed(%d).\n", d->irq, err); -- cgit v1.2.3-71-gd317 From 0b794ffae7afa7c4e5accac8791c4b78e8d080ce Mon Sep 17 00:00:00 2001 From: Eugenia Emantayev Date: Thu, 25 May 2017 15:11:26 +0300 Subject: net/mlx5: Fix mlx5_ifc_mtpps_reg_bits structure size Fix miscalculation in reserved_at_1a0 field. Fixes: ee7f12205abc ('net/mlx5e: Implement 1PPS support') Signed-off-by: Eugenia Emantayev Signed-off-by: Saeed Mahameed --- include/linux/mlx5/mlx5_ifc.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'include') diff --git a/include/linux/mlx5/mlx5_ifc.h b/include/linux/mlx5/mlx5_ifc.h index 87869c04849a..fd98aef4545c 100644 --- a/include/linux/mlx5/mlx5_ifc.h +++ b/include/linux/mlx5/mlx5_ifc.h @@ -8175,7 +8175,7 @@ struct mlx5_ifc_mtpps_reg_bits { u8 out_pulse_duration[0x10]; u8 out_periodic_adjustment[0x10]; - u8 reserved_at_1a0[0x60]; + u8 reserved_at_1a0[0x40]; }; struct mlx5_ifc_mtppse_reg_bits { -- cgit v1.2.3-71-gd317 From fa3676885e3b5be1edfa1b2cc775e20a45b34a19 Mon Sep 17 00:00:00 2001 From: Eugenia Emantayev Date: Thu, 25 May 2017 16:09:34 +0300 Subject: net/mlx5e: Add field select to MTPPS register In order to mark relevant fields while setting the MTPPS register add field select. Otherwise it can cause a misconfiguration in firmware. Fixes: ee7f12205abc ('net/mlx5e: Implement 1PPS support') Signed-off-by: Eugenia Emantayev Signed-off-by: Saeed Mahameed --- drivers/net/ethernet/mellanox/mlx5/core/en_clock.c | 29 +++++++++++++++++----- drivers/net/ethernet/mellanox/mlx5/core/eq.c | 2 +- .../net/ethernet/mellanox/mlx5/core/mlx5_core.h | 5 ++++ include/linux/mlx5/mlx5_ifc.h | 10 +++++--- 4 files changed, 36 insertions(+), 10 deletions(-) (limited to 'include') diff --git a/drivers/net/ethernet/mellanox/mlx5/core/en_clock.c b/drivers/net/ethernet/mellanox/mlx5/core/en_clock.c index 66f432385dbb..ab07233e2faa 100644 --- a/drivers/net/ethernet/mellanox/mlx5/core/en_clock.c +++ b/drivers/net/ethernet/mellanox/mlx5/core/en_clock.c @@ -53,6 +53,15 @@ enum { MLX5E_EVENT_MODE_ONCE_TILL_ARM = 0x2, }; +enum { + MLX5E_MTPPS_FS_ENABLE = BIT(0x0), + MLX5E_MTPPS_FS_PATTERN = BIT(0x2), + MLX5E_MTPPS_FS_PIN_MODE = BIT(0x3), + MLX5E_MTPPS_FS_TIME_STAMP = BIT(0x4), + MLX5E_MTPPS_FS_OUT_PULSE_DURATION = BIT(0x5), + MLX5E_MTPPS_FS_ENH_OUT_PER_ADJ = BIT(0x7), +}; + void mlx5e_fill_hwstamp(struct mlx5e_tstamp *tstamp, u64 timestamp, struct skb_shared_hwtstamps *hwts) { @@ -221,7 +230,10 @@ static int mlx5e_ptp_adjfreq(struct ptp_clock_info *ptp, s32 delta) /* For future use need to add a loop for finding all 1PPS out pins */ MLX5_SET(mtpps_reg, in, pin_mode, MLX5E_PIN_MODE_OUT); - MLX5_SET(mtpps_reg, in, out_periodic_adjustment, delta & 0xFFFF); + MLX5_SET(mtpps_reg, in, enhanced_out_periodic_adjustment, delta); + MLX5_SET(mtpps_reg, in, field_select, + MLX5E_MTPPS_FS_PIN_MODE | + MLX5E_MTPPS_FS_ENH_OUT_PER_ADJ); mlx5_set_mtpps(priv->mdev, in, sizeof(in)); } @@ -257,8 +269,7 @@ static int mlx5e_extts_configure(struct ptp_clock_info *ptp, int pin = -1; int err = 0; - if (!MLX5_CAP_GEN(priv->mdev, pps) || - !MLX5_CAP_GEN(priv->mdev, pps_modify)) + if (!MLX5_PPS_CAP(priv->mdev)) return -EOPNOTSUPP; if (rq->extts.index >= tstamp->ptp_info.n_pins) @@ -277,6 +288,9 @@ static int mlx5e_extts_configure(struct ptp_clock_info *ptp, MLX5_SET(mtpps_reg, in, pin_mode, MLX5E_PIN_MODE_IN); MLX5_SET(mtpps_reg, in, pattern, pattern); MLX5_SET(mtpps_reg, in, enable, on); + MLX5_SET(mtpps_reg, in, field_select, MLX5E_MTPPS_FS_PIN_MODE | + MLX5E_MTPPS_FS_PATTERN | + MLX5E_MTPPS_FS_ENABLE); err = mlx5_set_mtpps(priv->mdev, in, sizeof(in)); if (err) @@ -302,7 +316,7 @@ static int mlx5e_perout_configure(struct ptp_clock_info *ptp, int pin = -1; s64 ns; - if (!MLX5_CAP_GEN(priv->mdev, pps_modify)) + if (!MLX5_PPS_CAP(priv->mdev)) return -EOPNOTSUPP; if (rq->perout.index >= tstamp->ptp_info.n_pins) @@ -337,7 +351,10 @@ static int mlx5e_perout_configure(struct ptp_clock_info *ptp, MLX5_SET(mtpps_reg, in, pattern, MLX5E_OUT_PATTERN_PERIODIC); MLX5_SET(mtpps_reg, in, enable, on); MLX5_SET64(mtpps_reg, in, time_stamp, time_stamp); - + MLX5_SET(mtpps_reg, in, field_select, MLX5E_MTPPS_FS_PIN_MODE | + MLX5E_MTPPS_FS_PATTERN | + MLX5E_MTPPS_FS_ENABLE | + MLX5E_MTPPS_FS_TIME_STAMP); return mlx5_set_mtpps(priv->mdev, in, sizeof(in)); } @@ -487,7 +504,7 @@ void mlx5e_timestamp_init(struct mlx5e_priv *priv) #define MAX_PIN_NUM 8 tstamp->pps_pin_caps = kzalloc(sizeof(u8) * MAX_PIN_NUM, GFP_KERNEL); if (tstamp->pps_pin_caps) { - if (MLX5_CAP_GEN(priv->mdev, pps)) + if (MLX5_PPS_CAP(priv->mdev)) mlx5e_get_pps_caps(priv, tstamp); if (tstamp->ptp_info.n_pins) mlx5e_init_pin_config(tstamp); diff --git a/drivers/net/ethernet/mellanox/mlx5/core/eq.c b/drivers/net/ethernet/mellanox/mlx5/core/eq.c index af51a5d2b912..52b9a64cd3a2 100644 --- a/drivers/net/ethernet/mellanox/mlx5/core/eq.c +++ b/drivers/net/ethernet/mellanox/mlx5/core/eq.c @@ -698,7 +698,7 @@ int mlx5_start_eqs(struct mlx5_core_dev *dev) else mlx5_core_dbg(dev, "port_module_event is not set\n"); - if (MLX5_CAP_GEN(dev, pps)) + if (MLX5_PPS_CAP(dev)) async_event_mask |= (1ull << MLX5_EVENT_TYPE_PPS_EVENT); if (MLX5_CAP_GEN(dev, fpga)) diff --git a/drivers/net/ethernet/mellanox/mlx5/core/mlx5_core.h b/drivers/net/ethernet/mellanox/mlx5/core/mlx5_core.h index 6a3d6bef7dd4..6a263e8d883a 100644 --- a/drivers/net/ethernet/mellanox/mlx5/core/mlx5_core.h +++ b/drivers/net/ethernet/mellanox/mlx5/core/mlx5_core.h @@ -154,6 +154,11 @@ int mlx5_set_mtpps(struct mlx5_core_dev *mdev, u32 *mtpps, u32 mtpps_size); int mlx5_query_mtppse(struct mlx5_core_dev *mdev, u8 pin, u8 *arm, u8 *mode); int mlx5_set_mtppse(struct mlx5_core_dev *mdev, u8 pin, u8 arm, u8 mode); +#define MLX5_PPS_CAP(mdev) (MLX5_CAP_GEN((mdev), pps) && \ + MLX5_CAP_GEN((mdev), pps_modify) && \ + MLX5_CAP_MCAM_FEATURE((mdev), mtpps_fs) && \ + MLX5_CAP_MCAM_FEATURE((mdev), mtpps_enh_out_per_adj)) + int mlx5_firmware_flash(struct mlx5_core_dev *dev, const struct firmware *fw); void mlx5e_init(void); diff --git a/include/linux/mlx5/mlx5_ifc.h b/include/linux/mlx5/mlx5_ifc.h index fd98aef4545c..3030121b4746 100644 --- a/include/linux/mlx5/mlx5_ifc.h +++ b/include/linux/mlx5/mlx5_ifc.h @@ -7749,8 +7749,10 @@ struct mlx5_ifc_pcam_reg_bits { }; struct mlx5_ifc_mcam_enhanced_features_bits { - u8 reserved_at_0[0x7f]; + u8 reserved_at_0[0x7d]; + u8 mtpps_enh_out_per_adj[0x1]; + u8 mtpps_fs[0x1]; u8 pcie_performance_group[0x1]; }; @@ -8159,7 +8161,8 @@ struct mlx5_ifc_mtpps_reg_bits { u8 reserved_at_78[0x4]; u8 cap_pin_4_mode[0x4]; - u8 reserved_at_80[0x80]; + u8 field_select[0x20]; + u8 reserved_at_a0[0x60]; u8 enable[0x1]; u8 reserved_at_101[0xb]; @@ -8174,8 +8177,9 @@ struct mlx5_ifc_mtpps_reg_bits { u8 out_pulse_duration[0x10]; u8 out_periodic_adjustment[0x10]; + u8 enhanced_out_periodic_adjustment[0x20]; - u8 reserved_at_1a0[0x40]; + u8 reserved_at_1c0[0x20]; }; struct mlx5_ifc_mtppse_reg_bits { -- cgit v1.2.3-71-gd317 From c9f2c1ae123a751d4e4f949144500219354d5ee1 Mon Sep 17 00:00:00 2001 From: Paolo Abeni Date: Thu, 27 Jul 2017 14:45:09 +0200 Subject: udp6: fix socket leak on early demux When an early demuxed packet reaches __udp6_lib_lookup_skb(), the sk reference is retrieved and used, but the relevant reference count is leaked and the socket destructor is never called. Beyond leaking the sk memory, if there are pending UDP packets in the receive queue, even the related accounted memory is leaked. In the long run, this will cause persistent forward allocation errors and no UDP skbs (both ipv4 and ipv6) will be able to reach the user-space. Fix this by explicitly accessing the early demux reference before the lookup, and properly decreasing the socket reference count after usage. Also drop the skb_steal_sock() in __udp6_lib_lookup_skb(), and the now obsoleted comment about "socket cache". The newly added code is derived from the current ipv4 code for the similar path. v1 -> v2: fixed the __udp6_lib_rcv() return code for resubmission, as suggested by Eric Reported-by: Sam Edwards Reported-by: Marc Haber Fixes: 5425077d73e0 ("net: ipv6: Add early demux handler for UDP unicast") Signed-off-by: Paolo Abeni Acked-by: Eric Dumazet Signed-off-by: David S. Miller --- include/net/udp.h | 1 + net/ipv4/udp.c | 3 ++- net/ipv6/udp.c | 27 ++++++++++++++++++--------- 3 files changed, 21 insertions(+), 10 deletions(-) (limited to 'include') diff --git a/include/net/udp.h b/include/net/udp.h index 56ce2d2a612d..cc8036987dcb 100644 --- a/include/net/udp.h +++ b/include/net/udp.h @@ -260,6 +260,7 @@ static inline struct sk_buff *skb_recv_udp(struct sock *sk, unsigned int flags, } void udp_v4_early_demux(struct sk_buff *skb); +void udp_sk_rx_dst_set(struct sock *sk, struct dst_entry *dst); int udp_get_port(struct sock *sk, unsigned short snum, int (*saddr_cmp)(const struct sock *, const struct sock *)); diff --git a/net/ipv4/udp.c b/net/ipv4/udp.c index fac7cb9e3b0f..e6276fa3750b 100644 --- a/net/ipv4/udp.c +++ b/net/ipv4/udp.c @@ -1928,7 +1928,7 @@ drop: /* For TCP sockets, sk_rx_dst is protected by socket lock * For UDP, we use xchg() to guard against concurrent changes. */ -static void udp_sk_rx_dst_set(struct sock *sk, struct dst_entry *dst) +void udp_sk_rx_dst_set(struct sock *sk, struct dst_entry *dst) { struct dst_entry *old; @@ -1937,6 +1937,7 @@ static void udp_sk_rx_dst_set(struct sock *sk, struct dst_entry *dst) dst_release(old); } } +EXPORT_SYMBOL(udp_sk_rx_dst_set); /* * Multicasts and broadcasts go to each listener. diff --git a/net/ipv6/udp.c b/net/ipv6/udp.c index 4a3e65626e8b..98fe4560e24c 100644 --- a/net/ipv6/udp.c +++ b/net/ipv6/udp.c @@ -291,11 +291,7 @@ static struct sock *__udp6_lib_lookup_skb(struct sk_buff *skb, struct udp_table *udptable) { const struct ipv6hdr *iph = ipv6_hdr(skb); - struct sock *sk; - sk = skb_steal_sock(skb); - if (unlikely(sk)) - return sk; return __udp6_lib_lookup(dev_net(skb->dev), &iph->saddr, sport, &iph->daddr, dport, inet6_iif(skb), udptable, skb); @@ -804,6 +800,24 @@ int __udp6_lib_rcv(struct sk_buff *skb, struct udp_table *udptable, if (udp6_csum_init(skb, uh, proto)) goto csum_error; + /* Check if the socket is already available, e.g. due to early demux */ + sk = skb_steal_sock(skb); + if (sk) { + struct dst_entry *dst = skb_dst(skb); + int ret; + + if (unlikely(sk->sk_rx_dst != dst)) + udp_sk_rx_dst_set(sk, dst); + + ret = udpv6_queue_rcv_skb(sk, skb); + sock_put(sk); + + /* a return value > 0 means to resubmit the input */ + if (ret > 0) + return ret; + return 0; + } + /* * Multicast receive code */ @@ -812,11 +826,6 @@ int __udp6_lib_rcv(struct sk_buff *skb, struct udp_table *udptable, saddr, daddr, udptable, proto); /* Unicast */ - - /* - * check socket cache ... must talk to Alan about his plans - * for sock caches... i'll skip this for now. - */ sk = __udp6_lib_lookup_skb(skb, uh->source, uh->dest, udptable); if (sk) { int ret; -- cgit v1.2.3-71-gd317 From 37ef38f3f83891a2f413fb872bae7d0f9bb95b27 Mon Sep 17 00:00:00 2001 From: Timur Tabi Date: Thu, 27 Jul 2017 16:15:52 -0500 Subject: tty: pl011: fix initialization order of QDF2400 E44 The work-around for Qualcomm Technologies QDF2400 Erratum 44 hinges on a global variable defined in the pl011 driver. The ACPI SPCR parsing code determines whether the work-around is needed, and if so, it changes the console name from "pl011" to "qdf2400_e44". The expectation is that the pl011 driver will implement the work-around when it sees the console name. The global variable qdf2400_e44_present is set when that happens. The problem is that work-around needs to be enabled when the pl011 driver probes, not when the console name is queried. However, sbsa_probe() is called before pl011_console_match(). The work-around appeared to work previously because the default console on QDF2400 platforms was always ttyAMA1. The first time sbsa_probe() is called (for ttyAMA0), qdf2400_e44_present is still false. Then pl011_console_match() is called, and it sets qdf2400_e44_present to true. All subsequent calls to sbsa_probe() enable the work-around. The solution is to move the global variable into spcr.c and let the pl011 driver query it during probe time. This works because all QDF2400 platforms require SPCR, so parse_spcr() will always be called. pl011_console_match still checks for the "qdf2400_e44" console name, but it doesn't do anything else special. Fixes: 5a0722b898f8 ("tty: pl011: use "qdf2400_e44" as the earlycon name for QDF2400 E44") Tested-by: Jeffrey Hugo Signed-off-by: Timur Tabi Signed-off-by: Greg Kroah-Hartman --- drivers/acpi/spcr.c | 36 ++++++++++++++++++++++++++++++++++-- drivers/tty/serial/amba-pl011.c | 37 +++++++++++++++++++------------------ include/linux/acpi.h | 1 + 3 files changed, 54 insertions(+), 20 deletions(-) (limited to 'include') diff --git a/drivers/acpi/spcr.c b/drivers/acpi/spcr.c index 4ac3e06b41d8..98aa8c808a33 100644 --- a/drivers/acpi/spcr.c +++ b/drivers/acpi/spcr.c @@ -16,6 +16,16 @@ #include #include +/* + * Erratum 44 for QDF2432v1 and QDF2400v1 SoCs describes the BUSY bit as + * occasionally getting stuck as 1. To avoid the potential for a hang, check + * TXFE == 0 instead of BUSY == 1. This may not be suitable for all UART + * implementations, so only do so if an affected platform is detected in + * parse_spcr(). + */ +bool qdf2400_e44_present; +EXPORT_SYMBOL(qdf2400_e44_present); + /* * Some Qualcomm Datacenter Technologies SoCs have a defective UART BUSY bit. * Detect them by examining the OEM fields in the SPCR header, similiar to PCI @@ -147,8 +157,30 @@ int __init parse_spcr(bool earlycon) goto done; } - if (qdf2400_erratum_44_present(&table->header)) - uart = "qdf2400_e44"; + /* + * If the E44 erratum is required, then we need to tell the pl011 + * driver to implement the work-around. + * + * The global variable is used by the probe function when it + * creates the UARTs, whether or not they're used as a console. + * + * If the user specifies "traditional" earlycon, the qdf2400_e44 + * console name matches the EARLYCON_DECLARE() statement, and + * SPCR is not used. Parameter "earlycon" is false. + * + * If the user specifies "SPCR" earlycon, then we need to update + * the console name so that it also says "qdf2400_e44". Parameter + * "earlycon" is true. + * + * For consistency, if we change the console name, then we do it + * for everyone, not just earlycon. + */ + if (qdf2400_erratum_44_present(&table->header)) { + qdf2400_e44_present = true; + if (earlycon) + uart = "qdf2400_e44"; + } + if (xgene_8250_erratum_present(table)) iotype = "mmio32"; diff --git a/drivers/tty/serial/amba-pl011.c b/drivers/tty/serial/amba-pl011.c index 8a857bb34fbb..1888d168a41c 100644 --- a/drivers/tty/serial/amba-pl011.c +++ b/drivers/tty/serial/amba-pl011.c @@ -142,15 +142,7 @@ static struct vendor_data vendor_sbsa = { .fixed_options = true, }; -/* - * Erratum 44 for QDF2432v1 and QDF2400v1 SoCs describes the BUSY bit as - * occasionally getting stuck as 1. To avoid the potential for a hang, check - * TXFE == 0 instead of BUSY == 1. This may not be suitable for all UART - * implementations, so only do so if an affected platform is detected in - * parse_spcr(). - */ -static bool qdf2400_e44_present = false; - +#ifdef CONFIG_ACPI_SPCR_TABLE static struct vendor_data vendor_qdt_qdf2400_e44 = { .reg_offset = pl011_std_offsets, .fr_busy = UART011_FR_TXFE, @@ -165,6 +157,7 @@ static struct vendor_data vendor_qdt_qdf2400_e44 = { .always_enabled = true, .fixed_options = true, }; +#endif static u16 pl011_st_offsets[REG_ARRAY_SIZE] = { [REG_DR] = UART01x_DR, @@ -2375,12 +2368,14 @@ static int __init pl011_console_match(struct console *co, char *name, int idx, resource_size_t addr; int i; - if (strcmp(name, "qdf2400_e44") == 0) { - pr_info_once("UART: Working around QDF2400 SoC erratum 44"); - qdf2400_e44_present = true; - } else if (strcmp(name, "pl011") != 0) { + /* + * Systems affected by the Qualcomm Technologies QDF2400 E44 erratum + * have a distinct console name, so make sure we check for that. + * The actual implementation of the erratum occurs in the probe + * function. + */ + if ((strcmp(name, "qdf2400_e44") != 0) && (strcmp(name, "pl011") != 0)) return -ENODEV; - } if (uart_parse_earlycon(options, &iotype, &addr, &options)) return -ENODEV; @@ -2734,11 +2729,17 @@ static int sbsa_uart_probe(struct platform_device *pdev) } uap->port.irq = ret; - uap->reg_offset = vendor_sbsa.reg_offset; - uap->vendor = qdf2400_e44_present ? - &vendor_qdt_qdf2400_e44 : &vendor_sbsa; +#ifdef CONFIG_ACPI_SPCR_TABLE + if (qdf2400_e44_present) { + dev_info(&pdev->dev, "working around QDF2400 SoC erratum 44\n"); + uap->vendor = &vendor_qdt_qdf2400_e44; + } else +#endif + uap->vendor = &vendor_sbsa; + + uap->reg_offset = uap->vendor->reg_offset; uap->fifosize = 32; - uap->port.iotype = vendor_sbsa.access_32b ? UPIO_MEM32 : UPIO_MEM; + uap->port.iotype = uap->vendor->access_32b ? UPIO_MEM32 : UPIO_MEM; uap->port.ops = &sbsa_uart_pops; uap->fixed_baud = baudrate; diff --git a/include/linux/acpi.h b/include/linux/acpi.h index c749eef1daa1..27b4b6615263 100644 --- a/include/linux/acpi.h +++ b/include/linux/acpi.h @@ -1209,6 +1209,7 @@ static inline bool acpi_has_watchdog(void) { return false; } #endif #ifdef CONFIG_ACPI_SPCR_TABLE +extern bool qdf2400_e44_present; int parse_spcr(bool earlycon); #else static inline int parse_spcr(bool earlycon) { return 0; } -- cgit v1.2.3-71-gd317 From a627b0a7c15ee4d2c87a86d5be5c8167382e8d0d Mon Sep 17 00:00:00 2001 From: Eric Whitney Date: Sun, 30 Jul 2017 22:30:11 -0400 Subject: ext4: remove unused metadata accounting variables Two variables in ext4_inode_info, i_reserved_meta_blocks and i_allocated_meta_blocks, are unused. Removing them saves a little memory per in-memory inode and cleans up clutter in several tracepoints. Adjust tracepoint output from ext4_alloc_da_blocks() for consistency and fix a typo and whitespace near these changes. Signed-off-by: Eric Whitney Signed-off-by: Theodore Ts'o Reviewed-by: Jan Kara --- fs/ext4/ext4.h | 6 ++---- fs/ext4/super.c | 2 -- include/trace/events/ext4.h | 35 ++++++++--------------------------- 3 files changed, 10 insertions(+), 33 deletions(-) (limited to 'include') diff --git a/fs/ext4/ext4.h b/fs/ext4/ext4.h index 9ebde0cd632e..dcbcd9d444d1 100644 --- a/fs/ext4/ext4.h +++ b/fs/ext4/ext4.h @@ -961,7 +961,7 @@ struct ext4_inode_info { /* * i_block_group is the number of the block group which contains * this file's inode. Constant across the lifetime of the inode, - * it is ued for making block allocation decisions - we try to + * it is used for making block allocation decisions - we try to * place a file's data blocks near its inode block, and new inodes * near to their parent directory's inode. */ @@ -1049,10 +1049,8 @@ struct ext4_inode_info { ext4_group_t i_last_alloc_group; /* allocation reservation info for delalloc */ - /* In case of bigalloc, these refer to clusters rather than blocks */ + /* In case of bigalloc, this refer to clusters rather than blocks */ unsigned int i_reserved_data_blocks; - unsigned int i_reserved_meta_blocks; - unsigned int i_allocated_meta_blocks; ext4_lblk_t i_da_metadata_calc_last_lblock; int i_da_metadata_calc_len; diff --git a/fs/ext4/super.c b/fs/ext4/super.c index 0886fe82e9c4..d61a70e2193a 100644 --- a/fs/ext4/super.c +++ b/fs/ext4/super.c @@ -978,8 +978,6 @@ static struct inode *ext4_alloc_inode(struct super_block *sb) ei->i_es_shk_nr = 0; ei->i_es_shrink_lblk = 0; ei->i_reserved_data_blocks = 0; - ei->i_reserved_meta_blocks = 0; - ei->i_allocated_meta_blocks = 0; ei->i_da_metadata_calc_len = 0; ei->i_da_metadata_calc_last_lblock = 0; spin_lock_init(&(ei->i_block_reservation_lock)); diff --git a/include/trace/events/ext4.h b/include/trace/events/ext4.h index dfae175ddebc..9c3bc3883d2f 100644 --- a/include/trace/events/ext4.h +++ b/include/trace/events/ext4.h @@ -937,21 +937,19 @@ TRACE_EVENT(ext4_alloc_da_blocks, TP_STRUCT__entry( __field( dev_t, dev ) __field( ino_t, ino ) - __field( unsigned int, data_blocks ) - __field( unsigned int, meta_blocks ) + __field( unsigned int, data_blocks ) ), TP_fast_assign( __entry->dev = inode->i_sb->s_dev; __entry->ino = inode->i_ino; __entry->data_blocks = EXT4_I(inode)->i_reserved_data_blocks; - __entry->meta_blocks = EXT4_I(inode)->i_reserved_meta_blocks; ), - TP_printk("dev %d,%d ino %lu data_blocks %u meta_blocks %u", + TP_printk("dev %d,%d ino %lu reserved_data_blocks %u", MAJOR(__entry->dev), MINOR(__entry->dev), (unsigned long) __entry->ino, - __entry->data_blocks, __entry->meta_blocks) + __entry->data_blocks) ); TRACE_EVENT(ext4_mballoc_alloc, @@ -1153,8 +1151,6 @@ TRACE_EVENT(ext4_da_update_reserve_space, __field( __u64, i_blocks ) __field( int, used_blocks ) __field( int, reserved_data_blocks ) - __field( int, reserved_meta_blocks ) - __field( int, allocated_meta_blocks ) __field( int, quota_claim ) __field( __u16, mode ) ), @@ -1166,22 +1162,16 @@ TRACE_EVENT(ext4_da_update_reserve_space, __entry->used_blocks = used_blocks; __entry->reserved_data_blocks = EXT4_I(inode)->i_reserved_data_blocks; - __entry->reserved_meta_blocks = - EXT4_I(inode)->i_reserved_meta_blocks; - __entry->allocated_meta_blocks = - EXT4_I(inode)->i_allocated_meta_blocks; __entry->quota_claim = quota_claim; __entry->mode = inode->i_mode; ), TP_printk("dev %d,%d ino %lu mode 0%o i_blocks %llu used_blocks %d " - "reserved_data_blocks %d reserved_meta_blocks %d " - "allocated_meta_blocks %d quota_claim %d", + "reserved_data_blocks %d quota_claim %d", MAJOR(__entry->dev), MINOR(__entry->dev), (unsigned long) __entry->ino, __entry->mode, __entry->i_blocks, __entry->used_blocks, __entry->reserved_data_blocks, - __entry->reserved_meta_blocks, __entry->allocated_meta_blocks, __entry->quota_claim) ); @@ -1195,7 +1185,6 @@ TRACE_EVENT(ext4_da_reserve_space, __field( ino_t, ino ) __field( __u64, i_blocks ) __field( int, reserved_data_blocks ) - __field( int, reserved_meta_blocks ) __field( __u16, mode ) ), @@ -1204,17 +1193,15 @@ TRACE_EVENT(ext4_da_reserve_space, __entry->ino = inode->i_ino; __entry->i_blocks = inode->i_blocks; __entry->reserved_data_blocks = EXT4_I(inode)->i_reserved_data_blocks; - __entry->reserved_meta_blocks = EXT4_I(inode)->i_reserved_meta_blocks; __entry->mode = inode->i_mode; ), TP_printk("dev %d,%d ino %lu mode 0%o i_blocks %llu " - "reserved_data_blocks %d reserved_meta_blocks %d", + "reserved_data_blocks %d", MAJOR(__entry->dev), MINOR(__entry->dev), (unsigned long) __entry->ino, __entry->mode, __entry->i_blocks, - __entry->reserved_data_blocks, - __entry->reserved_meta_blocks) + __entry->reserved_data_blocks) ); TRACE_EVENT(ext4_da_release_space, @@ -1228,8 +1215,6 @@ TRACE_EVENT(ext4_da_release_space, __field( __u64, i_blocks ) __field( int, freed_blocks ) __field( int, reserved_data_blocks ) - __field( int, reserved_meta_blocks ) - __field( int, allocated_meta_blocks ) __field( __u16, mode ) ), @@ -1239,19 +1224,15 @@ TRACE_EVENT(ext4_da_release_space, __entry->i_blocks = inode->i_blocks; __entry->freed_blocks = freed_blocks; __entry->reserved_data_blocks = EXT4_I(inode)->i_reserved_data_blocks; - __entry->reserved_meta_blocks = EXT4_I(inode)->i_reserved_meta_blocks; - __entry->allocated_meta_blocks = EXT4_I(inode)->i_allocated_meta_blocks; __entry->mode = inode->i_mode; ), TP_printk("dev %d,%d ino %lu mode 0%o i_blocks %llu freed_blocks %d " - "reserved_data_blocks %d reserved_meta_blocks %d " - "allocated_meta_blocks %d", + "reserved_data_blocks %d", MAJOR(__entry->dev), MINOR(__entry->dev), (unsigned long) __entry->ino, __entry->mode, __entry->i_blocks, - __entry->freed_blocks, __entry->reserved_data_blocks, - __entry->reserved_meta_blocks, __entry->allocated_meta_blocks) + __entry->freed_blocks, __entry->reserved_data_blocks) ); DECLARE_EVENT_CLASS(ext4__bitmap_load, -- cgit v1.2.3-71-gd317 From 99f828436788f0155798145853607ca8f0e6de93 Mon Sep 17 00:00:00 2001 From: Chris Wilson Date: Fri, 28 Jul 2017 22:29:51 +0100 Subject: dma-buf/sync_file: Allow multiple sync_files to wrap a single dma-fence Up until recently sync_file were create to export a single dma-fence to userspace, and so we could canabalise a bit insie dma-fence to mark whether or not we had enable polling for the sync_file itself. However, with the advent of syncobj, we do allow userspace to create multiple sync_files for a single dma-fence. (Similarly, that the sw-sync validation framework also started returning multiple sync-files wrapping a single dma-fence for a syncpt also triggering the problem.) This patch reverts my suggestion in commit e24165537312 ("dma-buf/sync_file: only enable fence signalling on poll()") to use a single bit in the shared dma-fence and restores the sync_file->flags for tracking the bits individually. Reported-by: Gustavo Padovan Fixes: f1e8c67123cf ("dma-buf/sw-sync: Use an rbtree to sort fences in the timeline") Fixes: e9083420bbac ("drm: introduce sync objects (v4)") Signed-off-by: Chris Wilson Cc: Sumit Semwal Cc: Sean Paul Cc: Gustavo Padovan Cc: dri-devel@lists.freedesktop.org Cc: # v4.13-rc1+ Signed-off-by: Gustavo Padovan Link: http://patchwork.freedesktop.org/patch/msgid/20170728212951.7818-1-chris@chris-wilson.co.uk (cherry picked from commit db1fc97ca0c0d3fdeeadf314d99a26188438940a) --- drivers/dma-buf/sync_file.c | 5 +++-- include/linux/sync_file.h | 3 ++- 2 files changed, 5 insertions(+), 3 deletions(-) (limited to 'include') diff --git a/drivers/dma-buf/sync_file.c b/drivers/dma-buf/sync_file.c index d7e219d2669d..66fb40d0ebdb 100644 --- a/drivers/dma-buf/sync_file.c +++ b/drivers/dma-buf/sync_file.c @@ -304,7 +304,7 @@ static int sync_file_release(struct inode *inode, struct file *file) { struct sync_file *sync_file = file->private_data; - if (test_bit(POLL_ENABLED, &sync_file->fence->flags)) + if (test_bit(POLL_ENABLED, &sync_file->flags)) dma_fence_remove_callback(sync_file->fence, &sync_file->cb); dma_fence_put(sync_file->fence); kfree(sync_file); @@ -318,7 +318,8 @@ static unsigned int sync_file_poll(struct file *file, poll_table *wait) poll_wait(file, &sync_file->wq, wait); - if (!test_and_set_bit(POLL_ENABLED, &sync_file->fence->flags)) { + if (list_empty(&sync_file->cb.node) && + !test_and_set_bit(POLL_ENABLED, &sync_file->flags)) { if (dma_fence_add_callback(sync_file->fence, &sync_file->cb, fence_check_cb_func) < 0) wake_up_all(&sync_file->wq); diff --git a/include/linux/sync_file.h b/include/linux/sync_file.h index 5726107963b2..0ad87c434ae6 100644 --- a/include/linux/sync_file.h +++ b/include/linux/sync_file.h @@ -43,12 +43,13 @@ struct sync_file { #endif wait_queue_head_t wq; + unsigned long flags; struct dma_fence *fence; struct dma_fence_cb cb; }; -#define POLL_ENABLED DMA_FENCE_FLAG_USER_BITS +#define POLL_ENABLED 0 struct sync_file *sync_file_create(struct dma_fence *fence); struct dma_fence *sync_file_get_fence(int fd); -- cgit v1.2.3-71-gd317 From 9c80034921d1ece5c0e3241ba1645bce32387684 Mon Sep 17 00:00:00 2001 From: Wolfram Sang Date: Sat, 29 Jul 2017 14:11:43 +0200 Subject: i2c: rephrase explanation of I2C_CLASS_DEPRECATED Hopefully making clear that it is not needed for new drivers. Signed-off-by: Wolfram Sang --- include/linux/i2c.h | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) (limited to 'include') diff --git a/include/linux/i2c.h b/include/linux/i2c.h index 00ca5b86a753..d501d3956f13 100644 --- a/include/linux/i2c.h +++ b/include/linux/i2c.h @@ -689,7 +689,8 @@ i2c_unlock_adapter(struct i2c_adapter *adapter) #define I2C_CLASS_HWMON (1<<0) /* lm_sensors, ... */ #define I2C_CLASS_DDC (1<<3) /* DDC bus on graphics adapters */ #define I2C_CLASS_SPD (1<<7) /* Memory modules */ -#define I2C_CLASS_DEPRECATED (1<<8) /* Warn users that adapter will stop using classes */ +/* Warn users that the adapter doesn't support classes anymore */ +#define I2C_CLASS_DEPRECATED (1<<8) /* Internal numbers to terminate lists */ #define I2C_CLIENT_END 0xfffeU -- cgit v1.2.3-71-gd317 From cb891fa6a1d5f52c5f5c07b6f7f4c6d65ea55fc0 Mon Sep 17 00:00:00 2001 From: Paolo Abeni Date: Mon, 31 Jul 2017 16:52:36 +0200 Subject: udp6: fix jumbogram reception Since commit 67a51780aebb ("ipv6: udp: leverage scratch area helpers") udp6_recvmsg() read the skb len from the scratch area, to avoid a cache miss. But the UDP6 rx path support RFC 2675 UDPv6 jumbograms, and their length exceeds the 16 bits available in the scratch area. As a side effect the length returned by recvmsg() is: % (1<<16) This commit addresses the issue allocating one more bit in the IP6CB flags field and setting it for incoming jumbograms. Such field is still in the first cacheline, so at recvmsg() time we can check it and fallback to access skb->len if required, without a measurable overhead. Fixes: 67a51780aebb ("ipv6: udp: leverage scratch area helpers") Signed-off-by: Paolo Abeni Signed-off-by: David S. Miller --- include/linux/ipv6.h | 6 ++++++ net/ipv6/exthdrs.c | 1 + net/ipv6/udp.c | 11 ++++++++++- 3 files changed, 17 insertions(+), 1 deletion(-) (limited to 'include') diff --git a/include/linux/ipv6.h b/include/linux/ipv6.h index e1b442996f81..474d6bbc158c 100644 --- a/include/linux/ipv6.h +++ b/include/linux/ipv6.h @@ -128,6 +128,7 @@ struct inet6_skb_parm { #define IP6SKB_FRAGMENTED 16 #define IP6SKB_HOPBYHOP 32 #define IP6SKB_L3SLAVE 64 +#define IP6SKB_JUMBOGRAM 128 }; #if defined(CONFIG_NET_L3_MASTER_DEV) @@ -152,6 +153,11 @@ static inline int inet6_iif(const struct sk_buff *skb) return l3_slave ? skb->skb_iif : IP6CB(skb)->iif; } +static inline bool inet6_is_jumbogram(const struct sk_buff *skb) +{ + return !!(IP6CB(skb)->flags & IP6SKB_JUMBOGRAM); +} + /* can not be used in TCP layer after tcp_v6_fill_cb */ static inline bool inet6_exact_dif_match(struct net *net, struct sk_buff *skb) { diff --git a/net/ipv6/exthdrs.c b/net/ipv6/exthdrs.c index 4996d734f1d2..3cec529c6113 100644 --- a/net/ipv6/exthdrs.c +++ b/net/ipv6/exthdrs.c @@ -756,6 +756,7 @@ static bool ipv6_hop_jumbo(struct sk_buff *skb, int optoff) if (pskb_trim_rcsum(skb, pkt_len + sizeof(struct ipv6hdr))) goto drop; + IP6CB(skb)->flags |= IP6SKB_JUMBOGRAM; return true; drop: diff --git a/net/ipv6/udp.c b/net/ipv6/udp.c index 98fe4560e24c..578142b7ca3e 100644 --- a/net/ipv6/udp.c +++ b/net/ipv6/udp.c @@ -328,6 +328,15 @@ struct sock *udp6_lib_lookup(struct net *net, const struct in6_addr *saddr, __be EXPORT_SYMBOL_GPL(udp6_lib_lookup); #endif +/* do not use the scratch area len for jumbogram: their length execeeds the + * scratch area space; note that the IP6CB flags is still in the first + * cacheline, so checking for jumbograms is cheap + */ +static int udp6_skb_len(struct sk_buff *skb) +{ + return unlikely(inet6_is_jumbogram(skb)) ? skb->len : udp_skb_len(skb); +} + /* * This should be easy, if there is something there we * return it, otherwise we block. @@ -358,7 +367,7 @@ try_again: if (!skb) return err; - ulen = udp_skb_len(skb); + ulen = udp6_skb_len(skb); copied = len; if (copied > ulen - off) copied = ulen - off; -- cgit v1.2.3-71-gd317 From e17e8969f5c59a10083af5e260bdad6026872203 Mon Sep 17 00:00:00 2001 From: Ilya Dryomov Date: Mon, 24 Jul 2017 16:43:49 +0200 Subject: libceph: fallback for when there isn't a pool-specific choose_arg There is now a fallback to a choose_arg index of -1 if there isn't a pool-specific choose_arg set. If you create a per-pool weight-set, that works for that pool. Otherwise we try the compat/default one. If that doesn't exist either, then we use the normal CRUSH weights. Signed-off-by: Ilya Dryomov Reviewed-by: Sage Weil --- include/linux/crush/crush.h | 2 +- net/ceph/osdmap.c | 12 +++++++++++- 2 files changed, 12 insertions(+), 2 deletions(-) (limited to 'include') diff --git a/include/linux/crush/crush.h b/include/linux/crush/crush.h index 92e165d417a6..07eed95e10c7 100644 --- a/include/linux/crush/crush.h +++ b/include/linux/crush/crush.h @@ -193,7 +193,7 @@ struct crush_choose_arg { struct crush_choose_arg_map { #ifdef __KERNEL__ struct rb_node node; - u64 choose_args_index; + s64 choose_args_index; #endif struct crush_choose_arg *args; /*!< replacement for each bucket in the crushmap */ diff --git a/net/ceph/osdmap.c b/net/ceph/osdmap.c index 64ae9f89773a..eb57a06373ca 100644 --- a/net/ceph/osdmap.c +++ b/net/ceph/osdmap.c @@ -2301,10 +2301,17 @@ static u32 raw_pg_to_pps(struct ceph_pg_pool_info *pi, } } +/* + * Magic value used for a "default" fallback choose_args, used if the + * crush_choose_arg_map passed to do_crush() does not exist. If this + * also doesn't exist, fall back to canonical weights. + */ +#define CEPH_DEFAULT_CHOOSE_ARGS -1 + static int do_crush(struct ceph_osdmap *map, int ruleno, int x, int *result, int result_max, const __u32 *weight, int weight_max, - u64 choose_args_index) + s64 choose_args_index) { struct crush_choose_arg_map *arg_map; int r; @@ -2313,6 +2320,9 @@ static int do_crush(struct ceph_osdmap *map, int ruleno, int x, arg_map = lookup_choose_arg_map(&map->crush->choose_args, choose_args_index); + if (!arg_map) + arg_map = lookup_choose_arg_map(&map->crush->choose_args, + CEPH_DEFAULT_CHOOSE_ARGS); mutex_lock(&map->crush_workspace_mutex); r = crush_do_rule(map->crush, ruleno, x, result, result_max, -- cgit v1.2.3-71-gd317 From ae78dd8139ce93a528beb7f3914531b7a7be9e30 Mon Sep 17 00:00:00 2001 From: Ilya Dryomov Date: Thu, 27 Jul 2017 17:59:14 +0200 Subject: libceph: make RECOVERY_DELETES feature create a new interval This is needed so that the OSDs can regenerate the missing set at the start of a new interval where support for recovery deletes changed. Signed-off-by: Ilya Dryomov Reviewed-by: Sage Weil --- include/linux/ceph/osd_client.h | 1 + include/linux/ceph/osdmap.h | 2 ++ include/linux/ceph/rados.h | 4 ++++ net/ceph/osd_client.c | 5 +++++ net/ceph/osdmap.c | 5 ++++- 5 files changed, 16 insertions(+), 1 deletion(-) (limited to 'include') diff --git a/include/linux/ceph/osd_client.h b/include/linux/ceph/osd_client.h index c6d96a5f46fd..adf670ecaf94 100644 --- a/include/linux/ceph/osd_client.h +++ b/include/linux/ceph/osd_client.h @@ -148,6 +148,7 @@ struct ceph_osd_request_target { int size; int min_size; bool sort_bitwise; + bool recovery_deletes; unsigned int flags; /* CEPH_OSD_FLAG_* */ bool paused; diff --git a/include/linux/ceph/osdmap.h b/include/linux/ceph/osdmap.h index a0996cb9faed..af3444a5bfdd 100644 --- a/include/linux/ceph/osdmap.h +++ b/include/linux/ceph/osdmap.h @@ -272,6 +272,8 @@ bool ceph_is_new_interval(const struct ceph_osds *old_acting, u32 new_pg_num, bool old_sort_bitwise, bool new_sort_bitwise, + bool old_recovery_deletes, + bool new_recovery_deletes, const struct ceph_pg *pgid); bool ceph_osds_changed(const struct ceph_osds *old_acting, const struct ceph_osds *new_acting, diff --git a/include/linux/ceph/rados.h b/include/linux/ceph/rados.h index 385db08bb8b2..b8281feda9c7 100644 --- a/include/linux/ceph/rados.h +++ b/include/linux/ceph/rados.h @@ -158,6 +158,10 @@ extern const char *ceph_osd_state_name(int s); #define CEPH_OSDMAP_NOTIERAGENT (1<<13) /* disable tiering agent */ #define CEPH_OSDMAP_NOREBALANCE (1<<14) /* block osd backfill unless pg is degraded */ #define CEPH_OSDMAP_SORTBITWISE (1<<15) /* use bitwise hobject_t sort */ +#define CEPH_OSDMAP_REQUIRE_JEWEL (1<<16) /* require jewel for booting osds */ +#define CEPH_OSDMAP_REQUIRE_KRAKEN (1<<17) /* require kraken for booting osds */ +#define CEPH_OSDMAP_REQUIRE_LUMINOUS (1<<18) /* require l for booting osds */ +#define CEPH_OSDMAP_RECOVERY_DELETES (1<<19) /* deletes performed during recovery instead of peering */ /* * The error code to return when an OSD can't handle a write diff --git a/net/ceph/osd_client.c b/net/ceph/osd_client.c index b5f016cb9569..dcfbdd74dfd1 100644 --- a/net/ceph/osd_client.c +++ b/net/ceph/osd_client.c @@ -1337,6 +1337,8 @@ static enum calc_target_result calc_target(struct ceph_osd_client *osdc, bool legacy_change; bool split = false; bool sort_bitwise = ceph_osdmap_flag(osdc, CEPH_OSDMAP_SORTBITWISE); + bool recovery_deletes = ceph_osdmap_flag(osdc, + CEPH_OSDMAP_RECOVERY_DELETES); enum calc_target_result ct_res; int ret; @@ -1399,6 +1401,8 @@ static enum calc_target_result calc_target(struct ceph_osd_client *osdc, pi->pg_num, t->sort_bitwise, sort_bitwise, + t->recovery_deletes, + recovery_deletes, &last_pgid)) force_resend = true; @@ -1421,6 +1425,7 @@ static enum calc_target_result calc_target(struct ceph_osd_client *osdc, t->pg_num = pi->pg_num; t->pg_num_mask = pi->pg_num_mask; t->sort_bitwise = sort_bitwise; + t->recovery_deletes = recovery_deletes; t->osd = acting.primary; } diff --git a/net/ceph/osdmap.c b/net/ceph/osdmap.c index 0bec71fa712e..f358d0bfa76b 100644 --- a/net/ceph/osdmap.c +++ b/net/ceph/osdmap.c @@ -2082,6 +2082,8 @@ bool ceph_is_new_interval(const struct ceph_osds *old_acting, u32 new_pg_num, bool old_sort_bitwise, bool new_sort_bitwise, + bool old_recovery_deletes, + bool new_recovery_deletes, const struct ceph_pg *pgid) { return !osds_equal(old_acting, new_acting) || @@ -2089,7 +2091,8 @@ bool ceph_is_new_interval(const struct ceph_osds *old_acting, old_size != new_size || old_min_size != new_min_size || ceph_pg_is_split(pgid, old_pg_num, new_pg_num) || - old_sort_bitwise != new_sort_bitwise; + old_sort_bitwise != new_sort_bitwise || + old_recovery_deletes != new_recovery_deletes; } static int calc_pg_rank(int osd, const struct ceph_osds *acting) -- cgit v1.2.3-71-gd317 From fd40559c8657418385e42f797e0b04bfc0add748 Mon Sep 17 00:00:00 2001 From: Trond Myklebust Date: Tue, 1 Aug 2017 16:02:47 -0400 Subject: NFSv4: Fix EXCHANGE_ID corrupt verifier issue The verifier is allocated on the stack, but the EXCHANGE_ID RPC call was changed to be asynchronous by commit 8d89bd70bc939. If we interrrupt the call to rpc_wait_for_completion_task(), we can therefore end up transmitting random stack contents in lieu of the verifier. Fixes: 8d89bd70bc939 ("NFS setup async exchange_id") Cc: stable@vger.kernel.org # v4.9+ Signed-off-by: Trond Myklebust Signed-off-by: Anna Schumaker --- fs/nfs/nfs4proc.c | 11 ++++------- fs/nfs/nfs4xdr.c | 2 +- include/linux/nfs_xdr.h | 2 +- 3 files changed, 6 insertions(+), 9 deletions(-) (limited to 'include') diff --git a/fs/nfs/nfs4proc.c b/fs/nfs/nfs4proc.c index 93c1d7352238..c458a1e28b91 100644 --- a/fs/nfs/nfs4proc.c +++ b/fs/nfs/nfs4proc.c @@ -7460,7 +7460,7 @@ static void nfs4_exchange_id_done(struct rpc_task *task, void *data) cdata->res.server_scope = NULL; } /* Save the EXCHANGE_ID verifier session trunk tests */ - memcpy(clp->cl_confirm.data, cdata->args.verifier->data, + memcpy(clp->cl_confirm.data, cdata->args.verifier.data, sizeof(clp->cl_confirm.data)); } out: @@ -7497,7 +7497,6 @@ static const struct rpc_call_ops nfs4_exchange_id_call_ops = { static int _nfs4_proc_exchange_id(struct nfs_client *clp, struct rpc_cred *cred, u32 sp4_how, struct rpc_xprt *xprt) { - nfs4_verifier verifier; struct rpc_message msg = { .rpc_proc = &nfs4_procedures[NFSPROC4_CLNT_EXCHANGE_ID], .rpc_cred = cred, @@ -7521,8 +7520,7 @@ static int _nfs4_proc_exchange_id(struct nfs_client *clp, struct rpc_cred *cred, return -ENOMEM; } - if (!xprt) - nfs4_init_boot_verifier(clp, &verifier); + nfs4_init_boot_verifier(clp, &calldata->args.verifier); status = nfs4_init_uniform_client_string(clp); if (status) @@ -7563,9 +7561,8 @@ static int _nfs4_proc_exchange_id(struct nfs_client *clp, struct rpc_cred *cred, task_setup_data.rpc_xprt = xprt; task_setup_data.flags = RPC_TASK_SOFT|RPC_TASK_SOFTCONN|RPC_TASK_ASYNC; - calldata->args.verifier = &clp->cl_confirm; - } else { - calldata->args.verifier = &verifier; + memcpy(calldata->args.verifier.data, clp->cl_confirm.data, + sizeof(calldata->args.verifier.data)); } calldata->args.client = clp; #ifdef CONFIG_NFS_V4_1_MIGRATION diff --git a/fs/nfs/nfs4xdr.c b/fs/nfs/nfs4xdr.c index fa3eb361d4f8..37c8af003275 100644 --- a/fs/nfs/nfs4xdr.c +++ b/fs/nfs/nfs4xdr.c @@ -1785,7 +1785,7 @@ static void encode_exchange_id(struct xdr_stream *xdr, int len = 0; encode_op_hdr(xdr, OP_EXCHANGE_ID, decode_exchange_id_maxsz, hdr); - encode_nfs4_verifier(xdr, args->verifier); + encode_nfs4_verifier(xdr, &args->verifier); encode_string(xdr, strlen(args->client->cl_owner_id), args->client->cl_owner_id); diff --git a/include/linux/nfs_xdr.h b/include/linux/nfs_xdr.h index ca3bcc4ed4e5..62cbcb842f99 100644 --- a/include/linux/nfs_xdr.h +++ b/include/linux/nfs_xdr.h @@ -1235,7 +1235,7 @@ struct nfs41_state_protection { struct nfs41_exchange_id_args { struct nfs_client *client; - nfs4_verifier *verifier; + nfs4_verifier verifier; u32 flags; struct nfs41_state_protection state_protect; }; -- cgit v1.2.3-71-gd317 From d9535cb7b7603aeb549c697ecdf92024e4d0a650 Mon Sep 17 00:00:00 2001 From: Grygorii Strashko Date: Fri, 28 Jul 2017 17:30:02 -0500 Subject: ptp: introduce ptp auxiliary worker Many PTP drivers required to perform some asynchronous or periodic work, like periodically handling PHC counter overflow or handle delayed timestamp for RX/TX network packets. In most of the cases, such work is implemented using workqueues. Unfortunately, Kernel workqueues might introduce significant delay in work scheduling under high system load and on -RT, which could cause misbehavior of PTP drivers due to internal counter overflow, for example, and there is no way to tune its execution policy and priority manuallly. Hence, The kthread_worker can be used insted of workqueues, as it create separte named kthread for each worker and its its execution policy and priority can be configured using chrt tool. This prblem was reported for two drivers TI CPSW CPTS and dp83640, so instead of modifying each of these driver it was proposed to add PTP auxiliary worker to the PHC subsystem. The patch adds PTP auxiliary worker in PHC subsystem using kthread_worker and kthread_delayed_work and introduces two new PHC subsystem APIs: - long (*do_aux_work)(struct ptp_clock_info *ptp) callback in ptp_clock_info structure, which driver should assign if it require to perform asynchronous or periodic work. Driver should return the delay of the PTP next auxiliary work scheduling time (>=0) or negative value in case further scheduling is not required. - int ptp_schedule_worker(struct ptp_clock *ptp, unsigned long delay) which allows schedule PTP auxiliary work. The name of kthread_worker thread corresponds PTP PHC device name "ptp%d". Signed-off-by: Grygorii Strashko Signed-off-by: David S. Miller --- drivers/ptp/ptp_clock.c | 42 ++++++++++++++++++++++++++++++++++++++++ drivers/ptp/ptp_private.h | 3 +++ include/linux/ptp_clock_kernel.h | 20 +++++++++++++++++++ 3 files changed, 65 insertions(+) (limited to 'include') diff --git a/drivers/ptp/ptp_clock.c b/drivers/ptp/ptp_clock.c index b77435783ef3..7eacc1c4b3b1 100644 --- a/drivers/ptp/ptp_clock.c +++ b/drivers/ptp/ptp_clock.c @@ -28,6 +28,7 @@ #include #include #include +#include #include "ptp_private.h" @@ -184,6 +185,19 @@ static void delete_ptp_clock(struct posix_clock *pc) kfree(ptp); } +static void ptp_aux_kworker(struct kthread_work *work) +{ + struct ptp_clock *ptp = container_of(work, struct ptp_clock, + aux_work.work); + struct ptp_clock_info *info = ptp->info; + long delay; + + delay = info->do_aux_work(info); + + if (delay >= 0) + kthread_queue_delayed_work(ptp->kworker, &ptp->aux_work, delay); +} + /* public interface */ struct ptp_clock *ptp_clock_register(struct ptp_clock_info *info, @@ -217,6 +231,20 @@ struct ptp_clock *ptp_clock_register(struct ptp_clock_info *info, mutex_init(&ptp->pincfg_mux); init_waitqueue_head(&ptp->tsev_wq); + if (ptp->info->do_aux_work) { + char *worker_name = kasprintf(GFP_KERNEL, "ptp%d", ptp->index); + + kthread_init_delayed_work(&ptp->aux_work, ptp_aux_kworker); + ptp->kworker = kthread_create_worker(0, worker_name ? + worker_name : info->name); + kfree(worker_name); + if (IS_ERR(ptp->kworker)) { + err = PTR_ERR(ptp->kworker); + pr_err("failed to create ptp aux_worker %d\n", err); + goto kworker_err; + } + } + err = ptp_populate_pin_groups(ptp); if (err) goto no_pin_groups; @@ -259,6 +287,9 @@ no_pps: no_device: ptp_cleanup_pin_groups(ptp); no_pin_groups: + if (ptp->kworker) + kthread_destroy_worker(ptp->kworker); +kworker_err: mutex_destroy(&ptp->tsevq_mux); mutex_destroy(&ptp->pincfg_mux); ida_simple_remove(&ptp_clocks_map, index); @@ -274,6 +305,11 @@ int ptp_clock_unregister(struct ptp_clock *ptp) ptp->defunct = 1; wake_up_interruptible(&ptp->tsev_wq); + if (ptp->kworker) { + kthread_cancel_delayed_work_sync(&ptp->aux_work); + kthread_destroy_worker(ptp->kworker); + } + /* Release the clock's resources. */ if (ptp->pps_source) pps_unregister_source(ptp->pps_source); @@ -339,6 +375,12 @@ int ptp_find_pin(struct ptp_clock *ptp, } EXPORT_SYMBOL(ptp_find_pin); +int ptp_schedule_worker(struct ptp_clock *ptp, unsigned long delay) +{ + return kthread_mod_delayed_work(ptp->kworker, &ptp->aux_work, delay); +} +EXPORT_SYMBOL(ptp_schedule_worker); + /* module operations */ static void __exit ptp_exit(void) diff --git a/drivers/ptp/ptp_private.h b/drivers/ptp/ptp_private.h index d95888974d0c..b86f1bfecd6f 100644 --- a/drivers/ptp/ptp_private.h +++ b/drivers/ptp/ptp_private.h @@ -22,6 +22,7 @@ #include #include +#include #include #include #include @@ -56,6 +57,8 @@ struct ptp_clock { struct attribute_group pin_attr_group; /* 1st entry is a pointer to the real group, 2nd is NULL terminator */ const struct attribute_group *pin_attr_groups[2]; + struct kthread_worker *kworker; + struct kthread_delayed_work aux_work; }; /* diff --git a/include/linux/ptp_clock_kernel.h b/include/linux/ptp_clock_kernel.h index a026bfd089db..51349d124ee5 100644 --- a/include/linux/ptp_clock_kernel.h +++ b/include/linux/ptp_clock_kernel.h @@ -99,6 +99,11 @@ struct system_device_crosststamp; * parameter func: the desired function to use. * parameter chan: the function channel index to use. * + * @do_work: Request driver to perform auxiliary (periodic) operations + * Driver should return delay of the next auxiliary work scheduling + * time (>=0) or negative value in case further scheduling + * is not required. + * * Drivers should embed their ptp_clock_info within a private * structure, obtaining a reference to it using container_of(). * @@ -126,6 +131,7 @@ struct ptp_clock_info { struct ptp_clock_request *request, int on); int (*verify)(struct ptp_clock_info *ptp, unsigned int pin, enum ptp_pin_function func, unsigned int chan); + long (*do_aux_work)(struct ptp_clock_info *ptp); }; struct ptp_clock; @@ -211,6 +217,16 @@ extern int ptp_clock_index(struct ptp_clock *ptp); int ptp_find_pin(struct ptp_clock *ptp, enum ptp_pin_function func, unsigned int chan); +/** + * ptp_schedule_worker() - schedule ptp auxiliary work + * + * @ptp: The clock obtained from ptp_clock_register(). + * @delay: number of jiffies to wait before queuing + * See kthread_queue_delayed_work() for more info. + */ + +int ptp_schedule_worker(struct ptp_clock *ptp, unsigned long delay); + #else static inline struct ptp_clock *ptp_clock_register(struct ptp_clock_info *info, struct device *parent) @@ -225,6 +241,10 @@ static inline int ptp_clock_index(struct ptp_clock *ptp) static inline int ptp_find_pin(struct ptp_clock *ptp, enum ptp_pin_function func, unsigned int chan) { return -1; } +static inline int ptp_schedule_worker(struct ptp_clock *ptp, + unsigned long delay) +{ return -EOPNOTSUPP; } + #endif #endif -- cgit v1.2.3-71-gd317 From cdbc78ba702650b02b9e3e957dbaa725423bdf1e Mon Sep 17 00:00:00 2001 From: Jordan Crouse Date: Thu, 27 Jul 2017 10:42:35 -0600 Subject: drm/msm: Remove __user from __u64 data types __user should be used to identify user pointers and not __u64 variables containing pointers. Signed-off-by: Jordan Crouse Signed-off-by: Rob Clark --- include/uapi/drm/msm_drm.h | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) (limited to 'include') diff --git a/include/uapi/drm/msm_drm.h b/include/uapi/drm/msm_drm.h index 26c54f6d595d..ad4eb2863e70 100644 --- a/include/uapi/drm/msm_drm.h +++ b/include/uapi/drm/msm_drm.h @@ -171,7 +171,7 @@ struct drm_msm_gem_submit_cmd { __u32 size; /* in, cmdstream size */ __u32 pad; __u32 nr_relocs; /* in, number of submit_reloc's */ - __u64 __user relocs; /* in, ptr to array of submit_reloc's */ + __u64 relocs; /* in, ptr to array of submit_reloc's */ }; /* Each buffer referenced elsewhere in the cmdstream submit (ie. the @@ -215,8 +215,8 @@ struct drm_msm_gem_submit { __u32 fence; /* out */ __u32 nr_bos; /* in, number of submit_bo's */ __u32 nr_cmds; /* in, number of submit_cmd's */ - __u64 __user bos; /* in, ptr to array of submit_bo's */ - __u64 __user cmds; /* in, ptr to array of submit_cmd's */ + __u64 bos; /* in, ptr to array of submit_bo's */ + __u64 cmds; /* in, ptr to array of submit_cmd's */ __s32 fence_fd; /* in/out fence fd (see MSM_SUBMIT_FENCE_FD_IN/OUT) */ }; -- cgit v1.2.3-71-gd317 From a477b9cd37aa81a490dfa3265b7ff4f2c5a92463 Mon Sep 17 00:00:00 2001 From: Marc Zyngier Date: Tue, 1 Aug 2017 20:11:02 -0500 Subject: PCI: Add pci_reset_function_locked() The implementation of PCI workarounds may require that the device is reset from its probe function. This implies that the PCI device lock is already held, and makes calling pci_reset_function() impossible (since it will itself try to take that lock). Add pci_reset_function_locked(), which is the equivalent of pci_reset_function(), except that it requires the PCI device lock to be already held by the caller. Tested-by: Ard Biesheuvel Signed-off-by: Marc Zyngier [bhelgaas: folded in fix for conflict with 52354b9d1f46 ("PCI: Remove __pci_dev_reset() and pci_dev_reset()")] Signed-off-by: Bjorn Helgaas Cc: stable@vger.kernel.org # 4.11: 52354b9d1f46: PCI: Remove __pci_dev_reset() and pci_dev_reset() Cc: stable@vger.kernel.org # 4.11 --- drivers/pci/pci.c | 35 +++++++++++++++++++++++++++++++++++ include/linux/pci.h | 1 + 2 files changed, 36 insertions(+) (limited to 'include') diff --git a/drivers/pci/pci.c b/drivers/pci/pci.c index af0cc3456dc1..b4b7eab29400 100644 --- a/drivers/pci/pci.c +++ b/drivers/pci/pci.c @@ -4259,6 +4259,41 @@ int pci_reset_function(struct pci_dev *dev) } EXPORT_SYMBOL_GPL(pci_reset_function); +/** + * pci_reset_function_locked - quiesce and reset a PCI device function + * @dev: PCI device to reset + * + * Some devices allow an individual function to be reset without affecting + * other functions in the same device. The PCI device must be responsive + * to PCI config space in order to use this function. + * + * This function does not just reset the PCI portion of a device, but + * clears all the state associated with the device. This function differs + * from __pci_reset_function() in that it saves and restores device state + * over the reset. It also differs from pci_reset_function() in that it + * requires the PCI device lock to be held. + * + * Returns 0 if the device function was successfully reset or negative if the + * device doesn't support resetting a single function. + */ +int pci_reset_function_locked(struct pci_dev *dev) +{ + int rc; + + rc = pci_probe_reset_function(dev); + if (rc) + return rc; + + pci_dev_save_and_disable(dev); + + rc = __pci_reset_function_locked(dev); + + pci_dev_restore(dev); + + return rc; +} +EXPORT_SYMBOL_GPL(pci_reset_function_locked); + /** * pci_try_reset_function - quiesce and reset a PCI device function * @dev: PCI device to reset diff --git a/include/linux/pci.h b/include/linux/pci.h index 4869e66dd659..a75c13673852 100644 --- a/include/linux/pci.h +++ b/include/linux/pci.h @@ -1067,6 +1067,7 @@ void pcie_flr(struct pci_dev *dev); int __pci_reset_function(struct pci_dev *dev); int __pci_reset_function_locked(struct pci_dev *dev); int pci_reset_function(struct pci_dev *dev); +int pci_reset_function_locked(struct pci_dev *dev); int pci_try_reset_function(struct pci_dev *dev); int pci_probe_reset_slot(struct pci_slot *slot); int pci_reset_slot(struct pci_slot *slot); -- cgit v1.2.3-71-gd317 From 6d29231000bbe0fb9e4893a9c68151ffdd3b5469 Mon Sep 17 00:00:00 2001 From: Boris Brezillon Date: Mon, 31 Jul 2017 10:31:27 +0200 Subject: mtd: nand: Declare tBERS, tR and tPROG as u64 to avoid integer overflow All timings in nand_sdr_timings are expressed in picoseconds but some of them may not fit in an u32. Signed-off-by: Boris Brezillon Fixes: 204e7ecd47e2 ("mtd: nand: Add a few more timings to nand_sdr_timings") Reported-by: Alexander Dahl Cc: Reviewed-by: Alexander Dahl Tested-by: Alexander Dahl Signed-off-by: Boris Brezillon --- drivers/mtd/nand/nand_timings.c | 6 +++--- include/linux/mtd/nand.h | 6 +++--- 2 files changed, 6 insertions(+), 6 deletions(-) (limited to 'include') diff --git a/drivers/mtd/nand/nand_timings.c b/drivers/mtd/nand/nand_timings.c index f06312df3669..7e36d7d13c26 100644 --- a/drivers/mtd/nand/nand_timings.c +++ b/drivers/mtd/nand/nand_timings.c @@ -311,9 +311,9 @@ int onfi_init_data_interface(struct nand_chip *chip, struct nand_sdr_timings *timings = &iface->timings.sdr; /* microseconds -> picoseconds */ - timings->tPROG_max = 1000000UL * le16_to_cpu(params->t_prog); - timings->tBERS_max = 1000000UL * le16_to_cpu(params->t_bers); - timings->tR_max = 1000000UL * le16_to_cpu(params->t_r); + timings->tPROG_max = 1000000ULL * le16_to_cpu(params->t_prog); + timings->tBERS_max = 1000000ULL * le16_to_cpu(params->t_bers); + timings->tR_max = 1000000ULL * le16_to_cpu(params->t_r); /* nanoseconds -> picoseconds */ timings->tCCS_min = 1000UL * le16_to_cpu(params->t_ccs); diff --git a/include/linux/mtd/nand.h b/include/linux/mtd/nand.h index 892148c448cc..5216d2eb2289 100644 --- a/include/linux/mtd/nand.h +++ b/include/linux/mtd/nand.h @@ -681,10 +681,10 @@ struct nand_buffers { * @tWW_min: WP# transition to WE# low */ struct nand_sdr_timings { - u32 tBERS_max; + u64 tBERS_max; u32 tCCS_min; - u32 tPROG_max; - u32 tR_max; + u64 tPROG_max; + u64 tR_max; u32 tALH_min; u32 tADL_min; u32 tALS_min; -- cgit v1.2.3-71-gd317 From c994f778bb1cca8ebe7a4e528cefec233e93b5cc Mon Sep 17 00:00:00 2001 From: Inbar Karmy Date: Tue, 1 Aug 2017 16:43:43 +0300 Subject: net/mlx4_en: Fix wrong indication of Wake-on-LAN (WoL) support Currently when WoL is supported but disabled, ethtool reports: "Supports Wake-on: d". Fix the indication of Wol support, so that the indication remains "g" all the time if the NIC supports WoL. Tested: As accepted, when NIC supports WoL- ethtool reports: Supports Wake-on: g Wake-on: d when NIC doesn't support WoL- ethtool reports: Supports Wake-on: d Wake-on: d Fixes: 14c07b1358ed ("mlx4: Wake on LAN support") Signed-off-by: Inbar Karmy Signed-off-by: Tariq Toukan Signed-off-by: David S. Miller --- drivers/net/ethernet/mellanox/mlx4/en_ethtool.c | 15 ++++++++------- drivers/net/ethernet/mellanox/mlx4/fw.c | 4 ++++ drivers/net/ethernet/mellanox/mlx4/fw.h | 1 + drivers/net/ethernet/mellanox/mlx4/main.c | 2 ++ include/linux/mlx4/device.h | 1 + 5 files changed, 16 insertions(+), 7 deletions(-) (limited to 'include') diff --git a/drivers/net/ethernet/mellanox/mlx4/en_ethtool.c b/drivers/net/ethernet/mellanox/mlx4/en_ethtool.c index c751a1d434ad..3d4e4a5d00d1 100644 --- a/drivers/net/ethernet/mellanox/mlx4/en_ethtool.c +++ b/drivers/net/ethernet/mellanox/mlx4/en_ethtool.c @@ -223,6 +223,7 @@ static void mlx4_en_get_wol(struct net_device *netdev, struct ethtool_wolinfo *wol) { struct mlx4_en_priv *priv = netdev_priv(netdev); + struct mlx4_caps *caps = &priv->mdev->dev->caps; int err = 0; u64 config = 0; u64 mask; @@ -235,24 +236,24 @@ static void mlx4_en_get_wol(struct net_device *netdev, mask = (priv->port == 1) ? MLX4_DEV_CAP_FLAG_WOL_PORT1 : MLX4_DEV_CAP_FLAG_WOL_PORT2; - if (!(priv->mdev->dev->caps.flags & mask)) { + if (!(caps->flags & mask)) { wol->supported = 0; wol->wolopts = 0; return; } + if (caps->wol_port[priv->port]) + wol->supported = WAKE_MAGIC; + else + wol->supported = 0; + err = mlx4_wol_read(priv->mdev->dev, &config, priv->port); if (err) { en_err(priv, "Failed to get WoL information\n"); return; } - if (config & MLX4_EN_WOL_MAGIC) - wol->supported = WAKE_MAGIC; - else - wol->supported = 0; - - if (config & MLX4_EN_WOL_ENABLED) + if ((config & MLX4_EN_WOL_ENABLED) && (config & MLX4_EN_WOL_MAGIC)) wol->wolopts = WAKE_MAGIC; else wol->wolopts = 0; diff --git a/drivers/net/ethernet/mellanox/mlx4/fw.c b/drivers/net/ethernet/mellanox/mlx4/fw.c index 37e84a59e751..c165f16623a9 100644 --- a/drivers/net/ethernet/mellanox/mlx4/fw.c +++ b/drivers/net/ethernet/mellanox/mlx4/fw.c @@ -764,6 +764,7 @@ int mlx4_QUERY_DEV_CAP(struct mlx4_dev *dev, struct mlx4_dev_cap *dev_cap) #define QUERY_DEV_CAP_CQ_TS_SUPPORT_OFFSET 0x3e #define QUERY_DEV_CAP_MAX_PKEY_OFFSET 0x3f #define QUERY_DEV_CAP_EXT_FLAGS_OFFSET 0x40 +#define QUERY_DEV_CAP_WOL_OFFSET 0x43 #define QUERY_DEV_CAP_FLAGS_OFFSET 0x44 #define QUERY_DEV_CAP_RSVD_UAR_OFFSET 0x48 #define QUERY_DEV_CAP_UAR_SZ_OFFSET 0x49 @@ -920,6 +921,9 @@ int mlx4_QUERY_DEV_CAP(struct mlx4_dev *dev, struct mlx4_dev_cap *dev_cap) MLX4_GET(ext_flags, outbox, QUERY_DEV_CAP_EXT_FLAGS_OFFSET); MLX4_GET(flags, outbox, QUERY_DEV_CAP_FLAGS_OFFSET); dev_cap->flags = flags | (u64)ext_flags << 32; + MLX4_GET(field, outbox, QUERY_DEV_CAP_WOL_OFFSET); + dev_cap->wol_port[1] = !!(field & 0x20); + dev_cap->wol_port[2] = !!(field & 0x40); MLX4_GET(field, outbox, QUERY_DEV_CAP_RSVD_UAR_OFFSET); dev_cap->reserved_uars = field >> 4; MLX4_GET(field, outbox, QUERY_DEV_CAP_UAR_SZ_OFFSET); diff --git a/drivers/net/ethernet/mellanox/mlx4/fw.h b/drivers/net/ethernet/mellanox/mlx4/fw.h index 5343a0599253..b52ba01aa486 100644 --- a/drivers/net/ethernet/mellanox/mlx4/fw.h +++ b/drivers/net/ethernet/mellanox/mlx4/fw.h @@ -129,6 +129,7 @@ struct mlx4_dev_cap { u32 dmfs_high_rate_qpn_range; struct mlx4_rate_limit_caps rl_caps; struct mlx4_port_cap port_cap[MLX4_MAX_PORTS + 1]; + bool wol_port[MLX4_MAX_PORTS + 1]; }; struct mlx4_func_cap { diff --git a/drivers/net/ethernet/mellanox/mlx4/main.c b/drivers/net/ethernet/mellanox/mlx4/main.c index a27c9c13a36e..09b9bc17bce9 100644 --- a/drivers/net/ethernet/mellanox/mlx4/main.c +++ b/drivers/net/ethernet/mellanox/mlx4/main.c @@ -424,6 +424,8 @@ static int mlx4_dev_cap(struct mlx4_dev *dev, struct mlx4_dev_cap *dev_cap) dev->caps.stat_rate_support = dev_cap->stat_rate_support; dev->caps.max_gso_sz = dev_cap->max_gso_sz; dev->caps.max_rss_tbl_sz = dev_cap->max_rss_tbl_sz; + dev->caps.wol_port[1] = dev_cap->wol_port[1]; + dev->caps.wol_port[2] = dev_cap->wol_port[2]; /* Save uar page shift */ if (!mlx4_is_slave(dev)) { diff --git a/include/linux/mlx4/device.h b/include/linux/mlx4/device.h index aad5d81dfb44..b54517c05e9a 100644 --- a/include/linux/mlx4/device.h +++ b/include/linux/mlx4/device.h @@ -620,6 +620,7 @@ struct mlx4_caps { u32 dmfs_high_rate_qpn_base; u32 dmfs_high_rate_qpn_range; u32 vf_caps; + bool wol_port[MLX4_MAX_PORTS + 1]; struct mlx4_rate_limit_caps rl_caps; }; -- cgit v1.2.3-71-gd317 From 3898da947bbaf9e7fd5816e825978d360028bba2 Mon Sep 17 00:00:00 2001 From: Paolo Bonzini Date: Wed, 2 Aug 2017 17:55:54 +0200 Subject: KVM: avoid using rcu_dereference_protected MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit During teardown, accesses to memslots and buses are using rcu_dereference_protected with an always-true condition because these accesses are done outside the usual mutexes. This is because the last reference is gone and there cannot be any concurrent modifications, but rcu_dereference_protected is ugly and unobvious. Instead, check the refcount in kvm_get_bus and __kvm_memslots. Signed-off-by: Paolo Bonzini Signed-off-by: Radim Krčmář --- include/linux/kvm_host.h | 6 ++++-- virt/kvm/kvm_main.c | 11 ++++------- 2 files changed, 8 insertions(+), 9 deletions(-) (limited to 'include') diff --git a/include/linux/kvm_host.h b/include/linux/kvm_host.h index 890b706d1943..21a6fd6c44af 100644 --- a/include/linux/kvm_host.h +++ b/include/linux/kvm_host.h @@ -477,7 +477,8 @@ struct kvm { static inline struct kvm_io_bus *kvm_get_bus(struct kvm *kvm, enum kvm_bus idx) { return srcu_dereference_check(kvm->buses[idx], &kvm->srcu, - lockdep_is_held(&kvm->slots_lock)); + lockdep_is_held(&kvm->slots_lock) || + !refcount_read(&kvm->users_count)); } static inline struct kvm_vcpu *kvm_get_vcpu(struct kvm *kvm, int i) @@ -570,7 +571,8 @@ void kvm_put_kvm(struct kvm *kvm); static inline struct kvm_memslots *__kvm_memslots(struct kvm *kvm, int as_id) { return srcu_dereference_check(kvm->memslots[as_id], &kvm->srcu, - lockdep_is_held(&kvm->slots_lock)); + lockdep_is_held(&kvm->slots_lock) || + !refcount_read(&kvm->users_count)); } static inline struct kvm_memslots *kvm_memslots(struct kvm *kvm) diff --git a/virt/kvm/kvm_main.c b/virt/kvm/kvm_main.c index f3f74271f1a9..15252d723b54 100644 --- a/virt/kvm/kvm_main.c +++ b/virt/kvm/kvm_main.c @@ -717,10 +717,9 @@ out_err_no_srcu: hardware_disable_all(); out_err_no_disable: for (i = 0; i < KVM_NR_BUSES; i++) - kfree(rcu_access_pointer(kvm->buses[i])); + kfree(kvm_get_bus(kvm, i)); for (i = 0; i < KVM_ADDRESS_SPACE_NUM; i++) - kvm_free_memslots(kvm, - rcu_dereference_protected(kvm->memslots[i], 1)); + kvm_free_memslots(kvm, __kvm_memslots(kvm, i)); kvm_arch_free_vm(kvm); mmdrop(current->mm); return ERR_PTR(r); @@ -754,9 +753,8 @@ static void kvm_destroy_vm(struct kvm *kvm) spin_unlock(&kvm_lock); kvm_free_irq_routing(kvm); for (i = 0; i < KVM_NR_BUSES; i++) { - struct kvm_io_bus *bus; + struct kvm_io_bus *bus = kvm_get_bus(kvm, i); - bus = rcu_dereference_protected(kvm->buses[i], 1); if (bus) kvm_io_bus_destroy(bus); kvm->buses[i] = NULL; @@ -770,8 +768,7 @@ static void kvm_destroy_vm(struct kvm *kvm) kvm_arch_destroy_vm(kvm); kvm_destroy_devices(kvm); for (i = 0; i < KVM_ADDRESS_SPACE_NUM; i++) - kvm_free_memslots(kvm, - rcu_dereference_protected(kvm->memslots[i], 1)); + kvm_free_memslots(kvm, __kvm_memslots(kvm, i)); cleanup_srcu_struct(&kvm->irq_srcu); cleanup_srcu_struct(&kvm->srcu); kvm_arch_free_vm(kvm); -- cgit v1.2.3-71-gd317 From 3ea277194daaeaa84ce75180ec7c7a2075027a68 Mon Sep 17 00:00:00 2001 From: Mel Gorman Date: Wed, 2 Aug 2017 13:31:52 -0700 Subject: mm, mprotect: flush TLB if potentially racing with a parallel reclaim leaving stale TLB entries Nadav Amit identified a theoritical race between page reclaim and mprotect due to TLB flushes being batched outside of the PTL being held. He described the race as follows: CPU0 CPU1 ---- ---- user accesses memory using RW PTE [PTE now cached in TLB] try_to_unmap_one() ==> ptep_get_and_clear() ==> set_tlb_ubc_flush_pending() mprotect(addr, PROT_READ) ==> change_pte_range() ==> [ PTE non-present - no flush ] user writes using cached RW PTE ... try_to_unmap_flush() The same type of race exists for reads when protecting for PROT_NONE and also exists for operations that can leave an old TLB entry behind such as munmap, mremap and madvise. For some operations like mprotect, it's not necessarily a data integrity issue but it is a correctness issue as there is a window where an mprotect that limits access still allows access. For munmap, it's potentially a data integrity issue although the race is massive as an munmap, mmap and return to userspace must all complete between the window when reclaim drops the PTL and flushes the TLB. However, it's theoritically possible so handle this issue by flushing the mm if reclaim is potentially currently batching TLB flushes. Other instances where a flush is required for a present pte should be ok as either the page lock is held preventing parallel reclaim or a page reference count is elevated preventing a parallel free leading to corruption. In the case of page_mkclean there isn't an obvious path that userspace could take advantage of without using the operations that are guarded by this patch. Other users such as gup as a race with reclaim looks just at PTEs. huge page variants should be ok as they don't race with reclaim. mincore only looks at PTEs. userfault also should be ok as if a parallel reclaim takes place, it will either fault the page back in or read some of the data before the flush occurs triggering a fault. Note that a variant of this patch was acked by Andy Lutomirski but this was for the x86 parts on top of his PCID work which didn't make the 4.13 merge window as expected. His ack is dropped from this version and there will be a follow-on patch on top of PCID that will include his ack. [akpm@linux-foundation.org: tweak comments] [akpm@linux-foundation.org: fix spello] Link: http://lkml.kernel.org/r/20170717155523.emckq2esjro6hf3z@suse.de Reported-by: Nadav Amit Signed-off-by: Mel Gorman Cc: Andy Lutomirski Cc: [v4.4+] Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/linux/mm_types.h | 4 ++++ mm/internal.h | 5 ++++- mm/madvise.c | 1 + mm/memory.c | 1 + mm/mprotect.c | 1 + mm/mremap.c | 1 + mm/rmap.c | 36 ++++++++++++++++++++++++++++++++++++ 7 files changed, 48 insertions(+), 1 deletion(-) (limited to 'include') diff --git a/include/linux/mm_types.h b/include/linux/mm_types.h index ff151814a02d..7f384bb62d8e 100644 --- a/include/linux/mm_types.h +++ b/include/linux/mm_types.h @@ -494,6 +494,10 @@ struct mm_struct { * PROT_NONE or PROT_NUMA mapped page. */ bool tlb_flush_pending; +#endif +#ifdef CONFIG_ARCH_WANT_BATCHED_UNMAP_TLB_FLUSH + /* See flush_tlb_batched_pending() */ + bool tlb_flush_batched; #endif struct uprobes_state uprobes_state; #ifdef CONFIG_HUGETLB_PAGE diff --git a/mm/internal.h b/mm/internal.h index 24d88f084705..4ef49fc55e58 100644 --- a/mm/internal.h +++ b/mm/internal.h @@ -498,6 +498,7 @@ extern struct workqueue_struct *mm_percpu_wq; #ifdef CONFIG_ARCH_WANT_BATCHED_UNMAP_TLB_FLUSH void try_to_unmap_flush(void); void try_to_unmap_flush_dirty(void); +void flush_tlb_batched_pending(struct mm_struct *mm); #else static inline void try_to_unmap_flush(void) { @@ -505,7 +506,9 @@ static inline void try_to_unmap_flush(void) static inline void try_to_unmap_flush_dirty(void) { } - +static inline void flush_tlb_batched_pending(struct mm_struct *mm) +{ +} #endif /* CONFIG_ARCH_WANT_BATCHED_UNMAP_TLB_FLUSH */ extern const struct trace_print_flags pageflag_names[]; diff --git a/mm/madvise.c b/mm/madvise.c index 9976852f1e1c..47d8d8a25eae 100644 --- a/mm/madvise.c +++ b/mm/madvise.c @@ -320,6 +320,7 @@ static int madvise_free_pte_range(pmd_t *pmd, unsigned long addr, tlb_remove_check_page_size_change(tlb, PAGE_SIZE); orig_pte = pte = pte_offset_map_lock(mm, pmd, addr, &ptl); + flush_tlb_batched_pending(mm); arch_enter_lazy_mmu_mode(); for (; addr != end; pte++, addr += PAGE_SIZE) { ptent = *pte; diff --git a/mm/memory.c b/mm/memory.c index 0e517be91a89..f65beaad319b 100644 --- a/mm/memory.c +++ b/mm/memory.c @@ -1197,6 +1197,7 @@ again: init_rss_vec(rss); start_pte = pte_offset_map_lock(mm, pmd, addr, &ptl); pte = start_pte; + flush_tlb_batched_pending(mm); arch_enter_lazy_mmu_mode(); do { pte_t ptent = *pte; diff --git a/mm/mprotect.c b/mm/mprotect.c index 1a8c9ca83e48..4180ad8cc9c5 100644 --- a/mm/mprotect.c +++ b/mm/mprotect.c @@ -64,6 +64,7 @@ static unsigned long change_pte_range(struct vm_area_struct *vma, pmd_t *pmd, atomic_read(&vma->vm_mm->mm_users) == 1) target_node = numa_node_id(); + flush_tlb_batched_pending(vma->vm_mm); arch_enter_lazy_mmu_mode(); do { oldpte = *pte; diff --git a/mm/mremap.c b/mm/mremap.c index cd8a1b199ef9..6e3d857458de 100644 --- a/mm/mremap.c +++ b/mm/mremap.c @@ -152,6 +152,7 @@ static void move_ptes(struct vm_area_struct *vma, pmd_t *old_pmd, new_ptl = pte_lockptr(mm, new_pmd); if (new_ptl != old_ptl) spin_lock_nested(new_ptl, SINGLE_DEPTH_NESTING); + flush_tlb_batched_pending(vma->vm_mm); arch_enter_lazy_mmu_mode(); for (; old_addr < old_end; old_pte++, old_addr += PAGE_SIZE, diff --git a/mm/rmap.c b/mm/rmap.c index ced14f1af6dc..c8993c63eb25 100644 --- a/mm/rmap.c +++ b/mm/rmap.c @@ -604,6 +604,13 @@ static void set_tlb_ubc_flush_pending(struct mm_struct *mm, bool writable) arch_tlbbatch_add_mm(&tlb_ubc->arch, mm); tlb_ubc->flush_required = true; + /* + * Ensure compiler does not re-order the setting of tlb_flush_batched + * before the PTE is cleared. + */ + barrier(); + mm->tlb_flush_batched = true; + /* * If the PTE was dirty then it's best to assume it's writable. The * caller must use try_to_unmap_flush_dirty() or try_to_unmap_flush() @@ -631,6 +638,35 @@ static bool should_defer_flush(struct mm_struct *mm, enum ttu_flags flags) return should_defer; } + +/* + * Reclaim unmaps pages under the PTL but do not flush the TLB prior to + * releasing the PTL if TLB flushes are batched. It's possible for a parallel + * operation such as mprotect or munmap to race between reclaim unmapping + * the page and flushing the page. If this race occurs, it potentially allows + * access to data via a stale TLB entry. Tracking all mm's that have TLB + * batching in flight would be expensive during reclaim so instead track + * whether TLB batching occurred in the past and if so then do a flush here + * if required. This will cost one additional flush per reclaim cycle paid + * by the first operation at risk such as mprotect and mumap. + * + * This must be called under the PTL so that an access to tlb_flush_batched + * that is potentially a "reclaim vs mprotect/munmap/etc" race will synchronise + * via the PTL. + */ +void flush_tlb_batched_pending(struct mm_struct *mm) +{ + if (mm->tlb_flush_batched) { + flush_tlb_mm(mm); + + /* + * Do not allow the compiler to re-order the clearing of + * tlb_flush_batched before the tlb is flushed. + */ + barrier(); + mm->tlb_flush_batched = false; + } +} #else static void set_tlb_ubc_flush_pending(struct mm_struct *mm, bool writable) { -- cgit v1.2.3-71-gd317 From d16977f3a6cfbb5e9ce477f423a1bf343347c1ed Mon Sep 17 00:00:00 2001 From: Jonathan Corbet Date: Wed, 2 Aug 2017 13:32:01 -0700 Subject: kthread: fix documentation build warning The kerneldoc comment for kthread_create() had an incorrect argument name, leading to a warning in the docs build. Correct it, and make one more small step toward a warning-free build. Link: http://lkml.kernel.org/r/20170724135916.7f486c6f@lwn.net Signed-off-by: Jonathan Corbet Cc: Randy Dunlap Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/linux/kthread.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'include') diff --git a/include/linux/kthread.h b/include/linux/kthread.h index 4fec8b775895..82e197eeac91 100644 --- a/include/linux/kthread.h +++ b/include/linux/kthread.h @@ -15,7 +15,7 @@ struct task_struct *kthread_create_on_node(int (*threadfn)(void *data), * @threadfn: the function to run in the thread * @data: data pointer for @threadfn() * @namefmt: printf-style format string for the thread name - * @...: arguments for @namefmt. + * @arg...: arguments for @namefmt. * * This macro will create a kthread on the current node, leaving it in * the stopped state. This is just a helper for kthread_create_on_node(); -- cgit v1.2.3-71-gd317 From 89affbf5d9ebb15c6460596822e8857ea2f9e735 Mon Sep 17 00:00:00 2001 From: Dima Zavin Date: Wed, 2 Aug 2017 13:32:18 -0700 Subject: cpuset: fix a deadlock due to incomplete patching of cpusets_enabled() In codepaths that use the begin/retry interface for reading mems_allowed_seq with irqs disabled, there exists a race condition that stalls the patch process after only modifying a subset of the static_branch call sites. This problem manifested itself as a deadlock in the slub allocator, inside get_any_partial. The loop reads mems_allowed_seq value (via read_mems_allowed_begin), performs the defrag operation, and then verifies the consistency of mem_allowed via the read_mems_allowed_retry and the cookie returned by xxx_begin. The issue here is that both begin and retry first check if cpusets are enabled via cpusets_enabled() static branch. This branch can be rewritted dynamically (via cpuset_inc) if a new cpuset is created. The x86 jump label code fully synchronizes across all CPUs for every entry it rewrites. If it rewrites only one of the callsites (specifically the one in read_mems_allowed_retry) and then waits for the smp_call_function(do_sync_core) to complete while a CPU is inside the begin/retry section with IRQs off and the mems_allowed value is changed, we can hang. This is because begin() will always return 0 (since it wasn't patched yet) while retry() will test the 0 against the actual value of the seq counter. The fix is to use two different static keys: one for begin (pre_enable_key) and one for retry (enable_key). In cpuset_inc(), we first bump the pre_enable key to ensure that cpuset_mems_allowed_begin() always return a valid seqcount if are enabling cpusets. Similarly, when disabling cpusets via cpuset_dec(), we first ensure that callers of cpuset_mems_allowed_retry() will start ignoring the seqcount value before we let cpuset_mems_allowed_begin() return 0. The relevant stack traces of the two stuck threads: CPU: 1 PID: 1415 Comm: mkdir Tainted: G L 4.9.36-00104-g540c51286237 #4 Hardware name: Default string Default string/Hardware, BIOS 4.29.1-20170526215256 05/26/2017 task: ffff8817f9c28000 task.stack: ffffc9000ffa4000 RIP: smp_call_function_many+0x1f9/0x260 Call Trace: smp_call_function+0x3b/0x70 on_each_cpu+0x2f/0x90 text_poke_bp+0x87/0xd0 arch_jump_label_transform+0x93/0x100 __jump_label_update+0x77/0x90 jump_label_update+0xaa/0xc0 static_key_slow_inc+0x9e/0xb0 cpuset_css_online+0x70/0x2e0 online_css+0x2c/0xa0 cgroup_apply_control_enable+0x27f/0x3d0 cgroup_mkdir+0x2b7/0x420 kernfs_iop_mkdir+0x5a/0x80 vfs_mkdir+0xf6/0x1a0 SyS_mkdir+0xb7/0xe0 entry_SYSCALL_64_fastpath+0x18/0xad ... CPU: 2 PID: 1 Comm: init Tainted: G L 4.9.36-00104-g540c51286237 #4 Hardware name: Default string Default string/Hardware, BIOS 4.29.1-20170526215256 05/26/2017 task: ffff8818087c0000 task.stack: ffffc90000030000 RIP: int3+0x39/0x70 Call Trace: <#DB> ? ___slab_alloc+0x28b/0x5a0 ? copy_process.part.40+0xf7/0x1de0 __slab_alloc.isra.80+0x54/0x90 copy_process.part.40+0xf7/0x1de0 copy_process.part.40+0xf7/0x1de0 kmem_cache_alloc_node+0x8a/0x280 copy_process.part.40+0xf7/0x1de0 _do_fork+0xe7/0x6c0 _raw_spin_unlock_irq+0x2d/0x60 trace_hardirqs_on_caller+0x136/0x1d0 entry_SYSCALL_64_fastpath+0x5/0xad do_syscall_64+0x27/0x350 SyS_clone+0x19/0x20 do_syscall_64+0x60/0x350 entry_SYSCALL64_slow_path+0x25/0x25 Link: http://lkml.kernel.org/r/20170731040113.14197-1-dmitriyz@waymo.com Fixes: 46e700abc44c ("mm, page_alloc: remove unnecessary taking of a seqlock when cpusets are disabled") Signed-off-by: Dima Zavin Reported-by: Cliff Spradlin Acked-by: Vlastimil Babka Cc: Peter Zijlstra Cc: Christopher Lameter Cc: Li Zefan Cc: Pekka Enberg Cc: David Rientjes Cc: Joonsoo Kim Cc: Mel Gorman Cc: Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/linux/cpuset.h | 19 +++++++++++++++++-- kernel/cgroup/cpuset.c | 1 + 2 files changed, 18 insertions(+), 2 deletions(-) (limited to 'include') diff --git a/include/linux/cpuset.h b/include/linux/cpuset.h index 119a3f9604b0..898cfe2eeb42 100644 --- a/include/linux/cpuset.h +++ b/include/linux/cpuset.h @@ -18,6 +18,19 @@ #ifdef CONFIG_CPUSETS +/* + * Static branch rewrites can happen in an arbitrary order for a given + * key. In code paths where we need to loop with read_mems_allowed_begin() and + * read_mems_allowed_retry() to get a consistent view of mems_allowed, we need + * to ensure that begin() always gets rewritten before retry() in the + * disabled -> enabled transition. If not, then if local irqs are disabled + * around the loop, we can deadlock since retry() would always be + * comparing the latest value of the mems_allowed seqcount against 0 as + * begin() still would see cpusets_enabled() as false. The enabled -> disabled + * transition should happen in reverse order for the same reasons (want to stop + * looking at real value of mems_allowed.sequence in retry() first). + */ +extern struct static_key_false cpusets_pre_enable_key; extern struct static_key_false cpusets_enabled_key; static inline bool cpusets_enabled(void) { @@ -32,12 +45,14 @@ static inline int nr_cpusets(void) static inline void cpuset_inc(void) { + static_branch_inc(&cpusets_pre_enable_key); static_branch_inc(&cpusets_enabled_key); } static inline void cpuset_dec(void) { static_branch_dec(&cpusets_enabled_key); + static_branch_dec(&cpusets_pre_enable_key); } extern int cpuset_init(void); @@ -115,7 +130,7 @@ extern void cpuset_print_current_mems_allowed(void); */ static inline unsigned int read_mems_allowed_begin(void) { - if (!cpusets_enabled()) + if (!static_branch_unlikely(&cpusets_pre_enable_key)) return 0; return read_seqcount_begin(¤t->mems_allowed_seq); @@ -129,7 +144,7 @@ static inline unsigned int read_mems_allowed_begin(void) */ static inline bool read_mems_allowed_retry(unsigned int seq) { - if (!cpusets_enabled()) + if (!static_branch_unlikely(&cpusets_enabled_key)) return false; return read_seqcount_retry(¤t->mems_allowed_seq, seq); diff --git a/kernel/cgroup/cpuset.c b/kernel/cgroup/cpuset.c index ca8376e5008c..8d5151688504 100644 --- a/kernel/cgroup/cpuset.c +++ b/kernel/cgroup/cpuset.c @@ -63,6 +63,7 @@ #include #include +DEFINE_STATIC_KEY_FALSE(cpusets_pre_enable_key); DEFINE_STATIC_KEY_FALSE(cpusets_enabled_key); /* See "Frequency meter" comments, below. */ -- cgit v1.2.3-71-gd317 From 1ee1c3f5b5cff959e0ac95a125bd15eaf88cc638 Mon Sep 17 00:00:00 2001 From: Kan Liang Date: Wed, 2 Aug 2017 13:32:27 -0700 Subject: mm: allow page_cache_get_speculative in interrupt context Kernel panic when calling the IRQ-safe __get_user_pages_fast in NMI handler. The bug was introduced by commit 2947ba054a4d ("x86/mm/gup: Switch GUP to the generic get_user_page_fast() implementation"). The original x86 __get_user_page_fast used plain get_page() or page_ref_add(). However, the generic __get_user_page_fast uses page_cache_get_speculative(), which has VM_BUG_ON(in_interrupt()). There is no reason to prevent page_cache_get_speculative from using in interrupt context. According to the author, putting a BUG_ON there is just because the code is not verifying correctness of interrupt races. I did some tests in interrupt context. There is no issue found. Removing VM_BUG_ON(in_interrupt()) for page_cache_get_speculative(). Link: http://lkml.kernel.org/r/1501609146-59730-1-git-send-email-kan.liang@intel.com Fixes: 2947ba054a4d ("x86/mm/gup: Switch GUP to the generic get_user_page_fast() implementation") Signed-off-by: Kan Liang Cc: Jens Axboe Cc: Al Viro Cc: Kirill A. Shutemov Cc: Ying Huang Cc: Nicholas Piggin Cc: Ingo Molnar Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/linux/pagemap.h | 2 -- 1 file changed, 2 deletions(-) (limited to 'include') diff --git a/include/linux/pagemap.h b/include/linux/pagemap.h index baa9344dcd10..79b36f57c3ba 100644 --- a/include/linux/pagemap.h +++ b/include/linux/pagemap.h @@ -163,8 +163,6 @@ void release_pages(struct page **pages, int nr, bool cold); */ static inline int page_cache_get_speculative(struct page *page) { - VM_BUG_ON(in_interrupt()); - #ifdef CONFIG_TINY_RCU # ifdef CONFIG_PREEMPT_COUNT VM_BUG_ON(!in_atomic() && !irqs_disabled()); -- cgit v1.2.3-71-gd317 From e1a10ef7fa876f8510aaec36ea5c0cf34baba410 Mon Sep 17 00:00:00 2001 From: Neal Cardwell Date: Thu, 3 Aug 2017 09:19:52 -0400 Subject: tcp: introduce tcp_rto_delta_us() helper for xmit timer fix Pure refactor. This helper will be required in the xmit timer fix later in the patch series. (Because the TLP logic will want to make this calculation.) Fixes: 6ba8a3b19e76 ("tcp: Tail loss probe (TLP)") Signed-off-by: Neal Cardwell Signed-off-by: Yuchung Cheng Signed-off-by: Nandita Dukkipati Acked-by: Eric Dumazet Signed-off-by: David S. Miller --- include/net/tcp.h | 10 ++++++++++ net/ipv4/tcp_input.c | 5 +---- 2 files changed, 11 insertions(+), 4 deletions(-) (limited to 'include') diff --git a/include/net/tcp.h b/include/net/tcp.h index 70483296157f..ada65e767b28 100644 --- a/include/net/tcp.h +++ b/include/net/tcp.h @@ -1916,6 +1916,16 @@ extern void tcp_rack_advance(struct tcp_sock *tp, u8 sacked, u32 end_seq, u64 xmit_time); extern void tcp_rack_reo_timeout(struct sock *sk); +/* At how many usecs into the future should the RTO fire? */ +static inline s64 tcp_rto_delta_us(const struct sock *sk) +{ + const struct sk_buff *skb = tcp_write_queue_head(sk); + u32 rto = inet_csk(sk)->icsk_rto; + u64 rto_time_stamp_us = skb->skb_mstamp + jiffies_to_usecs(rto); + + return rto_time_stamp_us - tcp_sk(sk)->tcp_mstamp; +} + /* * Save and compile IPv4 options, return a pointer to it */ diff --git a/net/ipv4/tcp_input.c b/net/ipv4/tcp_input.c index dad026fcfd09..b9ba8d55d8b8 100644 --- a/net/ipv4/tcp_input.c +++ b/net/ipv4/tcp_input.c @@ -3004,10 +3004,7 @@ void tcp_rearm_rto(struct sock *sk) /* Offset the time elapsed after installing regular RTO */ if (icsk->icsk_pending == ICSK_TIME_REO_TIMEOUT || icsk->icsk_pending == ICSK_TIME_LOSS_PROBE) { - struct sk_buff *skb = tcp_write_queue_head(sk); - u64 rto_time_stamp = skb->skb_mstamp + - jiffies_to_usecs(rto); - s64 delta_us = rto_time_stamp - tp->tcp_mstamp; + s64 delta_us = tcp_rto_delta_us(sk); /* delta_us may not be positive if the socket is locked * when the retrans timer fires and is rescheduled. */ -- cgit v1.2.3-71-gd317 From 931b3c1a832621b4bdcbaf783096fc267eb36fbe Mon Sep 17 00:00:00 2001 From: Leon Romanovsky Date: Tue, 1 Aug 2017 09:41:37 +0300 Subject: RDMA/mlx5: Fix existence check for extended address vector The extended address vector is the highest bit in be32 variable, but it was compared with the lowest. This patch fixes the endianness of that check and removes already declared define. Fixes: 17d2f88f92ce ("IB/mlx5: Add ODP atomics support") Reviewed-by: Artemy Kovalyov Signed-off-by: Leon Romanovsky Signed-off-by: Doug Ledford --- drivers/infiniband/hw/mlx5/odp.c | 2 +- include/linux/mlx5/qp.h | 1 - 2 files changed, 1 insertion(+), 2 deletions(-) (limited to 'include') diff --git a/drivers/infiniband/hw/mlx5/odp.c b/drivers/infiniband/hw/mlx5/odp.c index ae0746754008..3d701c7a4c91 100644 --- a/drivers/infiniband/hw/mlx5/odp.c +++ b/drivers/infiniband/hw/mlx5/odp.c @@ -939,7 +939,7 @@ static int mlx5_ib_mr_initiator_pfault_handler( if (qp->ibqp.qp_type != IB_QPT_RC) { av = *wqe; - if (av->dqp_dct & be32_to_cpu(MLX5_WQE_AV_EXT)) + if (av->dqp_dct & cpu_to_be32(MLX5_EXTENDED_UD_AV)) *wqe += sizeof(struct mlx5_av); else *wqe += sizeof(struct mlx5_base_av); diff --git a/include/linux/mlx5/qp.h b/include/linux/mlx5/qp.h index 6f41270d80c0..f378dc0e7eaf 100644 --- a/include/linux/mlx5/qp.h +++ b/include/linux/mlx5/qp.h @@ -212,7 +212,6 @@ struct mlx5_wqe_ctrl_seg { #define MLX5_WQE_CTRL_OPCODE_MASK 0xff #define MLX5_WQE_CTRL_WQE_INDEX_MASK 0x00ffff00 #define MLX5_WQE_CTRL_WQE_INDEX_SHIFT 8 -#define MLX5_WQE_AV_EXT 0x80000000 enum { MLX5_ETH_WQE_L3_INNER_CSUM = 1 << 4, -- cgit v1.2.3-71-gd317 From 978d13d60c34818a41fc35962602bdfa5c03f214 Mon Sep 17 00:00:00 2001 From: Nicholas Bellinger Date: Fri, 4 Aug 2017 23:59:31 -0700 Subject: iscsi-target: Fix iscsi_np reset hung task during parallel delete This patch fixes a bug associated with iscsit_reset_np_thread() that can occur during parallel configfs rmdir of a single iscsi_np used across multiple iscsi-target instances, that would result in hung task(s) similar to below where configfs rmdir process context was blocked indefinately waiting for iscsi_np->np_restart_comp to finish: [ 6726.112076] INFO: task dcp_proxy_node_:15550 blocked for more than 120 seconds. [ 6726.119440] Tainted: G W O 4.1.26-3321 #2 [ 6726.125045] "echo 0 > /proc/sys/kernel/hung_task_timeout_secs" disables this message. [ 6726.132927] dcp_proxy_node_ D ffff8803f202bc88 0 15550 1 0x00000000 [ 6726.140058] ffff8803f202bc88 ffff88085c64d960 ffff88083b3b1ad0 ffff88087fffeb08 [ 6726.147593] ffff8803f202c000 7fffffffffffffff ffff88083f459c28 ffff88083b3b1ad0 [ 6726.155132] ffff88035373c100 ffff8803f202bca8 ffffffff8168ced2 ffff8803f202bcb8 [ 6726.162667] Call Trace: [ 6726.165150] [] schedule+0x32/0x80 [ 6726.170156] [] schedule_timeout+0x214/0x290 [ 6726.176030] [] ? __send_signal+0x52/0x4a0 [ 6726.181728] [] wait_for_completion+0x96/0x100 [ 6726.187774] [] ? wake_up_state+0x10/0x10 [ 6726.193395] [] iscsit_reset_np_thread+0x62/0xe0 [iscsi_target_mod] [ 6726.201278] [] iscsit_tpg_disable_portal_group+0x96/0x190 [iscsi_target_mod] [ 6726.210033] [] lio_target_tpg_store_enable+0x4f/0xc0 [iscsi_target_mod] [ 6726.218351] [] configfs_write_file+0xaa/0x110 [ 6726.224392] [] vfs_write+0xa4/0x1b0 [ 6726.229576] [] SyS_write+0x41/0xb0 [ 6726.234659] [] system_call_fastpath+0x12/0x71 It would happen because each iscsit_reset_np_thread() sets state to ISCSI_NP_THREAD_RESET, sends SIGINT, and then blocks waiting for completion on iscsi_np->np_restart_comp. However, if iscsi_np was active processing a login request and more than a single iscsit_reset_np_thread() caller to the same iscsi_np was blocked on iscsi_np->np_restart_comp, iscsi_np kthread process context in __iscsi_target_login_thread() would flush pending signals and only perform a single completion of np->np_restart_comp before going back to sleep within transport specific iscsit_transport->iscsi_accept_np code. To address this bug, add a iscsi_np->np_reset_count and update __iscsi_target_login_thread() to keep completing np->np_restart_comp until ->np_reset_count has reached zero. Reported-by: Gary Guo Tested-by: Gary Guo Cc: Mike Christie Cc: Hannes Reinecke Cc: stable@vger.kernel.org # 3.10+ Signed-off-by: Nicholas Bellinger --- drivers/target/iscsi/iscsi_target.c | 1 + drivers/target/iscsi/iscsi_target_login.c | 7 +++++-- include/target/iscsi/iscsi_target_core.h | 1 + 3 files changed, 7 insertions(+), 2 deletions(-) (limited to 'include') diff --git a/drivers/target/iscsi/iscsi_target.c b/drivers/target/iscsi/iscsi_target.c index 12803de99400..5001261f5d69 100644 --- a/drivers/target/iscsi/iscsi_target.c +++ b/drivers/target/iscsi/iscsi_target.c @@ -418,6 +418,7 @@ int iscsit_reset_np_thread( return 0; } np->np_thread_state = ISCSI_NP_THREAD_RESET; + atomic_inc(&np->np_reset_count); if (np->np_thread) { spin_unlock_bh(&np->np_thread_lock); diff --git a/drivers/target/iscsi/iscsi_target_login.c b/drivers/target/iscsi/iscsi_target_login.c index e9bdc8b86e7d..dc13afbd4c88 100644 --- a/drivers/target/iscsi/iscsi_target_login.c +++ b/drivers/target/iscsi/iscsi_target_login.c @@ -1243,9 +1243,11 @@ static int __iscsi_target_login_thread(struct iscsi_np *np) flush_signals(current); spin_lock_bh(&np->np_thread_lock); - if (np->np_thread_state == ISCSI_NP_THREAD_RESET) { + if (atomic_dec_if_positive(&np->np_reset_count) >= 0) { np->np_thread_state = ISCSI_NP_THREAD_ACTIVE; + spin_unlock_bh(&np->np_thread_lock); complete(&np->np_restart_comp); + return 1; } else if (np->np_thread_state == ISCSI_NP_THREAD_SHUTDOWN) { spin_unlock_bh(&np->np_thread_lock); goto exit; @@ -1278,7 +1280,8 @@ static int __iscsi_target_login_thread(struct iscsi_np *np) goto exit; } else if (rc < 0) { spin_lock_bh(&np->np_thread_lock); - if (np->np_thread_state == ISCSI_NP_THREAD_RESET) { + if (atomic_dec_if_positive(&np->np_reset_count) >= 0) { + np->np_thread_state = ISCSI_NP_THREAD_ACTIVE; spin_unlock_bh(&np->np_thread_lock); complete(&np->np_restart_comp); iscsit_put_transport(conn->conn_transport); diff --git a/include/target/iscsi/iscsi_target_core.h b/include/target/iscsi/iscsi_target_core.h index 0ca1fb08805b..fb87d32f5e51 100644 --- a/include/target/iscsi/iscsi_target_core.h +++ b/include/target/iscsi/iscsi_target_core.h @@ -786,6 +786,7 @@ struct iscsi_np { int np_sock_type; enum np_thread_state_table np_thread_state; bool enabled; + atomic_t np_reset_count; enum iscsi_timer_flags_table np_login_timer_flags; u32 np_exports; enum np_flags_table np_flags; -- cgit v1.2.3-71-gd317 From 0cca6c8920ade95e2741b2062cf1397dc546fb0f Mon Sep 17 00:00:00 2001 From: Ludovic Desroches Date: Sun, 6 Aug 2017 16:00:05 +0200 Subject: pinctrl: generic: update references to Documentation/pinctrl.txt Update deprecated references to Documentation/pinctrl.txt since it has been moved to Documentation/driver-api/pinctl.rst. Signed-off-by: Ludovic Desroches Fixes: 5a9b73832e9e ("pinctrl.txt: move it to the driver-api book") Signed-off-by: Linus Walleij --- Documentation/gpio/gpio-legacy.txt | 2 +- MAINTAINERS | 2 +- include/linux/device.h | 2 +- include/linux/pinctrl/pinconf-generic.h | 4 ++-- 4 files changed, 5 insertions(+), 5 deletions(-) (limited to 'include') diff --git a/Documentation/gpio/gpio-legacy.txt b/Documentation/gpio/gpio-legacy.txt index b34fd94f7089..5eacc147ea87 100644 --- a/Documentation/gpio/gpio-legacy.txt +++ b/Documentation/gpio/gpio-legacy.txt @@ -459,7 +459,7 @@ pin controller? This is done by registering "ranges" of pins, which are essentially cross-reference tables. These are described in -Documentation/pinctrl.txt +Documentation/driver-api/pinctl.rst While the pin allocation is totally managed by the pinctrl subsystem, gpio (under gpiolib) is still maintained by gpio drivers. It may happen diff --git a/MAINTAINERS b/MAINTAINERS index f66488dfdbc9..2c85558aec19 100644 --- a/MAINTAINERS +++ b/MAINTAINERS @@ -10375,7 +10375,7 @@ L: linux-gpio@vger.kernel.org T: git git://git.kernel.org/pub/scm/linux/kernel/git/linusw/linux-pinctrl.git S: Maintained F: Documentation/devicetree/bindings/pinctrl/ -F: Documentation/pinctrl.txt +F: Documentation/driver-api/pinctl.rst F: drivers/pinctrl/ F: include/linux/pinctrl/ diff --git a/include/linux/device.h b/include/linux/device.h index 723cd54b94da..beabdbc08420 100644 --- a/include/linux/device.h +++ b/include/linux/device.h @@ -843,7 +843,7 @@ struct dev_links_info { * hibernation, system resume and during runtime PM transitions * along with subsystem-level and driver-level callbacks. * @pins: For device pin management. - * See Documentation/pinctrl.txt for details. + * See Documentation/driver-api/pinctl.rst for details. * @msi_list: Hosts MSI descriptors * @msi_domain: The generic MSI domain this device is using. * @numa_node: NUMA node this device is close to. diff --git a/include/linux/pinctrl/pinconf-generic.h b/include/linux/pinctrl/pinconf-generic.h index 231d3075815a..e91d1b6a260d 100644 --- a/include/linux/pinctrl/pinconf-generic.h +++ b/include/linux/pinctrl/pinconf-generic.h @@ -81,8 +81,8 @@ * it. * @PIN_CONFIG_OUTPUT: this will configure the pin as an output and drive a * value on the line. Use argument 1 to indicate high level, argument 0 to - * indicate low level. (Please see Documentation/pinctrl.txt, section - * "GPIO mode pitfalls" for a discussion around this parameter.) + * indicate low level. (Please see Documentation/driver-api/pinctl.rst, + * section "GPIO mode pitfalls" for a discussion around this parameter.) * @PIN_CONFIG_POWER_SOURCE: if the pin can select between different power * supplies, the argument to this parameter (on a custom format) tells * the driver which alternative power source to use. -- cgit v1.2.3-71-gd317 From 04c2cf34362f133be09878bd752f8b014318b59a Mon Sep 17 00:00:00 2001 From: Naftali Goldstein Date: Tue, 11 Jul 2017 10:07:25 +0300 Subject: mac80211: add api to start ba session timer expired flow Some drivers handle rx buffer reordering internally (and by extension handle also the rx ba session timer internally), but do not ofload the addba/delba negotiation. Add an api for these drivers to properly tear-down the ba session, including sending a delba. Signed-off-by: Naftali Goldstein Signed-off-by: Luca Coelho --- include/net/mac80211.h | 15 +++++++++++++++ net/mac80211/agg-rx.c | 22 +++++++++++++++++++++- 2 files changed, 36 insertions(+), 1 deletion(-) (limited to 'include') diff --git a/include/net/mac80211.h b/include/net/mac80211.h index b2b5419467cc..f8149ca192b4 100644 --- a/include/net/mac80211.h +++ b/include/net/mac80211.h @@ -5499,6 +5499,21 @@ static inline void ieee80211_stop_rx_ba_session_offl(struct ieee80211_vif *vif, ieee80211_manage_rx_ba_offl(vif, addr, tid + IEEE80211_NUM_TIDS); } +/** + * ieee80211_rx_ba_timer_expired - stop a Rx BA session due to timeout + * + * Some device drivers do not offload AddBa/DelBa negotiation, but handle rx + * buffer reording internally, and therefore also handle the session timer. + * + * Trigger the timeout flow, which sends a DelBa. + * + * @vif: &struct ieee80211_vif pointer from the add_interface callback + * @addr: station mac address + * @tid: the rx tid + */ +void ieee80211_rx_ba_timer_expired(struct ieee80211_vif *vif, + const u8 *addr, unsigned int tid); + /* Rate control API */ /** diff --git a/net/mac80211/agg-rx.c b/net/mac80211/agg-rx.c index 8708cbe8af5b..2b36eff5d97e 100644 --- a/net/mac80211/agg-rx.c +++ b/net/mac80211/agg-rx.c @@ -7,7 +7,7 @@ * Copyright 2006-2007 Jiri Benc * Copyright 2007, Michael Wu * Copyright 2007-2010, Intel Corporation - * Copyright(c) 2015 Intel Deutschland GmbH + * Copyright(c) 2015-2017 Intel Deutschland GmbH * * This program is free software; you can redistribute it and/or modify * it under the terms of the GNU General Public License version 2 as @@ -466,3 +466,23 @@ void ieee80211_manage_rx_ba_offl(struct ieee80211_vif *vif, rcu_read_unlock(); } EXPORT_SYMBOL(ieee80211_manage_rx_ba_offl); + +void ieee80211_rx_ba_timer_expired(struct ieee80211_vif *vif, + const u8 *addr, unsigned int tid) +{ + struct ieee80211_sub_if_data *sdata = vif_to_sdata(vif); + struct ieee80211_local *local = sdata->local; + struct sta_info *sta; + + rcu_read_lock(); + sta = sta_info_get_bss(sdata, addr); + if (!sta) + goto unlock; + + set_bit(tid, sta->ampdu_mlme.tid_rx_timer_expired); + ieee80211_queue_work(&local->hw, &sta->ampdu_mlme.work); + + unlock: + rcu_read_unlock(); +} +EXPORT_SYMBOL(ieee80211_rx_ba_timer_expired); -- cgit v1.2.3-71-gd317 From cb87481ee89dbd6609e227afbf64900fb4e5c930 Mon Sep 17 00:00:00 2001 From: Nicholas Piggin Date: Wed, 26 Jul 2017 22:46:27 +1000 Subject: kbuild: linker script do not match C names unless LD_DEAD_CODE_DATA_ELIMINATION is configured The .data and .bss sections were modified in the generic linker script to pull in sections named .data., which are generated by gcc with -ffunction-sections and -fdata-sections options. The problem with this pattern is it can also match section names that Linux defines explicitly, e.g., .data.unlikely. This can cause Linux sections to get moved into the wrong place. The way to avoid this is to use ".." separators for explicit section names (the dot character is valid in a section name but not a C identifier). However currently there are sections which don't follow this rule, so for now just disable the wild card by default. Example: http://marc.info/?l=linux-arm-kernel&m=150106824024221&w=2 Cc: # 4.9 Fixes: b67067f1176df ("kbuild: allow archs to select link dead code/data elimination") Signed-off-by: Nicholas Piggin Signed-off-by: Masahiro Yamada --- include/asm-generic/vmlinux.lds.h | 38 ++++++++++++++++++++++++++------------ 1 file changed, 26 insertions(+), 12 deletions(-) (limited to 'include') diff --git a/include/asm-generic/vmlinux.lds.h b/include/asm-generic/vmlinux.lds.h index da0be9a8d1de..9623d78f8494 100644 --- a/include/asm-generic/vmlinux.lds.h +++ b/include/asm-generic/vmlinux.lds.h @@ -59,6 +59,22 @@ /* Align . to a 8 byte boundary equals to maximum function alignment. */ #define ALIGN_FUNCTION() . = ALIGN(8) +/* + * LD_DEAD_CODE_DATA_ELIMINATION option enables -fdata-sections, which + * generates .data.identifier sections, which need to be pulled in with + * .data. We don't want to pull in .data..other sections, which Linux + * has defined. Same for text and bss. + */ +#ifdef CONFIG_LD_DEAD_CODE_DATA_ELIMINATION +#define TEXT_MAIN .text .text.[0-9a-zA-Z_]* +#define DATA_MAIN .data .data.[0-9a-zA-Z_]* +#define BSS_MAIN .bss .bss.[0-9a-zA-Z_]* +#else +#define TEXT_MAIN .text +#define DATA_MAIN .data +#define BSS_MAIN .bss +#endif + /* * Align to a 32 byte boundary equal to the * alignment gcc 4.5 uses for a struct @@ -198,12 +214,9 @@ /* * .data section - * LD_DEAD_CODE_DATA_ELIMINATION option enables -fdata-sections generates - * .data.identifier which needs to be pulled in with .data, but don't want to - * pull in .data..stuff which has its own requirements. Same for bss. */ #define DATA_DATA \ - *(.data .data.[0-9a-zA-Z_]*) \ + *(DATA_MAIN) \ *(.ref.data) \ *(.data..shared_aligned) /* percpu related */ \ MEM_KEEP(init.data) \ @@ -434,16 +447,17 @@ VMLINUX_SYMBOL(__security_initcall_end) = .; \ } -/* .text section. Map to function alignment to avoid address changes +/* + * .text section. Map to function alignment to avoid address changes * during second ld run in second ld pass when generating System.map - * LD_DEAD_CODE_DATA_ELIMINATION option enables -ffunction-sections generates - * .text.identifier which needs to be pulled in with .text , but some - * architectures define .text.foo which is not intended to be pulled in here. - * Those enabling LD_DEAD_CODE_DATA_ELIMINATION must ensure they don't have - * conflicting section names, and must pull in .text.[0-9a-zA-Z_]* */ + * + * TEXT_MAIN here will match .text.fixup and .text.unlikely if dead + * code elimination is enabled, so these sections should be converted + * to use ".." first. + */ #define TEXT_TEXT \ ALIGN_FUNCTION(); \ - *(.text.hot .text .text.fixup .text.unlikely) \ + *(.text.hot TEXT_MAIN .text.fixup .text.unlikely) \ *(.ref.text) \ MEM_KEEP(init.text) \ MEM_KEEP(exit.text) \ @@ -613,7 +627,7 @@ BSS_FIRST_SECTIONS \ *(.bss..page_aligned) \ *(.dynbss) \ - *(.bss .bss.[0-9a-zA-Z_]*) \ + *(BSS_MAIN) \ *(COMMON) \ } -- cgit v1.2.3-71-gd317 From 0fb228d30b8d72bfee51f57e638d412324d44a11 Mon Sep 17 00:00:00 2001 From: James Smart Date: Tue, 1 Aug 2017 15:12:39 -0700 Subject: nvmet_fc: add defer_req callback for deferment of cmd buffer return At queue creation, the transport allocates a local job struct (struct nvmet_fc_fcp_iod) for each possible element of the queue. When a new CMD is received from the wire, a jobs struct is allocated from the queue and then used for the duration of the command. The job struct contains buffer space for the wire command iu. Thus, upon allocation of the job struct, the cmd iu buffer is copied to the job struct and the LLDD may immediately free/reuse the CMD IU buffer passed in the call. However, in some circumstances, due to the packetized nature of FC and the api of the FC LLDD which may issue a hw command to send the wire response, but the LLDD may not get the hw completion for the command and upcall the nvmet_fc layer before a new command may be asynchronously received on the wire. In other words, its possible for the initiator to get the response from the wire, thus believe a command slot free, and send a new command iu. The new command iu may be received by the LLDD and passed to the transport before the LLDD had serviced the hw completion and made the teardown calls for the original job struct. As such, there is no available job struct available for the new io. E.g. it appears like the host sent more queue elements than the queue size. It didn't based on it's understanding. Rather than treat this as a hard connection failure queue the new request until the job struct does free up. As the buffer isn't copied as there's no job struct, a special return value must be returned to the LLDD to signify to hold off on recycling the cmd iu buffer. And later, when a job struct is allocated and the buffer copied, a new LLDD callback is introduced to notify the LLDD and allow it to recycle it's command iu buffer. Signed-off-by: James Smart Reviewed-by: Johannes Thumshirn Signed-off-by: Christoph Hellwig --- drivers/nvme/target/fc.c | 212 +++++++++++++++++++++++++++++++++++------ include/linux/nvme-fc-driver.h | 7 ++ 2 files changed, 191 insertions(+), 28 deletions(-) (limited to 'include') diff --git a/drivers/nvme/target/fc.c b/drivers/nvme/target/fc.c index 31ca55dfcb1d..1b7f2520a20d 100644 --- a/drivers/nvme/target/fc.c +++ b/drivers/nvme/target/fc.c @@ -114,6 +114,11 @@ struct nvmet_fc_tgtport { struct kref ref; }; +struct nvmet_fc_defer_fcp_req { + struct list_head req_list; + struct nvmefc_tgt_fcp_req *fcp_req; +}; + struct nvmet_fc_tgt_queue { bool ninetypercent; u16 qid; @@ -132,6 +137,8 @@ struct nvmet_fc_tgt_queue { struct nvmet_fc_tgt_assoc *assoc; struct nvmet_fc_fcp_iod *fod; /* array of fcp_iods */ struct list_head fod_list; + struct list_head pending_cmd_list; + struct list_head avail_defer_list; struct workqueue_struct *work_q; struct kref ref; } __aligned(sizeof(unsigned long long)); @@ -223,6 +230,8 @@ static void nvmet_fc_tgt_q_put(struct nvmet_fc_tgt_queue *queue); static int nvmet_fc_tgt_q_get(struct nvmet_fc_tgt_queue *queue); static void nvmet_fc_tgtport_put(struct nvmet_fc_tgtport *tgtport); static int nvmet_fc_tgtport_get(struct nvmet_fc_tgtport *tgtport); +static void nvmet_fc_handle_fcp_rqst(struct nvmet_fc_tgtport *tgtport, + struct nvmet_fc_fcp_iod *fod); /* *********************** FC-NVME DMA Handling **************************** */ @@ -463,9 +472,9 @@ static struct nvmet_fc_fcp_iod * nvmet_fc_alloc_fcp_iod(struct nvmet_fc_tgt_queue *queue) { static struct nvmet_fc_fcp_iod *fod; - unsigned long flags; - spin_lock_irqsave(&queue->qlock, flags); + lockdep_assert_held(&queue->qlock); + fod = list_first_entry_or_null(&queue->fod_list, struct nvmet_fc_fcp_iod, fcp_list); if (fod) { @@ -477,17 +486,37 @@ nvmet_fc_alloc_fcp_iod(struct nvmet_fc_tgt_queue *queue) * will "inherit" that reference. */ } - spin_unlock_irqrestore(&queue->qlock, flags); return fod; } +static void +nvmet_fc_queue_fcp_req(struct nvmet_fc_tgtport *tgtport, + struct nvmet_fc_tgt_queue *queue, + struct nvmefc_tgt_fcp_req *fcpreq) +{ + struct nvmet_fc_fcp_iod *fod = fcpreq->nvmet_fc_private; + + /* + * put all admin cmds on hw queue id 0. All io commands go to + * the respective hw queue based on a modulo basis + */ + fcpreq->hwqid = queue->qid ? + ((queue->qid - 1) % tgtport->ops->max_hw_queues) : 0; + + if (tgtport->ops->target_features & NVMET_FCTGTFEAT_CMD_IN_ISR) + queue_work_on(queue->cpu, queue->work_q, &fod->work); + else + nvmet_fc_handle_fcp_rqst(tgtport, fod); +} + static void nvmet_fc_free_fcp_iod(struct nvmet_fc_tgt_queue *queue, struct nvmet_fc_fcp_iod *fod) { struct nvmefc_tgt_fcp_req *fcpreq = fod->fcpreq; struct nvmet_fc_tgtport *tgtport = fod->tgtport; + struct nvmet_fc_defer_fcp_req *deferfcp; unsigned long flags; fc_dma_sync_single_for_cpu(tgtport->dev, fod->rspdma, @@ -495,21 +524,56 @@ nvmet_fc_free_fcp_iod(struct nvmet_fc_tgt_queue *queue, fcpreq->nvmet_fc_private = NULL; - spin_lock_irqsave(&queue->qlock, flags); - list_add_tail(&fod->fcp_list, &fod->queue->fod_list); fod->active = false; fod->abort = false; fod->aborted = false; fod->writedataactive = false; fod->fcpreq = NULL; + + tgtport->ops->fcp_req_release(&tgtport->fc_target_port, fcpreq); + + spin_lock_irqsave(&queue->qlock, flags); + deferfcp = list_first_entry_or_null(&queue->pending_cmd_list, + struct nvmet_fc_defer_fcp_req, req_list); + if (!deferfcp) { + list_add_tail(&fod->fcp_list, &fod->queue->fod_list); + spin_unlock_irqrestore(&queue->qlock, flags); + + /* Release reference taken at queue lookup and fod allocation */ + nvmet_fc_tgt_q_put(queue); + return; + } + + /* Re-use the fod for the next pending cmd that was deferred */ + list_del(&deferfcp->req_list); + + fcpreq = deferfcp->fcp_req; + + /* deferfcp can be reused for another IO at a later date */ + list_add_tail(&deferfcp->req_list, &queue->avail_defer_list); + spin_unlock_irqrestore(&queue->qlock, flags); + /* Save NVME CMD IO in fod */ + memcpy(&fod->cmdiubuf, fcpreq->rspaddr, fcpreq->rsplen); + + /* Setup new fcpreq to be processed */ + fcpreq->rspaddr = NULL; + fcpreq->rsplen = 0; + fcpreq->nvmet_fc_private = fod; + fod->fcpreq = fcpreq; + fod->active = true; + + /* inform LLDD IO is now being processed */ + tgtport->ops->defer_rcv(&tgtport->fc_target_port, fcpreq); + + /* Submit deferred IO for processing */ + nvmet_fc_queue_fcp_req(tgtport, queue, fcpreq); + /* - * release the reference taken at queue lookup and fod allocation + * Leave the queue lookup get reference taken when + * fod was originally allocated. */ - nvmet_fc_tgt_q_put(queue); - - tgtport->ops->fcp_req_release(&tgtport->fc_target_port, fcpreq); } static int @@ -569,6 +633,8 @@ nvmet_fc_alloc_target_queue(struct nvmet_fc_tgt_assoc *assoc, queue->port = assoc->tgtport->port; queue->cpu = nvmet_fc_queue_to_cpu(assoc->tgtport, qid); INIT_LIST_HEAD(&queue->fod_list); + INIT_LIST_HEAD(&queue->avail_defer_list); + INIT_LIST_HEAD(&queue->pending_cmd_list); atomic_set(&queue->connected, 0); atomic_set(&queue->sqtail, 0); atomic_set(&queue->rsn, 1); @@ -638,6 +704,7 @@ nvmet_fc_delete_target_queue(struct nvmet_fc_tgt_queue *queue) { struct nvmet_fc_tgtport *tgtport = queue->assoc->tgtport; struct nvmet_fc_fcp_iod *fod = queue->fod; + struct nvmet_fc_defer_fcp_req *deferfcp; unsigned long flags; int i, writedataactive; bool disconnect; @@ -666,6 +733,35 @@ nvmet_fc_delete_target_queue(struct nvmet_fc_tgt_queue *queue) } } } + + /* Cleanup defer'ed IOs in queue */ + list_for_each_entry(deferfcp, &queue->avail_defer_list, req_list) { + list_del(&deferfcp->req_list); + kfree(deferfcp); + } + + for (;;) { + deferfcp = list_first_entry_or_null(&queue->pending_cmd_list, + struct nvmet_fc_defer_fcp_req, req_list); + if (!deferfcp) + break; + + list_del(&deferfcp->req_list); + spin_unlock_irqrestore(&queue->qlock, flags); + + tgtport->ops->defer_rcv(&tgtport->fc_target_port, + deferfcp->fcp_req); + + tgtport->ops->fcp_abort(&tgtport->fc_target_port, + deferfcp->fcp_req); + + tgtport->ops->fcp_req_release(&tgtport->fc_target_port, + deferfcp->fcp_req); + + kfree(deferfcp); + + spin_lock_irqsave(&queue->qlock, flags); + } spin_unlock_irqrestore(&queue->qlock, flags); flush_workqueue(queue->work_q); @@ -2172,11 +2268,38 @@ nvmet_fc_handle_fcp_rqst_work(struct work_struct *work) * Pass a FC-NVME FCP CMD IU received from the FC link to the nvmet-fc * layer for processing. * - * The nvmet-fc layer will copy cmd payload to an internal structure for - * processing. As such, upon completion of the routine, the LLDD may - * immediately free/reuse the CMD IU buffer passed in the call. + * The nvmet_fc layer allocates a local job structure (struct + * nvmet_fc_fcp_iod) from the queue for the io and copies the + * CMD IU buffer to the job structure. As such, on a successful + * completion (returns 0), the LLDD may immediately free/reuse + * the CMD IU buffer passed in the call. + * + * However, in some circumstances, due to the packetized nature of FC + * and the api of the FC LLDD which may issue a hw command to send the + * response, but the LLDD may not get the hw completion for that command + * and upcall the nvmet_fc layer before a new command may be + * asynchronously received - its possible for a command to be received + * before the LLDD and nvmet_fc have recycled the job structure. It gives + * the appearance of more commands received than fits in the sq. + * To alleviate this scenario, a temporary queue is maintained in the + * transport for pending LLDD requests waiting for a queue job structure. + * In these "overrun" cases, a temporary queue element is allocated + * the LLDD request and CMD iu buffer information remembered, and the + * routine returns a -EOVERFLOW status. Subsequently, when a queue job + * structure is freed, it is immediately reallocated for anything on the + * pending request list. The LLDDs defer_rcv() callback is called, + * informing the LLDD that it may reuse the CMD IU buffer, and the io + * is then started normally with the transport. * - * If this routine returns error, the lldd should abort the exchange. + * The LLDD, when receiving an -EOVERFLOW completion status, is to treat + * the completion as successful but must not reuse the CMD IU buffer + * until the LLDD's defer_rcv() callback has been called for the + * corresponding struct nvmefc_tgt_fcp_req pointer. + * + * If there is any other condition in which an error occurs, the + * transport will return a non-zero status indicating the error. + * In all cases other than -EOVERFLOW, the transport has not accepted the + * request and the LLDD should abort the exchange. * * @target_port: pointer to the (registered) target port the FCP CMD IU * was received on. @@ -2194,6 +2317,8 @@ nvmet_fc_rcv_fcp_req(struct nvmet_fc_target_port *target_port, struct nvme_fc_cmd_iu *cmdiu = cmdiubuf; struct nvmet_fc_tgt_queue *queue; struct nvmet_fc_fcp_iod *fod; + struct nvmet_fc_defer_fcp_req *deferfcp; + unsigned long flags; /* validate iu, so the connection id can be used to find the queue */ if ((cmdiubuf_len != sizeof(*cmdiu)) || @@ -2214,29 +2339,60 @@ nvmet_fc_rcv_fcp_req(struct nvmet_fc_target_port *target_port, * when the fod is freed. */ + spin_lock_irqsave(&queue->qlock, flags); + fod = nvmet_fc_alloc_fcp_iod(queue); - if (!fod) { + if (fod) { + spin_unlock_irqrestore(&queue->qlock, flags); + + fcpreq->nvmet_fc_private = fod; + fod->fcpreq = fcpreq; + + memcpy(&fod->cmdiubuf, cmdiubuf, cmdiubuf_len); + + nvmet_fc_queue_fcp_req(tgtport, queue, fcpreq); + + return 0; + } + + if (!tgtport->ops->defer_rcv) { + spin_unlock_irqrestore(&queue->qlock, flags); /* release the queue lookup reference */ nvmet_fc_tgt_q_put(queue); return -ENOENT; } - fcpreq->nvmet_fc_private = fod; - fod->fcpreq = fcpreq; - /* - * put all admin cmds on hw queue id 0. All io commands go to - * the respective hw queue based on a modulo basis - */ - fcpreq->hwqid = queue->qid ? - ((queue->qid - 1) % tgtport->ops->max_hw_queues) : 0; - memcpy(&fod->cmdiubuf, cmdiubuf, cmdiubuf_len); + deferfcp = list_first_entry_or_null(&queue->avail_defer_list, + struct nvmet_fc_defer_fcp_req, req_list); + if (deferfcp) { + /* Just re-use one that was previously allocated */ + list_del(&deferfcp->req_list); + } else { + spin_unlock_irqrestore(&queue->qlock, flags); - if (tgtport->ops->target_features & NVMET_FCTGTFEAT_CMD_IN_ISR) - queue_work_on(queue->cpu, queue->work_q, &fod->work); - else - nvmet_fc_handle_fcp_rqst(tgtport, fod); + /* Now we need to dynamically allocate one */ + deferfcp = kmalloc(sizeof(*deferfcp), GFP_KERNEL); + if (!deferfcp) { + /* release the queue lookup reference */ + nvmet_fc_tgt_q_put(queue); + return -ENOMEM; + } + spin_lock_irqsave(&queue->qlock, flags); + } - return 0; + /* For now, use rspaddr / rsplen to save payload information */ + fcpreq->rspaddr = cmdiubuf; + fcpreq->rsplen = cmdiubuf_len; + deferfcp->fcp_req = fcpreq; + + /* defer processing till a fod becomes available */ + list_add_tail(&deferfcp->req_list, &queue->pending_cmd_list); + + /* NOTE: the queue lookup reference is still valid */ + + spin_unlock_irqrestore(&queue->qlock, flags); + + return -EOVERFLOW; } EXPORT_SYMBOL_GPL(nvmet_fc_rcv_fcp_req); diff --git a/include/linux/nvme-fc-driver.h b/include/linux/nvme-fc-driver.h index 6c8c5d8041b7..2591878c1d48 100644 --- a/include/linux/nvme-fc-driver.h +++ b/include/linux/nvme-fc-driver.h @@ -346,6 +346,11 @@ struct nvme_fc_remote_port { * indicating an FC transport Aborted status. * Entrypoint is Mandatory. * + * @defer_rcv: Called by the transport to signal the LLLD that it has + * begun processing of a previously received NVME CMD IU. The LLDD + * is now free to re-use the rcv buffer associated with the + * nvmefc_tgt_fcp_req. + * * @max_hw_queues: indicates the maximum number of hw queues the LLDD * supports for cpu affinitization. * Value is Mandatory. Must be at least 1. @@ -846,6 +851,8 @@ struct nvmet_fc_target_template { struct nvmefc_tgt_fcp_req *fcpreq); void (*fcp_req_release)(struct nvmet_fc_target_port *tgtport, struct nvmefc_tgt_fcp_req *fcpreq); + void (*defer_rcv)(struct nvmet_fc_target_port *tgtport, + struct nvmefc_tgt_fcp_req *fcpreq); u32 max_hw_queues; u16 max_sgl_segments; -- cgit v1.2.3-71-gd317 From bfe334924ccd9f4a53f30240c03cf2f43f5b2df1 Mon Sep 17 00:00:00 2001 From: Peter Zijlstra Date: Wed, 2 Aug 2017 19:39:30 +0200 Subject: perf/x86: Fix RDPMC vs. mm_struct tracking Vince reported the following rdpmc() testcase failure: > Failing test case: > > fd=perf_event_open(); > addr=mmap(fd); > exec() // without closing or unmapping the event > fd=perf_event_open(); > addr=mmap(fd); > rdpmc() // GPFs due to rdpmc being disabled The problem is of course that exec() plays tricks with what is current->mm, only destroying the old mappings after having installed the new mm. Fix this confusion by passing along vma->vm_mm instead of relying on current->mm. Reported-by: Vince Weaver Tested-by: Vince Weaver Signed-off-by: Peter Zijlstra (Intel) Reviewed-by: Andy Lutomirski Cc: Arnaldo Carvalho de Melo Cc: Linus Torvalds Cc: Peter Zijlstra Cc: Stephane Eranian Cc: Thomas Gleixner Cc: stable@vger.kernel.org Fixes: 1e0fb9ec679c ("perf: Add pmu callbacks to track event mapping and unmapping") Link: http://lkml.kernel.org/r/20170802173930.cstykcqefmqt7jau@hirez.programming.kicks-ass.net [ Minor cleanups. ] Signed-off-by: Ingo Molnar --- arch/x86/events/core.c | 16 +++++++--------- include/linux/perf_event.h | 4 ++-- kernel/events/core.c | 6 +++--- 3 files changed, 12 insertions(+), 14 deletions(-) (limited to 'include') diff --git a/arch/x86/events/core.c b/arch/x86/events/core.c index 8e3db8f642a7..af12e294caed 100644 --- a/arch/x86/events/core.c +++ b/arch/x86/events/core.c @@ -2114,7 +2114,7 @@ static void refresh_pce(void *ignored) load_mm_cr4(this_cpu_read(cpu_tlbstate.loaded_mm)); } -static void x86_pmu_event_mapped(struct perf_event *event) +static void x86_pmu_event_mapped(struct perf_event *event, struct mm_struct *mm) { if (!(event->hw.flags & PERF_X86_EVENT_RDPMC_ALLOWED)) return; @@ -2129,22 +2129,20 @@ static void x86_pmu_event_mapped(struct perf_event *event) * For now, this can't happen because all callers hold mmap_sem * for write. If this changes, we'll need a different solution. */ - lockdep_assert_held_exclusive(¤t->mm->mmap_sem); + lockdep_assert_held_exclusive(&mm->mmap_sem); - if (atomic_inc_return(¤t->mm->context.perf_rdpmc_allowed) == 1) - on_each_cpu_mask(mm_cpumask(current->mm), refresh_pce, NULL, 1); + if (atomic_inc_return(&mm->context.perf_rdpmc_allowed) == 1) + on_each_cpu_mask(mm_cpumask(mm), refresh_pce, NULL, 1); } -static void x86_pmu_event_unmapped(struct perf_event *event) +static void x86_pmu_event_unmapped(struct perf_event *event, struct mm_struct *mm) { - if (!current->mm) - return; if (!(event->hw.flags & PERF_X86_EVENT_RDPMC_ALLOWED)) return; - if (atomic_dec_and_test(¤t->mm->context.perf_rdpmc_allowed)) - on_each_cpu_mask(mm_cpumask(current->mm), refresh_pce, NULL, 1); + if (atomic_dec_and_test(&mm->context.perf_rdpmc_allowed)) + on_each_cpu_mask(mm_cpumask(mm), refresh_pce, NULL, 1); } static int x86_pmu_event_idx(struct perf_event *event) diff --git a/include/linux/perf_event.h b/include/linux/perf_event.h index a3b873fc59e4..b14095bcf4bb 100644 --- a/include/linux/perf_event.h +++ b/include/linux/perf_event.h @@ -310,8 +310,8 @@ struct pmu { * Notification that the event was mapped or unmapped. Called * in the context of the mapping task. */ - void (*event_mapped) (struct perf_event *event); /*optional*/ - void (*event_unmapped) (struct perf_event *event); /*optional*/ + void (*event_mapped) (struct perf_event *event, struct mm_struct *mm); /* optional */ + void (*event_unmapped) (struct perf_event *event, struct mm_struct *mm); /* optional */ /* * Flags for ->add()/->del()/ ->start()/->stop(). There are diff --git a/kernel/events/core.c b/kernel/events/core.c index 426c2ffba16d..a654b8a3586f 100644 --- a/kernel/events/core.c +++ b/kernel/events/core.c @@ -5090,7 +5090,7 @@ static void perf_mmap_open(struct vm_area_struct *vma) atomic_inc(&event->rb->aux_mmap_count); if (event->pmu->event_mapped) - event->pmu->event_mapped(event); + event->pmu->event_mapped(event, vma->vm_mm); } static void perf_pmu_output_stop(struct perf_event *event); @@ -5113,7 +5113,7 @@ static void perf_mmap_close(struct vm_area_struct *vma) unsigned long size = perf_data_size(rb); if (event->pmu->event_unmapped) - event->pmu->event_unmapped(event); + event->pmu->event_unmapped(event, vma->vm_mm); /* * rb->aux_mmap_count will always drop before rb->mmap_count and @@ -5411,7 +5411,7 @@ aux_unlock: vma->vm_ops = &perf_mmap_vmops; if (event->pmu->event_mapped) - event->pmu->event_mapped(event); + event->pmu->event_mapped(event, vma->vm_mm); return ret; } -- cgit v1.2.3-71-gd317 From 16af97dc5a8975371a83d9e30a64038b48f40a2d Mon Sep 17 00:00:00 2001 From: Nadav Amit Date: Thu, 10 Aug 2017 15:23:56 -0700 Subject: mm: migrate: prevent racy access to tlb_flush_pending Patch series "fixes of TLB batching races", v6. It turns out that Linux TLB batching mechanism suffers from various races. Races that are caused due to batching during reclamation were recently handled by Mel and this patch-set deals with others. The more fundamental issue is that concurrent updates of the page-tables allow for TLB flushes to be batched on one core, while another core changes the page-tables. This other core may assume a PTE change does not require a flush based on the updated PTE value, while it is unaware that TLB flushes are still pending. This behavior affects KSM (which may result in memory corruption) and MADV_FREE and MADV_DONTNEED (which may result in incorrect behavior). A proof-of-concept can easily produce the wrong behavior of MADV_DONTNEED. Memory corruption in KSM is harder to produce in practice, but was observed by hacking the kernel and adding a delay before flushing and replacing the KSM page. Finally, there is also one memory barrier missing, which may affect architectures with weak memory model. This patch (of 7): Setting and clearing mm->tlb_flush_pending can be performed by multiple threads, since mmap_sem may only be acquired for read in task_numa_work(). If this happens, tlb_flush_pending might be cleared while one of the threads still changes PTEs and batches TLB flushes. This can lead to the same race between migration and change_protection_range() that led to the introduction of tlb_flush_pending. The result of this race was data corruption, which means that this patch also addresses a theoretically possible data corruption. An actual data corruption was not observed, yet the race was was confirmed by adding assertion to check tlb_flush_pending is not set by two threads, adding artificial latency in change_protection_range() and using sysctl to reduce kernel.numa_balancing_scan_delay_ms. Link: http://lkml.kernel.org/r/20170802000818.4760-2-namit@vmware.com Fixes: 20841405940e ("mm: fix TLB flush race between migration, and change_protection_range") Signed-off-by: Nadav Amit Acked-by: Mel Gorman Acked-by: Rik van Riel Acked-by: Minchan Kim Cc: Andy Lutomirski Cc: Hugh Dickins Cc: "David S. Miller" Cc: Andrea Arcangeli Cc: Heiko Carstens Cc: Ingo Molnar Cc: Jeff Dike Cc: Martin Schwidefsky Cc: Mel Gorman Cc: Russell King Cc: Sergey Senozhatsky Cc: Tony Luck Cc: Yoshinori Sato Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/linux/mm_types.h | 31 ++++++++++++++++++++++--------- kernel/fork.c | 2 +- mm/debug.c | 2 +- mm/mprotect.c | 4 ++-- 4 files changed, 26 insertions(+), 13 deletions(-) (limited to 'include') diff --git a/include/linux/mm_types.h b/include/linux/mm_types.h index 7f384bb62d8e..f58f76ee1dfa 100644 --- a/include/linux/mm_types.h +++ b/include/linux/mm_types.h @@ -493,7 +493,7 @@ struct mm_struct { * can move process memory needs to flush the TLB when moving a * PROT_NONE or PROT_NUMA mapped page. */ - bool tlb_flush_pending; + atomic_t tlb_flush_pending; #endif #ifdef CONFIG_ARCH_WANT_BATCHED_UNMAP_TLB_FLUSH /* See flush_tlb_batched_pending() */ @@ -532,33 +532,46 @@ static inline cpumask_t *mm_cpumask(struct mm_struct *mm) static inline bool mm_tlb_flush_pending(struct mm_struct *mm) { barrier(); - return mm->tlb_flush_pending; + return atomic_read(&mm->tlb_flush_pending) > 0; } -static inline void set_tlb_flush_pending(struct mm_struct *mm) + +static inline void init_tlb_flush_pending(struct mm_struct *mm) { - mm->tlb_flush_pending = true; + atomic_set(&mm->tlb_flush_pending, 0); +} + +static inline void inc_tlb_flush_pending(struct mm_struct *mm) +{ + atomic_inc(&mm->tlb_flush_pending); /* - * Guarantee that the tlb_flush_pending store does not leak into the + * Guarantee that the tlb_flush_pending increase does not leak into the * critical section updating the page tables */ smp_mb__before_spinlock(); } + /* Clearing is done after a TLB flush, which also provides a barrier. */ -static inline void clear_tlb_flush_pending(struct mm_struct *mm) +static inline void dec_tlb_flush_pending(struct mm_struct *mm) { barrier(); - mm->tlb_flush_pending = false; + atomic_dec(&mm->tlb_flush_pending); } #else static inline bool mm_tlb_flush_pending(struct mm_struct *mm) { return false; } -static inline void set_tlb_flush_pending(struct mm_struct *mm) + +static inline void init_tlb_flush_pending(struct mm_struct *mm) { } -static inline void clear_tlb_flush_pending(struct mm_struct *mm) + +static inline void inc_tlb_flush_pending(struct mm_struct *mm) +{ +} + +static inline void dec_tlb_flush_pending(struct mm_struct *mm) { } #endif diff --git a/kernel/fork.c b/kernel/fork.c index 17921b0390b4..e075b7780421 100644 --- a/kernel/fork.c +++ b/kernel/fork.c @@ -807,7 +807,7 @@ static struct mm_struct *mm_init(struct mm_struct *mm, struct task_struct *p, mm_init_aio(mm); mm_init_owner(mm, p); mmu_notifier_mm_init(mm); - clear_tlb_flush_pending(mm); + init_tlb_flush_pending(mm); #if defined(CONFIG_TRANSPARENT_HUGEPAGE) && !USE_SPLIT_PMD_PTLOCKS mm->pmd_huge_pte = NULL; #endif diff --git a/mm/debug.c b/mm/debug.c index db1cd26d8752..d70103bb4731 100644 --- a/mm/debug.c +++ b/mm/debug.c @@ -159,7 +159,7 @@ void dump_mm(const struct mm_struct *mm) mm->numa_next_scan, mm->numa_scan_offset, mm->numa_scan_seq, #endif #if defined(CONFIG_NUMA_BALANCING) || defined(CONFIG_COMPACTION) - mm->tlb_flush_pending, + atomic_read(&mm->tlb_flush_pending), #endif mm->def_flags, &mm->def_flags ); diff --git a/mm/mprotect.c b/mm/mprotect.c index 4180ad8cc9c5..bd0f409922cb 100644 --- a/mm/mprotect.c +++ b/mm/mprotect.c @@ -244,7 +244,7 @@ static unsigned long change_protection_range(struct vm_area_struct *vma, BUG_ON(addr >= end); pgd = pgd_offset(mm, addr); flush_cache_range(vma, addr, end); - set_tlb_flush_pending(mm); + inc_tlb_flush_pending(mm); do { next = pgd_addr_end(addr, end); if (pgd_none_or_clear_bad(pgd)) @@ -256,7 +256,7 @@ static unsigned long change_protection_range(struct vm_area_struct *vma, /* Only flush the TLB if we actually modified any entries: */ if (pages) flush_tlb_range(vma, start, end); - clear_tlb_flush_pending(mm); + dec_tlb_flush_pending(mm); return pages; } -- cgit v1.2.3-71-gd317 From 0a2c40487f3e4215c6ab46e7f837036badfb542b Mon Sep 17 00:00:00 2001 From: Nadav Amit Date: Thu, 10 Aug 2017 15:23:59 -0700 Subject: mm: migrate: fix barriers around tlb_flush_pending Reading tlb_flush_pending while the page-table lock is taken does not require a barrier, since the lock/unlock already acts as a barrier. Removing the barrier in mm_tlb_flush_pending() to address this issue. However, migrate_misplaced_transhuge_page() calls mm_tlb_flush_pending() while the page-table lock is already released, which may present a problem on architectures with weak memory model (PPC). To deal with this case, a new parameter is added to mm_tlb_flush_pending() to indicate if it is read without the page-table lock taken, and calling smp_mb__after_unlock_lock() in this case. Link: http://lkml.kernel.org/r/20170802000818.4760-3-namit@vmware.com Signed-off-by: Nadav Amit Acked-by: Rik van Riel Cc: Minchan Kim Cc: Sergey Senozhatsky Cc: Andy Lutomirski Cc: Mel Gorman Cc: "David S. Miller" Cc: Andrea Arcangeli Cc: Heiko Carstens Cc: Hugh Dickins Cc: Ingo Molnar Cc: Jeff Dike Cc: Martin Schwidefsky Cc: Mel Gorman Cc: Nadav Amit Cc: Russell King Cc: Tony Luck Cc: Yoshinori Sato Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/linux/mm_types.h | 14 ++++++++++---- 1 file changed, 10 insertions(+), 4 deletions(-) (limited to 'include') diff --git a/include/linux/mm_types.h b/include/linux/mm_types.h index f58f76ee1dfa..0e478ebd2706 100644 --- a/include/linux/mm_types.h +++ b/include/linux/mm_types.h @@ -526,12 +526,12 @@ static inline cpumask_t *mm_cpumask(struct mm_struct *mm) /* * Memory barriers to keep this state in sync are graciously provided by * the page table locks, outside of which no page table modifications happen. - * The barriers below prevent the compiler from re-ordering the instructions - * around the memory barriers that are already present in the code. + * The barriers are used to ensure the order between tlb_flush_pending updates, + * which happen while the lock is not taken, and the PTE updates, which happen + * while the lock is taken, are serialized. */ static inline bool mm_tlb_flush_pending(struct mm_struct *mm) { - barrier(); return atomic_read(&mm->tlb_flush_pending) > 0; } @@ -554,7 +554,13 @@ static inline void inc_tlb_flush_pending(struct mm_struct *mm) /* Clearing is done after a TLB flush, which also provides a barrier. */ static inline void dec_tlb_flush_pending(struct mm_struct *mm) { - barrier(); + /* + * Guarantee that the tlb_flush_pending does not not leak into the + * critical section, since we must order the PTE change and changes to + * the pending TLB flush indication. We could have relied on TLB flush + * as a memory barrier, but this behavior is not clearly documented. + */ + smp_mb__before_atomic(); atomic_dec(&mm->tlb_flush_pending); } #else -- cgit v1.2.3-71-gd317 From 56236a59556cfd3bae7bffb7e5f438b5ef0af880 Mon Sep 17 00:00:00 2001 From: Minchan Kim Date: Thu, 10 Aug 2017 15:24:05 -0700 Subject: mm: refactor TLB gathering API This patch is a preparatory patch for solving race problems caused by TLB batch. For that, we will increase/decrease TLB flush pending count of mm_struct whenever tlb_[gather|finish]_mmu is called. Before making it simple, this patch separates architecture specific part and rename it to arch_tlb_[gather|finish]_mmu and generic part just calls it. It shouldn't change any behavior. Link: http://lkml.kernel.org/r/20170802000818.4760-5-namit@vmware.com Signed-off-by: Minchan Kim Signed-off-by: Nadav Amit Acked-by: Mel Gorman Cc: Ingo Molnar Cc: Russell King Cc: Tony Luck Cc: Martin Schwidefsky Cc: "David S. Miller" Cc: Heiko Carstens Cc: Yoshinori Sato Cc: Jeff Dike Cc: Andrea Arcangeli Cc: Andy Lutomirski Cc: Hugh Dickins Cc: Mel Gorman Cc: Nadav Amit Cc: Rik van Riel Cc: Sergey Senozhatsky Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- arch/arm/include/asm/tlb.h | 6 ++++-- arch/ia64/include/asm/tlb.h | 6 ++++-- arch/s390/include/asm/tlb.h | 12 ++++++------ arch/sh/include/asm/tlb.h | 6 ++++-- arch/um/include/asm/tlb.h | 8 +++++--- include/asm-generic/tlb.h | 7 ++++--- include/linux/mm_types.h | 6 ++++++ mm/memory.c | 28 +++++++++++++++++++++------- 8 files changed, 54 insertions(+), 25 deletions(-) (limited to 'include') diff --git a/arch/arm/include/asm/tlb.h b/arch/arm/include/asm/tlb.h index 3f2eb76243e3..7f5b2a2d3861 100644 --- a/arch/arm/include/asm/tlb.h +++ b/arch/arm/include/asm/tlb.h @@ -148,7 +148,8 @@ static inline void tlb_flush_mmu(struct mmu_gather *tlb) } static inline void -tlb_gather_mmu(struct mmu_gather *tlb, struct mm_struct *mm, unsigned long start, unsigned long end) +arch_tlb_gather_mmu(struct mmu_gather *tlb, struct mm_struct *mm, + unsigned long start, unsigned long end) { tlb->mm = mm; tlb->fullmm = !(start | (end+1)); @@ -166,7 +167,8 @@ tlb_gather_mmu(struct mmu_gather *tlb, struct mm_struct *mm, unsigned long start } static inline void -tlb_finish_mmu(struct mmu_gather *tlb, unsigned long start, unsigned long end) +arch_tlb_finish_mmu(struct mmu_gather *tlb, + unsigned long start, unsigned long end) { tlb_flush_mmu(tlb); diff --git a/arch/ia64/include/asm/tlb.h b/arch/ia64/include/asm/tlb.h index fced197b9626..93cadc04ac62 100644 --- a/arch/ia64/include/asm/tlb.h +++ b/arch/ia64/include/asm/tlb.h @@ -168,7 +168,8 @@ static inline void __tlb_alloc_page(struct mmu_gather *tlb) static inline void -tlb_gather_mmu(struct mmu_gather *tlb, struct mm_struct *mm, unsigned long start, unsigned long end) +arch_tlb_gather_mmu(struct mmu_gather *tlb, struct mm_struct *mm, + unsigned long start, unsigned long end) { tlb->mm = mm; tlb->max = ARRAY_SIZE(tlb->local); @@ -185,7 +186,8 @@ tlb_gather_mmu(struct mmu_gather *tlb, struct mm_struct *mm, unsigned long start * collected. */ static inline void -tlb_finish_mmu(struct mmu_gather *tlb, unsigned long start, unsigned long end) +arch_tlb_finish_mmu(struct mmu_gather *tlb, + unsigned long start, unsigned long end) { /* * Note: tlb->nr may be 0 at this point, so we can't rely on tlb->start_addr and diff --git a/arch/s390/include/asm/tlb.h b/arch/s390/include/asm/tlb.h index 7317b3108a88..d574d0820dc8 100644 --- a/arch/s390/include/asm/tlb.h +++ b/arch/s390/include/asm/tlb.h @@ -47,10 +47,9 @@ struct mmu_table_batch { extern void tlb_table_flush(struct mmu_gather *tlb); extern void tlb_remove_table(struct mmu_gather *tlb, void *table); -static inline void tlb_gather_mmu(struct mmu_gather *tlb, - struct mm_struct *mm, - unsigned long start, - unsigned long end) +static inline void +arch_tlb_gather_mmu(struct mmu_gather *tlb, struct mm_struct *mm, + unsigned long start, unsigned long end) { tlb->mm = mm; tlb->start = start; @@ -76,8 +75,9 @@ static inline void tlb_flush_mmu(struct mmu_gather *tlb) tlb_flush_mmu_free(tlb); } -static inline void tlb_finish_mmu(struct mmu_gather *tlb, - unsigned long start, unsigned long end) +static inline void +arch_tlb_finish_mmu(struct mmu_gather *tlb, + unsigned long start, unsigned long end) { tlb_flush_mmu(tlb); } diff --git a/arch/sh/include/asm/tlb.h b/arch/sh/include/asm/tlb.h index 46e0d635e36f..89786560dbd4 100644 --- a/arch/sh/include/asm/tlb.h +++ b/arch/sh/include/asm/tlb.h @@ -36,7 +36,8 @@ static inline void init_tlb_gather(struct mmu_gather *tlb) } static inline void -tlb_gather_mmu(struct mmu_gather *tlb, struct mm_struct *mm, unsigned long start, unsigned long end) +arch_tlb_gather_mmu(struct mmu_gather *tlb, struct mm_struct *mm, + unsigned long start, unsigned long end) { tlb->mm = mm; tlb->start = start; @@ -47,7 +48,8 @@ tlb_gather_mmu(struct mmu_gather *tlb, struct mm_struct *mm, unsigned long start } static inline void -tlb_finish_mmu(struct mmu_gather *tlb, unsigned long start, unsigned long end) +arch_tlb_finish_mmu(struct mmu_gather *tlb, + unsigned long start, unsigned long end) { if (tlb->fullmm) flush_tlb_mm(tlb->mm); diff --git a/arch/um/include/asm/tlb.h b/arch/um/include/asm/tlb.h index 600a2e9bfee2..2a901eca7145 100644 --- a/arch/um/include/asm/tlb.h +++ b/arch/um/include/asm/tlb.h @@ -45,7 +45,8 @@ static inline void init_tlb_gather(struct mmu_gather *tlb) } static inline void -tlb_gather_mmu(struct mmu_gather *tlb, struct mm_struct *mm, unsigned long start, unsigned long end) +arch_tlb_gather_mmu(struct mmu_gather *tlb, struct mm_struct *mm, + unsigned long start, unsigned long end) { tlb->mm = mm; tlb->start = start; @@ -80,12 +81,13 @@ tlb_flush_mmu(struct mmu_gather *tlb) tlb_flush_mmu_free(tlb); } -/* tlb_finish_mmu +/* arch_tlb_finish_mmu * Called at the end of the shootdown operation to free up any resources * that were required. */ static inline void -tlb_finish_mmu(struct mmu_gather *tlb, unsigned long start, unsigned long end) +arch_tlb_finish_mmu(struct mmu_gather *tlb, + unsigned long start, unsigned long end) { tlb_flush_mmu(tlb); diff --git a/include/asm-generic/tlb.h b/include/asm-generic/tlb.h index 8afa4335e5b2..8f71521e7a44 100644 --- a/include/asm-generic/tlb.h +++ b/include/asm-generic/tlb.h @@ -112,10 +112,11 @@ struct mmu_gather { #define HAVE_GENERIC_MMU_GATHER -void tlb_gather_mmu(struct mmu_gather *tlb, struct mm_struct *mm, unsigned long start, unsigned long end); +void arch_tlb_gather_mmu(struct mmu_gather *tlb, + struct mm_struct *mm, unsigned long start, unsigned long end); void tlb_flush_mmu(struct mmu_gather *tlb); -void tlb_finish_mmu(struct mmu_gather *tlb, unsigned long start, - unsigned long end); +void arch_tlb_finish_mmu(struct mmu_gather *tlb, + unsigned long start, unsigned long end); extern bool __tlb_remove_page_size(struct mmu_gather *tlb, struct page *page, int page_size); diff --git a/include/linux/mm_types.h b/include/linux/mm_types.h index 0e478ebd2706..c605f2a3a68e 100644 --- a/include/linux/mm_types.h +++ b/include/linux/mm_types.h @@ -522,6 +522,12 @@ static inline cpumask_t *mm_cpumask(struct mm_struct *mm) return mm->cpu_vm_mask_var; } +struct mmu_gather; +extern void tlb_gather_mmu(struct mmu_gather *tlb, struct mm_struct *mm, + unsigned long start, unsigned long end); +extern void tlb_finish_mmu(struct mmu_gather *tlb, + unsigned long start, unsigned long end); + #if defined(CONFIG_NUMA_BALANCING) || defined(CONFIG_COMPACTION) /* * Memory barriers to keep this state in sync are graciously provided by diff --git a/mm/memory.c b/mm/memory.c index f65beaad319b..34cba5113e06 100644 --- a/mm/memory.c +++ b/mm/memory.c @@ -215,12 +215,8 @@ static bool tlb_next_batch(struct mmu_gather *tlb) return true; } -/* tlb_gather_mmu - * Called to initialize an (on-stack) mmu_gather structure for page-table - * tear-down from @mm. The @fullmm argument is used when @mm is without - * users and we're going to destroy the full address space (exit/execve). - */ -void tlb_gather_mmu(struct mmu_gather *tlb, struct mm_struct *mm, unsigned long start, unsigned long end) +void arch_tlb_gather_mmu(struct mmu_gather *tlb, struct mm_struct *mm, + unsigned long start, unsigned long end) { tlb->mm = mm; @@ -275,7 +271,8 @@ void tlb_flush_mmu(struct mmu_gather *tlb) * Called at the end of the shootdown operation to free up any resources * that were required. */ -void tlb_finish_mmu(struct mmu_gather *tlb, unsigned long start, unsigned long end) +void arch_tlb_finish_mmu(struct mmu_gather *tlb, + unsigned long start, unsigned long end) { struct mmu_gather_batch *batch, *next; @@ -398,6 +395,23 @@ void tlb_remove_table(struct mmu_gather *tlb, void *table) #endif /* CONFIG_HAVE_RCU_TABLE_FREE */ +/* tlb_gather_mmu + * Called to initialize an (on-stack) mmu_gather structure for page-table + * tear-down from @mm. The @fullmm argument is used when @mm is without + * users and we're going to destroy the full address space (exit/execve). + */ +void tlb_gather_mmu(struct mmu_gather *tlb, struct mm_struct *mm, + unsigned long start, unsigned long end) +{ + arch_tlb_gather_mmu(tlb, mm, start, end); +} + +void tlb_finish_mmu(struct mmu_gather *tlb, + unsigned long start, unsigned long end) +{ + arch_tlb_finish_mmu(tlb, start, end); +} + /* * Note: this doesn't free the actual pages themselves. That * has been handled earlier when unmapping all the memory regions. -- cgit v1.2.3-71-gd317 From 0a2dd266dd6b7a31503b5bbe63af05961a6b446d Mon Sep 17 00:00:00 2001 From: Minchan Kim Date: Thu, 10 Aug 2017 15:24:09 -0700 Subject: mm: make tlb_flush_pending global Currently, tlb_flush_pending is used only for CONFIG_[NUMA_BALANCING| COMPACTION] but upcoming patches to solve subtle TLB flush batching problem will use it regardless of compaction/NUMA so this patch doesn't remove the dependency. [akpm@linux-foundation.org: remove more ifdefs from world's ugliest printk statement] Link: http://lkml.kernel.org/r/20170802000818.4760-6-namit@vmware.com Signed-off-by: Minchan Kim Signed-off-by: Nadav Amit Acked-by: Mel Gorman Cc: "David S. Miller" Cc: Andrea Arcangeli Cc: Andy Lutomirski Cc: Heiko Carstens Cc: Hugh Dickins Cc: Ingo Molnar Cc: Jeff Dike Cc: Martin Schwidefsky Cc: Mel Gorman Cc: Nadav Amit Cc: Rik van Riel Cc: Russell King Cc: Sergey Senozhatsky Cc: Tony Luck Cc: Yoshinori Sato Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/linux/mm_types.h | 21 --------------------- mm/debug.c | 4 ---- 2 files changed, 25 deletions(-) (limited to 'include') diff --git a/include/linux/mm_types.h b/include/linux/mm_types.h index c605f2a3a68e..892a7b0196fd 100644 --- a/include/linux/mm_types.h +++ b/include/linux/mm_types.h @@ -487,14 +487,12 @@ struct mm_struct { /* numa_scan_seq prevents two threads setting pte_numa */ int numa_scan_seq; #endif -#if defined(CONFIG_NUMA_BALANCING) || defined(CONFIG_COMPACTION) /* * An operation with batched TLB flushing is going on. Anything that * can move process memory needs to flush the TLB when moving a * PROT_NONE or PROT_NUMA mapped page. */ atomic_t tlb_flush_pending; -#endif #ifdef CONFIG_ARCH_WANT_BATCHED_UNMAP_TLB_FLUSH /* See flush_tlb_batched_pending() */ bool tlb_flush_batched; @@ -528,7 +526,6 @@ extern void tlb_gather_mmu(struct mmu_gather *tlb, struct mm_struct *mm, extern void tlb_finish_mmu(struct mmu_gather *tlb, unsigned long start, unsigned long end); -#if defined(CONFIG_NUMA_BALANCING) || defined(CONFIG_COMPACTION) /* * Memory barriers to keep this state in sync are graciously provided by * the page table locks, outside of which no page table modifications happen. @@ -569,24 +566,6 @@ static inline void dec_tlb_flush_pending(struct mm_struct *mm) smp_mb__before_atomic(); atomic_dec(&mm->tlb_flush_pending); } -#else -static inline bool mm_tlb_flush_pending(struct mm_struct *mm) -{ - return false; -} - -static inline void init_tlb_flush_pending(struct mm_struct *mm) -{ -} - -static inline void inc_tlb_flush_pending(struct mm_struct *mm) -{ -} - -static inline void dec_tlb_flush_pending(struct mm_struct *mm) -{ -} -#endif struct vm_fault; diff --git a/mm/debug.c b/mm/debug.c index d70103bb4731..5715448ab0b5 100644 --- a/mm/debug.c +++ b/mm/debug.c @@ -124,9 +124,7 @@ void dump_mm(const struct mm_struct *mm) #ifdef CONFIG_NUMA_BALANCING "numa_next_scan %lu numa_scan_offset %lu numa_scan_seq %d\n" #endif -#if defined(CONFIG_NUMA_BALANCING) || defined(CONFIG_COMPACTION) "tlb_flush_pending %d\n" -#endif "def_flags: %#lx(%pGv)\n", mm, mm->mmap, mm->vmacache_seqnum, mm->task_size, @@ -158,9 +156,7 @@ void dump_mm(const struct mm_struct *mm) #ifdef CONFIG_NUMA_BALANCING mm->numa_next_scan, mm->numa_scan_offset, mm->numa_scan_seq, #endif -#if defined(CONFIG_NUMA_BALANCING) || defined(CONFIG_COMPACTION) atomic_read(&mm->tlb_flush_pending), -#endif mm->def_flags, &mm->def_flags ); } -- cgit v1.2.3-71-gd317 From 99baac21e4585f4258f919502c6e23f1e5edc98c Mon Sep 17 00:00:00 2001 From: Minchan Kim Date: Thu, 10 Aug 2017 15:24:12 -0700 Subject: mm: fix MADV_[FREE|DONTNEED] TLB flush miss problem Nadav reported parallel MADV_DONTNEED on same range has a stale TLB problem and Mel fixed it[1] and found same problem on MADV_FREE[2]. Quote from Mel Gorman: "The race in question is CPU 0 running madv_free and updating some PTEs while CPU 1 is also running madv_free and looking at the same PTEs. CPU 1 may have writable TLB entries for a page but fail the pte_dirty check (because CPU 0 has updated it already) and potentially fail to flush. Hence, when madv_free on CPU 1 returns, there are still potentially writable TLB entries and the underlying PTE is still present so that a subsequent write does not necessarily propagate the dirty bit to the underlying PTE any more. Reclaim at some unknown time at the future may then see that the PTE is still clean and discard the page even though a write has happened in the meantime. I think this is possible but I could have missed some protection in madv_free that prevents it happening." This patch aims for solving both problems all at once and is ready for other problem with KSM, MADV_FREE and soft-dirty story[3]. TLB batch API(tlb_[gather|finish]_mmu] uses [inc|dec]_tlb_flush_pending and mmu_tlb_flush_pending so that when tlb_finish_mmu is called, we can catch there are parallel threads going on. In that case, forcefully, flush TLB to prevent for user to access memory via stale TLB entry although it fail to gather page table entry. I confirmed this patch works with [4] test program Nadav gave so this patch supersedes "mm: Always flush VMA ranges affected by zap_page_range v2" in current mmotm. NOTE: This patch modifies arch-specific TLB gathering interface(x86, ia64, s390, sh, um). It seems most of architecture are straightforward but s390 need to be careful because tlb_flush_mmu works only if mm->context.flush_mm is set to non-zero which happens only a pte entry really is cleared by ptep_get_and_clear and friends. However, this problem never changes the pte entries but need to flush to prevent memory access from stale tlb. [1] http://lkml.kernel.org/r/20170725101230.5v7gvnjmcnkzzql3@techsingularity.net [2] http://lkml.kernel.org/r/20170725100722.2dxnmgypmwnrfawp@suse.de [3] http://lkml.kernel.org/r/BD3A0EBE-ECF4-41D4-87FA-C755EA9AB6BD@gmail.com [4] https://patchwork.kernel.org/patch/9861621/ [minchan@kernel.org: decrease tlb flush pending count in tlb_finish_mmu] Link: http://lkml.kernel.org/r/20170808080821.GA31730@bbox Link: http://lkml.kernel.org/r/20170802000818.4760-7-namit@vmware.com Signed-off-by: Minchan Kim Signed-off-by: Nadav Amit Reported-by: Nadav Amit Reported-by: Mel Gorman Acked-by: Mel Gorman Cc: Ingo Molnar Cc: Russell King Cc: Tony Luck Cc: Martin Schwidefsky Cc: "David S. Miller" Cc: Heiko Carstens Cc: Yoshinori Sato Cc: Jeff Dike Cc: Andrea Arcangeli Cc: Andy Lutomirski Cc: Hugh Dickins Cc: Mel Gorman Cc: Nadav Amit Cc: Rik van Riel Cc: Sergey Senozhatsky Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- arch/arm/include/asm/tlb.h | 7 ++++++- arch/ia64/include/asm/tlb.h | 4 +++- arch/s390/include/asm/tlb.h | 7 ++++++- arch/sh/include/asm/tlb.h | 4 ++-- arch/um/include/asm/tlb.h | 7 ++++++- include/asm-generic/tlb.h | 2 +- include/linux/mm_types.h | 8 ++++++++ mm/memory.c | 18 ++++++++++++++++-- 8 files changed, 48 insertions(+), 9 deletions(-) (limited to 'include') diff --git a/arch/arm/include/asm/tlb.h b/arch/arm/include/asm/tlb.h index 7f5b2a2d3861..d5562f9ce600 100644 --- a/arch/arm/include/asm/tlb.h +++ b/arch/arm/include/asm/tlb.h @@ -168,8 +168,13 @@ arch_tlb_gather_mmu(struct mmu_gather *tlb, struct mm_struct *mm, static inline void arch_tlb_finish_mmu(struct mmu_gather *tlb, - unsigned long start, unsigned long end) + unsigned long start, unsigned long end, bool force) { + if (force) { + tlb->range_start = start; + tlb->range_end = end; + } + tlb_flush_mmu(tlb); /* keep the page table cache within bounds */ diff --git a/arch/ia64/include/asm/tlb.h b/arch/ia64/include/asm/tlb.h index 93cadc04ac62..cbe5ac3699bf 100644 --- a/arch/ia64/include/asm/tlb.h +++ b/arch/ia64/include/asm/tlb.h @@ -187,8 +187,10 @@ arch_tlb_gather_mmu(struct mmu_gather *tlb, struct mm_struct *mm, */ static inline void arch_tlb_finish_mmu(struct mmu_gather *tlb, - unsigned long start, unsigned long end) + unsigned long start, unsigned long end, bool force) { + if (force) + tlb->need_flush = 1; /* * Note: tlb->nr may be 0 at this point, so we can't rely on tlb->start_addr and * tlb->end_addr. diff --git a/arch/s390/include/asm/tlb.h b/arch/s390/include/asm/tlb.h index d574d0820dc8..2eb8ff0d6fca 100644 --- a/arch/s390/include/asm/tlb.h +++ b/arch/s390/include/asm/tlb.h @@ -77,8 +77,13 @@ static inline void tlb_flush_mmu(struct mmu_gather *tlb) static inline void arch_tlb_finish_mmu(struct mmu_gather *tlb, - unsigned long start, unsigned long end) + unsigned long start, unsigned long end, bool force) { + if (force) { + tlb->start = start; + tlb->end = end; + } + tlb_flush_mmu(tlb); } diff --git a/arch/sh/include/asm/tlb.h b/arch/sh/include/asm/tlb.h index 89786560dbd4..51a8bc967e75 100644 --- a/arch/sh/include/asm/tlb.h +++ b/arch/sh/include/asm/tlb.h @@ -49,9 +49,9 @@ arch_tlb_gather_mmu(struct mmu_gather *tlb, struct mm_struct *mm, static inline void arch_tlb_finish_mmu(struct mmu_gather *tlb, - unsigned long start, unsigned long end) + unsigned long start, unsigned long end, bool force) { - if (tlb->fullmm) + if (tlb->fullmm || force) flush_tlb_mm(tlb->mm); /* keep the page table cache within bounds */ diff --git a/arch/um/include/asm/tlb.h b/arch/um/include/asm/tlb.h index 2a901eca7145..344d95619d03 100644 --- a/arch/um/include/asm/tlb.h +++ b/arch/um/include/asm/tlb.h @@ -87,8 +87,13 @@ tlb_flush_mmu(struct mmu_gather *tlb) */ static inline void arch_tlb_finish_mmu(struct mmu_gather *tlb, - unsigned long start, unsigned long end) + unsigned long start, unsigned long end, bool force) { + if (force) { + tlb->start = start; + tlb->end = end; + tlb->need_flush = 1; + } tlb_flush_mmu(tlb); /* keep the page table cache within bounds */ diff --git a/include/asm-generic/tlb.h b/include/asm-generic/tlb.h index 8f71521e7a44..faddde44de8c 100644 --- a/include/asm-generic/tlb.h +++ b/include/asm-generic/tlb.h @@ -116,7 +116,7 @@ void arch_tlb_gather_mmu(struct mmu_gather *tlb, struct mm_struct *mm, unsigned long start, unsigned long end); void tlb_flush_mmu(struct mmu_gather *tlb); void arch_tlb_finish_mmu(struct mmu_gather *tlb, - unsigned long start, unsigned long end); + unsigned long start, unsigned long end, bool force); extern bool __tlb_remove_page_size(struct mmu_gather *tlb, struct page *page, int page_size); diff --git a/include/linux/mm_types.h b/include/linux/mm_types.h index 892a7b0196fd..3cadee0a3508 100644 --- a/include/linux/mm_types.h +++ b/include/linux/mm_types.h @@ -538,6 +538,14 @@ static inline bool mm_tlb_flush_pending(struct mm_struct *mm) return atomic_read(&mm->tlb_flush_pending) > 0; } +/* + * Returns true if there are two above TLB batching threads in parallel. + */ +static inline bool mm_tlb_flush_nested(struct mm_struct *mm) +{ + return atomic_read(&mm->tlb_flush_pending) > 1; +} + static inline void init_tlb_flush_pending(struct mm_struct *mm) { atomic_set(&mm->tlb_flush_pending, 0); diff --git a/mm/memory.c b/mm/memory.c index 34cba5113e06..e158f7ac6730 100644 --- a/mm/memory.c +++ b/mm/memory.c @@ -272,10 +272,13 @@ void tlb_flush_mmu(struct mmu_gather *tlb) * that were required. */ void arch_tlb_finish_mmu(struct mmu_gather *tlb, - unsigned long start, unsigned long end) + unsigned long start, unsigned long end, bool force) { struct mmu_gather_batch *batch, *next; + if (force) + __tlb_adjust_range(tlb, start, end - start); + tlb_flush_mmu(tlb); /* keep the page table cache within bounds */ @@ -404,12 +407,23 @@ void tlb_gather_mmu(struct mmu_gather *tlb, struct mm_struct *mm, unsigned long start, unsigned long end) { arch_tlb_gather_mmu(tlb, mm, start, end); + inc_tlb_flush_pending(tlb->mm); } void tlb_finish_mmu(struct mmu_gather *tlb, unsigned long start, unsigned long end) { - arch_tlb_finish_mmu(tlb, start, end); + /* + * If there are parallel threads are doing PTE changes on same range + * under non-exclusive lock(e.g., mmap_sem read-side) but defer TLB + * flush by batching, a thread has stable TLB entry can fail to flush + * the TLB by observing pte_none|!pte_dirty, for example so flush TLB + * forcefully if we detect parallel PTE batching threads. + */ + bool force = mm_tlb_flush_nested(tlb->mm); + + arch_tlb_finish_mmu(tlb, start, end, force); + dec_tlb_flush_pending(tlb->mm); } /* -- cgit v1.2.3-71-gd317 From ad729bc9acfb7c47112964b4877ef5404578ed13 Mon Sep 17 00:00:00 2001 From: Andreas Born Date: Thu, 10 Aug 2017 06:41:44 +0200 Subject: bonding: require speed/duplex only for 802.3ad, alb and tlb The patch c4adfc822bf5 ("bonding: make speed, duplex setting consistent with link state") puts the link state to down if bond_update_speed_duplex() cannot retrieve speed and duplex settings. Assumably the patch was written with 802.3ad mode in mind which relies on link speed/duplex settings. For other modes like active-backup these settings are not required. Thus, only for these other modes, this patch reintroduces support for slaves that do not support reporting speed or duplex such as wireless devices. This fixes the regression reported in bug 196547 (https://bugzilla.kernel.org/show_bug.cgi?id=196547). Fixes: c4adfc822bf5 ("bonding: make speed, duplex setting consistent with link state") Signed-off-by: Andreas Born Acked-by: Mahesh Bandewar Signed-off-by: David S. Miller --- drivers/net/bonding/bond_main.c | 6 ++++-- include/net/bonding.h | 5 +++++ 2 files changed, 9 insertions(+), 2 deletions(-) (limited to 'include') diff --git a/drivers/net/bonding/bond_main.c b/drivers/net/bonding/bond_main.c index 9bee6c1c70cc..85bb272d2a34 100644 --- a/drivers/net/bonding/bond_main.c +++ b/drivers/net/bonding/bond_main.c @@ -1569,7 +1569,8 @@ int bond_enslave(struct net_device *bond_dev, struct net_device *slave_dev) new_slave->delay = 0; new_slave->link_failure_count = 0; - if (bond_update_speed_duplex(new_slave)) + if (bond_update_speed_duplex(new_slave) && + bond_needs_speed_duplex(bond)) new_slave->link = BOND_LINK_DOWN; new_slave->last_rx = jiffies - @@ -2140,7 +2141,8 @@ static void bond_miimon_commit(struct bonding *bond) continue; case BOND_LINK_UP: - if (bond_update_speed_duplex(slave)) { + if (bond_update_speed_duplex(slave) && + bond_needs_speed_duplex(bond)) { slave->link = BOND_LINK_DOWN; netdev_warn(bond->dev, "failed to get link speed/duplex for %s\n", diff --git a/include/net/bonding.h b/include/net/bonding.h index b00508d22e0a..b2e68657a216 100644 --- a/include/net/bonding.h +++ b/include/net/bonding.h @@ -277,6 +277,11 @@ static inline bool bond_is_lb(const struct bonding *bond) BOND_MODE(bond) == BOND_MODE_ALB; } +static inline bool bond_needs_speed_duplex(const struct bonding *bond) +{ + return BOND_MODE(bond) == BOND_MODE_8023AD || bond_is_lb(bond); +} + static inline bool bond_is_nondyn_tlb(const struct bonding *bond) { return (BOND_MODE(bond) == BOND_MODE_TLB) && -- cgit v1.2.3-71-gd317 From e4dde4127396f0c8f1c2e11b3ecc5baf4f8628bf Mon Sep 17 00:00:00 2001 From: Daniel Borkmann Date: Fri, 11 Aug 2017 18:31:24 +0200 Subject: net: fix compilation when busy poll is not enabled MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit MIN_NAPI_ID is used in various places outside of CONFIG_NET_RX_BUSY_POLL wrapping, so when it's not set we run into build errors such as: net/core/dev.c: In function 'dev_get_by_napi_id': net/core/dev.c:886:16: error: ‘MIN_NAPI_ID’ undeclared (first use in this function) if (napi_id < MIN_NAPI_ID) ^~~~~~~~~~~ Thus, have MIN_NAPI_ID always defined to fix these errors. Signed-off-by: Daniel Borkmann Signed-off-by: David S. Miller --- include/net/busy_poll.h | 12 ++++++------ 1 file changed, 6 insertions(+), 6 deletions(-) (limited to 'include') diff --git a/include/net/busy_poll.h b/include/net/busy_poll.h index 8ffd434676b7..71c72a939bf8 100644 --- a/include/net/busy_poll.h +++ b/include/net/busy_poll.h @@ -29,18 +29,18 @@ #include #include -#ifdef CONFIG_NET_RX_BUSY_POLL - -struct napi_struct; -extern unsigned int sysctl_net_busy_read __read_mostly; -extern unsigned int sysctl_net_busy_poll __read_mostly; - /* 0 - Reserved to indicate value not set * 1..NR_CPUS - Reserved for sender_cpu * NR_CPUS+1..~0 - Region available for NAPI IDs */ #define MIN_NAPI_ID ((unsigned int)(NR_CPUS + 1)) +#ifdef CONFIG_NET_RX_BUSY_POLL + +struct napi_struct; +extern unsigned int sysctl_net_busy_read __read_mostly; +extern unsigned int sysctl_net_busy_poll __read_mostly; + static inline bool net_busy_loop_on(void) { return sysctl_net_busy_poll; -- cgit v1.2.3-71-gd317 From fd851ba9caa9a63fdbb72a2e6ed5560c0989e999 Mon Sep 17 00:00:00 2001 From: Eric Dumazet Date: Fri, 11 Aug 2017 10:48:53 -0700 Subject: udp: harden copy_linear_skb() syzkaller got crashes with CONFIG_HARDENED_USERCOPY=y configs. Issue here is that recvfrom() can be used with user buffer of Z bytes, and SO_PEEK_OFF of X bytes, from a skb with Y bytes, and following condition : Z < X < Y kernel BUG at mm/usercopy.c:72! invalid opcode: 0000 [#1] SMP KASAN Dumping ftrace buffer: (ftrace buffer empty) Modules linked in: CPU: 0 PID: 2917 Comm: syzkaller842281 Not tainted 4.13.0-rc3+ #16 Hardware name: Google Google Compute Engine/Google Compute Engine, BIOS Google 01/01/2011 task: ffff8801d2fa40c0 task.stack: ffff8801d1fe8000 RIP: 0010:report_usercopy mm/usercopy.c:64 [inline] RIP: 0010:__check_object_size+0x3ad/0x500 mm/usercopy.c:264 RSP: 0018:ffff8801d1fef8a8 EFLAGS: 00010286 RAX: 0000000000000078 RBX: ffffffff847102c0 RCX: 0000000000000000 RDX: 0000000000000078 RSI: 1ffff1003a3fded5 RDI: ffffed003a3fdf09 RBP: ffff8801d1fef998 R08: 0000000000000001 R09: 0000000000000000 R10: 0000000000000000 R11: 0000000000000000 R12: ffff8801d1ea480e R13: fffffffffffffffa R14: ffffffff84710280 R15: dffffc0000000000 FS: 0000000001360880(0000) GS:ffff8801dc000000(0000) knlGS:0000000000000000 CS: 0010 DS: 0000 ES: 0000 CR0: 0000000080050033 CR2: 00000000202ecfe4 CR3: 00000001d1ff8000 CR4: 00000000001406f0 DR0: 0000000000000000 DR1: 0000000000000000 DR2: 0000000000000000 DR3: 0000000000000000 DR6: 00000000fffe0ff0 DR7: 0000000000000400 Call Trace: check_object_size include/linux/thread_info.h:108 [inline] check_copy_size include/linux/thread_info.h:139 [inline] copy_to_iter include/linux/uio.h:105 [inline] copy_linear_skb include/net/udp.h:371 [inline] udpv6_recvmsg+0x1040/0x1af0 net/ipv6/udp.c:395 inet_recvmsg+0x14c/0x5f0 net/ipv4/af_inet.c:793 sock_recvmsg_nosec net/socket.c:792 [inline] sock_recvmsg+0xc9/0x110 net/socket.c:799 SYSC_recvfrom+0x2d6/0x570 net/socket.c:1788 SyS_recvfrom+0x40/0x50 net/socket.c:1760 entry_SYSCALL_64_fastpath+0x1f/0xbe Fixes: b65ac44674dd ("udp: try to avoid 2 cache miss on dequeue") Signed-off-by: Eric Dumazet Cc: Paolo Abeni Signed-off-by: David S. Miller --- include/net/udp.h | 2 ++ 1 file changed, 2 insertions(+) (limited to 'include') diff --git a/include/net/udp.h b/include/net/udp.h index cc8036987dcb..e9b1d1eacb59 100644 --- a/include/net/udp.h +++ b/include/net/udp.h @@ -368,6 +368,8 @@ static inline int copy_linear_skb(struct sk_buff *skb, int len, int off, { int n, copy = len - off; + if (copy < 0) + return -EINVAL; n = copy_to_iter(skb->data + off, copy, to); if (n == copy) return 0; -- cgit v1.2.3-71-gd317 From a99b646afa8a02571ea298bedca6592d818229cd Mon Sep 17 00:00:00 2001 From: dingtianhong Date: Tue, 15 Aug 2017 11:23:23 +0800 Subject: PCI: Disable PCIe Relaxed Ordering if unsupported When bit4 is set in the PCIe Device Control register, it indicates whether the device is permitted to use relaxed ordering. On some platforms using relaxed ordering can have performance issues or due to erratum can cause data-corruption. In such cases devices must avoid using relaxed ordering. The patch adds a new flag PCI_DEV_FLAGS_NO_RELAXED_ORDERING to indicate that Relaxed Ordering (RO) attribute should not be used for Transaction Layer Packets (TLP) targeted towards these affected root complexes. This patch checks if there is any node in the hierarchy that indicates that using relaxed ordering is not safe. In such cases the patch turns off the relaxed ordering by clearing the capability for this device. Signed-off-by: Casey Leedom Signed-off-by: Ding Tianhong Acked-by: Ashok Raj Acked-by: Alexander Duyck Acked-by: Casey Leedom Signed-off-by: David S. Miller --- drivers/pci/probe.c | 43 +++++++++++++++++++++++++++++++++++++++++++ drivers/pci/quirks.c | 11 +++++++++++ include/linux/pci.h | 3 +++ 3 files changed, 57 insertions(+) (limited to 'include') diff --git a/drivers/pci/probe.c b/drivers/pci/probe.c index c31310db0404..e6a917b4acd3 100644 --- a/drivers/pci/probe.c +++ b/drivers/pci/probe.c @@ -1762,6 +1762,48 @@ static void pci_configure_extended_tags(struct pci_dev *dev) PCI_EXP_DEVCTL_EXT_TAG); } +/** + * pcie_relaxed_ordering_enabled - Probe for PCIe relaxed ordering enable + * @dev: PCI device to query + * + * Returns true if the device has enabled relaxed ordering attribute. + */ +bool pcie_relaxed_ordering_enabled(struct pci_dev *dev) +{ + u16 v; + + pcie_capability_read_word(dev, PCI_EXP_DEVCTL, &v); + + return !!(v & PCI_EXP_DEVCTL_RELAX_EN); +} +EXPORT_SYMBOL(pcie_relaxed_ordering_enabled); + +static void pci_configure_relaxed_ordering(struct pci_dev *dev) +{ + struct pci_dev *root; + + /* PCI_EXP_DEVICE_RELAX_EN is RsvdP in VFs */ + if (dev->is_virtfn) + return; + + if (!pcie_relaxed_ordering_enabled(dev)) + return; + + /* + * For now, we only deal with Relaxed Ordering issues with Root + * Ports. Peer-to-Peer DMA is another can of worms. + */ + root = pci_find_pcie_root_port(dev); + if (!root) + return; + + if (root->dev_flags & PCI_DEV_FLAGS_NO_RELAXED_ORDERING) { + pcie_capability_clear_word(dev, PCI_EXP_DEVCTL, + PCI_EXP_DEVCTL_RELAX_EN); + dev_info(&dev->dev, "Disable Relaxed Ordering because the Root Port didn't support it\n"); + } +} + static void pci_configure_device(struct pci_dev *dev) { struct hotplug_params hpp; @@ -1769,6 +1811,7 @@ static void pci_configure_device(struct pci_dev *dev) pci_configure_mps(dev); pci_configure_extended_tags(dev); + pci_configure_relaxed_ordering(dev); memset(&hpp, 0, sizeof(hpp)); ret = pci_get_hp_params(dev, &hpp); diff --git a/drivers/pci/quirks.c b/drivers/pci/quirks.c index 6967c6b4cf6b..61b59bfa7bfc 100644 --- a/drivers/pci/quirks.c +++ b/drivers/pci/quirks.c @@ -4015,6 +4015,17 @@ DECLARE_PCI_FIXUP_CLASS_EARLY(0x1797, 0x6868, PCI_CLASS_NOT_DEFINED, 8, DECLARE_PCI_FIXUP_CLASS_EARLY(0x1797, 0x6869, PCI_CLASS_NOT_DEFINED, 8, quirk_tw686x_class); +/* + * Some devices have problems with Transaction Layer Packets with the Relaxed + * Ordering Attribute set. Such devices should mark themselves and other + * Device Drivers should check before sending TLPs with RO set. + */ +static void quirk_relaxedordering_disable(struct pci_dev *dev) +{ + dev->dev_flags |= PCI_DEV_FLAGS_NO_RELAXED_ORDERING; + dev_info(&dev->dev, "Disable Relaxed Ordering Attributes to avoid PCIe Completion erratum\n"); +} + /* * Per PCIe r3.0, sec 2.2.9, "Completion headers must supply the same * values for the Attribute as were supplied in the header of the diff --git a/include/linux/pci.h b/include/linux/pci.h index 4869e66dd659..29606fb89464 100644 --- a/include/linux/pci.h +++ b/include/linux/pci.h @@ -188,6 +188,8 @@ enum pci_dev_flags { * the direct_complete optimization. */ PCI_DEV_FLAGS_NEEDS_RESUME = (__force pci_dev_flags_t) (1 << 11), + /* Don't use Relaxed Ordering for TLPs directed at this device */ + PCI_DEV_FLAGS_NO_RELAXED_ORDERING = (__force pci_dev_flags_t) (1 << 12), }; enum pci_irq_reroute_variant { @@ -1125,6 +1127,7 @@ bool pci_check_pme_status(struct pci_dev *dev); void pci_pme_wakeup_bus(struct pci_bus *bus); void pci_d3cold_enable(struct pci_dev *dev); void pci_d3cold_disable(struct pci_dev *dev); +bool pcie_relaxed_ordering_enabled(struct pci_dev *dev); /* PCI Virtual Channel */ int pci_save_vc_state(struct pci_dev *dev); -- cgit v1.2.3-71-gd317 From 42b7305905be52e467bbc346b0f2f95ad44eb1a0 Mon Sep 17 00:00:00 2001 From: Al Viro Date: Mon, 14 Aug 2017 21:31:38 +0200 Subject: udp: fix linear skb reception with PEEK_OFF copy_linear_skb() is broken; both of its callers actually expect 'len' to be the amount we are trying to copy, not the offset of the end. Fix it keeping the meanings of arguments in sync with what the callers (both of them) expect. Also restore a saner behavior on EFAULT (i.e. preserving the iov_iter position in case of failure): The commit fd851ba9caa9 ("udp: harden copy_linear_skb()") avoids the more destructive effect of the buggy copy_linear_skb(), e.g. no more invalid memory access, but said function still behaves incorrectly: when peeking with offset it can fail with EINVAL instead of copying the appropriate amount of memory. Reported-by: Sasha Levin Fixes: b65ac44674dd ("udp: try to avoid 2 cache miss on dequeue") Fixes: fd851ba9caa9 ("udp: harden copy_linear_skb()") Signed-off-by: Al Viro Acked-by: Paolo Abeni Tested-by: Sasha Levin Acked-by: Eric Dumazet Signed-off-by: David S. Miller --- include/net/udp.h | 9 ++++----- 1 file changed, 4 insertions(+), 5 deletions(-) (limited to 'include') diff --git a/include/net/udp.h b/include/net/udp.h index e9b1d1eacb59..586de4b811b5 100644 --- a/include/net/udp.h +++ b/include/net/udp.h @@ -366,14 +366,13 @@ static inline bool udp_skb_is_linear(struct sk_buff *skb) static inline int copy_linear_skb(struct sk_buff *skb, int len, int off, struct iov_iter *to) { - int n, copy = len - off; + int n; - if (copy < 0) - return -EINVAL; - n = copy_to_iter(skb->data + off, copy, to); - if (n == copy) + n = copy_to_iter(skb->data + off, len, to); + if (n == len) return 0; + iov_iter_revert(to, n); return -EFAULT; } -- cgit v1.2.3-71-gd317 From 2926a2aa5c14fb2add75e6584845b1c03022235f Mon Sep 17 00:00:00 2001 From: Joerg Roedel Date: Mon, 14 Aug 2017 17:19:26 +0200 Subject: iommu: Fix wrong freeing of iommu_device->dev The struct iommu_device has a 'struct device' embedded into it, not as a pointer, but the whole struct. In the conversion of the iommu drivers to use struct iommu_device it was forgotten that the relase function for that struct device simply calls kfree() on the pointer. This frees memory that was never allocated and causes memory corruption. To fix this issue, use a pointer to struct device instead of embedding the whole struct. This needs some updates in the iommu sysfs code as well as the Intel VT-d and AMD IOMMU driver. Reported-by: Sebastian Ott Fixes: 39ab9555c241 ('iommu: Add sysfs bindings for struct iommu_device') Cc: stable@vger.kernel.org # >= v4.11 Signed-off-by: Joerg Roedel --- drivers/iommu/amd_iommu_types.h | 4 +++- drivers/iommu/intel-iommu.c | 4 +++- drivers/iommu/iommu-sysfs.c | 32 ++++++++++++++++++++------------ include/linux/iommu.h | 12 +++++++++++- 4 files changed, 37 insertions(+), 15 deletions(-) (limited to 'include') diff --git a/drivers/iommu/amd_iommu_types.h b/drivers/iommu/amd_iommu_types.h index 294a409e283b..d6b873b57054 100644 --- a/drivers/iommu/amd_iommu_types.h +++ b/drivers/iommu/amd_iommu_types.h @@ -574,7 +574,9 @@ struct amd_iommu { static inline struct amd_iommu *dev_to_amd_iommu(struct device *dev) { - return container_of(dev, struct amd_iommu, iommu.dev); + struct iommu_device *iommu = dev_to_iommu_device(dev); + + return container_of(iommu, struct amd_iommu, iommu); } #define ACPIHID_UID_LEN 256 diff --git a/drivers/iommu/intel-iommu.c b/drivers/iommu/intel-iommu.c index 687f18f65cea..3e8636f1220e 100644 --- a/drivers/iommu/intel-iommu.c +++ b/drivers/iommu/intel-iommu.c @@ -4736,7 +4736,9 @@ static void intel_disable_iommus(void) static inline struct intel_iommu *dev_to_intel_iommu(struct device *dev) { - return container_of(dev, struct intel_iommu, iommu.dev); + struct iommu_device *iommu_dev = dev_to_iommu_device(dev); + + return container_of(iommu_dev, struct intel_iommu, iommu); } static ssize_t intel_iommu_show_version(struct device *dev, diff --git a/drivers/iommu/iommu-sysfs.c b/drivers/iommu/iommu-sysfs.c index c58351ed61c1..36d1a7ce7fc4 100644 --- a/drivers/iommu/iommu-sysfs.c +++ b/drivers/iommu/iommu-sysfs.c @@ -62,32 +62,40 @@ int iommu_device_sysfs_add(struct iommu_device *iommu, va_list vargs; int ret; - device_initialize(&iommu->dev); + iommu->dev = kzalloc(sizeof(*iommu->dev), GFP_KERNEL); + if (!iommu->dev) + return -ENOMEM; - iommu->dev.class = &iommu_class; - iommu->dev.parent = parent; - iommu->dev.groups = groups; + device_initialize(iommu->dev); + + iommu->dev->class = &iommu_class; + iommu->dev->parent = parent; + iommu->dev->groups = groups; va_start(vargs, fmt); - ret = kobject_set_name_vargs(&iommu->dev.kobj, fmt, vargs); + ret = kobject_set_name_vargs(&iommu->dev->kobj, fmt, vargs); va_end(vargs); if (ret) goto error; - ret = device_add(&iommu->dev); + ret = device_add(iommu->dev); if (ret) goto error; + dev_set_drvdata(iommu->dev, iommu); + return 0; error: - put_device(&iommu->dev); + put_device(iommu->dev); return ret; } void iommu_device_sysfs_remove(struct iommu_device *iommu) { - device_unregister(&iommu->dev); + dev_set_drvdata(iommu->dev, NULL); + device_unregister(iommu->dev); + iommu->dev = NULL; } /* * IOMMU drivers can indicate a device is managed by a given IOMMU using @@ -102,14 +110,14 @@ int iommu_device_link(struct iommu_device *iommu, struct device *link) if (!iommu || IS_ERR(iommu)) return -ENODEV; - ret = sysfs_add_link_to_group(&iommu->dev.kobj, "devices", + ret = sysfs_add_link_to_group(&iommu->dev->kobj, "devices", &link->kobj, dev_name(link)); if (ret) return ret; - ret = sysfs_create_link_nowarn(&link->kobj, &iommu->dev.kobj, "iommu"); + ret = sysfs_create_link_nowarn(&link->kobj, &iommu->dev->kobj, "iommu"); if (ret) - sysfs_remove_link_from_group(&iommu->dev.kobj, "devices", + sysfs_remove_link_from_group(&iommu->dev->kobj, "devices", dev_name(link)); return ret; @@ -121,5 +129,5 @@ void iommu_device_unlink(struct iommu_device *iommu, struct device *link) return; sysfs_remove_link(&link->kobj, "iommu"); - sysfs_remove_link_from_group(&iommu->dev.kobj, "devices", dev_name(link)); + sysfs_remove_link_from_group(&iommu->dev->kobj, "devices", dev_name(link)); } diff --git a/include/linux/iommu.h b/include/linux/iommu.h index 2cb54adc4a33..176f7569d874 100644 --- a/include/linux/iommu.h +++ b/include/linux/iommu.h @@ -240,7 +240,7 @@ struct iommu_device { struct list_head list; const struct iommu_ops *ops; struct fwnode_handle *fwnode; - struct device dev; + struct device *dev; }; int iommu_device_register(struct iommu_device *iommu); @@ -265,6 +265,11 @@ static inline void iommu_device_set_fwnode(struct iommu_device *iommu, iommu->fwnode = fwnode; } +static inline struct iommu_device *dev_to_iommu_device(struct device *dev) +{ + return (struct iommu_device *)dev_get_drvdata(dev); +} + #define IOMMU_GROUP_NOTIFY_ADD_DEVICE 1 /* Device added */ #define IOMMU_GROUP_NOTIFY_DEL_DEVICE 2 /* Pre Device removed */ #define IOMMU_GROUP_NOTIFY_BIND_DRIVER 3 /* Pre Driver bind */ @@ -589,6 +594,11 @@ static inline void iommu_device_set_fwnode(struct iommu_device *iommu, { } +static inline struct iommu_device *dev_to_iommu_device(struct device *dev) +{ + return NULL; +} + static inline void iommu_device_unregister(struct iommu_device *iommu) { } -- cgit v1.2.3-71-gd317 From 12d94a804946af291e24b80fc53ec86264765781 Mon Sep 17 00:00:00 2001 From: Eric Dumazet Date: Tue, 15 Aug 2017 04:09:51 -0700 Subject: ipv6: fix NULL dereference in ip6_route_dev_notify() Based on a syzkaller report [1], I found that a per cpu allocation failure in snmp6_alloc_dev() would then lead to NULL dereference in ip6_route_dev_notify(). It seems this is a very old bug, thus no Fixes tag in this submission. Let's add in6_dev_put_clear() helper, as we will probably use it elsewhere (once available/present in net-next) [1] kasan: CONFIG_KASAN_INLINE enabled kasan: GPF could be caused by NULL-ptr deref or user memory access general protection fault: 0000 [#1] SMP KASAN Dumping ftrace buffer: (ftrace buffer empty) Modules linked in: CPU: 1 PID: 17294 Comm: syz-executor6 Not tainted 4.13.0-rc2+ #10 Hardware name: Google Google Compute Engine/Google Compute Engine, BIOS Google 01/01/2011 task: ffff88019f456680 task.stack: ffff8801c6e58000 RIP: 0010:__read_once_size include/linux/compiler.h:250 [inline] RIP: 0010:atomic_read arch/x86/include/asm/atomic.h:26 [inline] RIP: 0010:refcount_sub_and_test+0x7d/0x1b0 lib/refcount.c:178 RSP: 0018:ffff8801c6e5f1b0 EFLAGS: 00010202 RAX: 0000000000000037 RBX: dffffc0000000000 RCX: ffffc90005d25000 RDX: ffff8801c6e5f218 RSI: ffffffff82342bbf RDI: 0000000000000001 RBP: ffff8801c6e5f240 R08: 0000000000000001 R09: 0000000000000000 R10: 0000000000000000 R11: 0000000000000000 R12: 1ffff10038dcbe37 R13: 0000000000000006 R14: 0000000000000001 R15: 00000000000001b8 FS: 00007f21e0429700(0000) GS:ffff8801dc100000(0000) knlGS:0000000000000000 CS: 0010 DS: 0000 ES: 0000 CR0: 0000000080050033 CR2: 0000001ddbc22000 CR3: 00000001d632b000 CR4: 00000000001426e0 DR0: 0000000020000000 DR1: 0000000000000000 DR2: 0000000000000000 DR3: 0000000000000000 DR6: 00000000ffff0ff0 DR7: 0000000000000600 Call Trace: refcount_dec_and_test+0x1a/0x20 lib/refcount.c:211 in6_dev_put include/net/addrconf.h:335 [inline] ip6_route_dev_notify+0x1c9/0x4a0 net/ipv6/route.c:3732 notifier_call_chain+0x136/0x2c0 kernel/notifier.c:93 __raw_notifier_call_chain kernel/notifier.c:394 [inline] raw_notifier_call_chain+0x2d/0x40 kernel/notifier.c:401 call_netdevice_notifiers_info+0x51/0x90 net/core/dev.c:1678 call_netdevice_notifiers net/core/dev.c:1694 [inline] rollback_registered_many+0x91c/0xe80 net/core/dev.c:7107 rollback_registered+0x1be/0x3c0 net/core/dev.c:7149 register_netdevice+0xbcd/0xee0 net/core/dev.c:7587 register_netdev+0x1a/0x30 net/core/dev.c:7669 loopback_net_init+0x76/0x160 drivers/net/loopback.c:214 ops_init+0x10a/0x570 net/core/net_namespace.c:118 setup_net+0x313/0x710 net/core/net_namespace.c:294 copy_net_ns+0x27c/0x580 net/core/net_namespace.c:418 create_new_namespaces+0x425/0x880 kernel/nsproxy.c:107 unshare_nsproxy_namespaces+0xae/0x1e0 kernel/nsproxy.c:206 SYSC_unshare kernel/fork.c:2347 [inline] SyS_unshare+0x653/0xfa0 kernel/fork.c:2297 entry_SYSCALL_64_fastpath+0x1f/0xbe RIP: 0033:0x4512c9 RSP: 002b:00007f21e0428c08 EFLAGS: 00000216 ORIG_RAX: 0000000000000110 RAX: ffffffffffffffda RBX: 0000000000718150 RCX: 00000000004512c9 RDX: 0000000000000000 RSI: 0000000000000000 RDI: 0000000062020200 RBP: 0000000000000086 R08: 0000000000000000 R09: 0000000000000000 R10: 0000000000000000 R11: 0000000000000216 R12: 00000000004b973d R13: 00000000ffffffff R14: 000000002001d000 R15: 00000000000002dd Code: 50 2b 34 82 c7 00 f1 f1 f1 f1 c7 40 04 04 f2 f2 f2 c7 40 08 f3 f3 f3 f3 e8 a1 43 39 ff 4c 89 f8 48 8b 95 70 ff ff ff 48 c1 e8 03 <0f> b6 0c 18 4c 89 f8 83 e0 07 83 c0 03 38 c8 7c 08 84 c9 0f 85 RIP: __read_once_size include/linux/compiler.h:250 [inline] RSP: ffff8801c6e5f1b0 RIP: atomic_read arch/x86/include/asm/atomic.h:26 [inline] RSP: ffff8801c6e5f1b0 RIP: refcount_sub_and_test+0x7d/0x1b0 lib/refcount.c:178 RSP: ffff8801c6e5f1b0 ---[ end trace e441d046c6410d31 ]--- Signed-off-by: Eric Dumazet Reported-by: Dmitry Vyukov Signed-off-by: David S. Miller --- include/net/addrconf.h | 10 ++++++++++ net/ipv6/route.c | 6 +++--- 2 files changed, 13 insertions(+), 3 deletions(-) (limited to 'include') diff --git a/include/net/addrconf.h b/include/net/addrconf.h index 6df79e96a780..f44ff2476758 100644 --- a/include/net/addrconf.h +++ b/include/net/addrconf.h @@ -336,6 +336,16 @@ static inline void in6_dev_put(struct inet6_dev *idev) in6_dev_finish_destroy(idev); } +static inline void in6_dev_put_clear(struct inet6_dev **pidev) +{ + struct inet6_dev *idev = *pidev; + + if (idev) { + in6_dev_put(idev); + *pidev = NULL; + } +} + static inline void __in6_dev_put(struct inet6_dev *idev) { refcount_dec(&idev->refcnt); diff --git a/net/ipv6/route.c b/net/ipv6/route.c index 99d4727f2b18..94d6a13d47f0 100644 --- a/net/ipv6/route.c +++ b/net/ipv6/route.c @@ -3721,10 +3721,10 @@ static int ip6_route_dev_notify(struct notifier_block *this, /* NETDEV_UNREGISTER could be fired for multiple times by * netdev_wait_allrefs(). Make sure we only call this once. */ - in6_dev_put(net->ipv6.ip6_null_entry->rt6i_idev); + in6_dev_put_clear(&net->ipv6.ip6_null_entry->rt6i_idev); #ifdef CONFIG_IPV6_MULTIPLE_TABLES - in6_dev_put(net->ipv6.ip6_prohibit_entry->rt6i_idev); - in6_dev_put(net->ipv6.ip6_blk_hole_entry->rt6i_idev); + in6_dev_put_clear(&net->ipv6.ip6_prohibit_entry->rt6i_idev); + in6_dev_put_clear(&net->ipv6.ip6_blk_hole_entry->rt6i_idev); #endif } -- cgit v1.2.3-71-gd317 From b3dc8f772fab5b2d284b780830fd56494491e493 Mon Sep 17 00:00:00 2001 From: Tonghao Zhang Date: Tue, 15 Aug 2017 04:28:54 -0700 Subject: net: Fix a typo in comment about sock flags. Signed-off-by: Tonghao Zhang Signed-off-by: David S. Miller --- include/linux/net.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'include') diff --git a/include/linux/net.h b/include/linux/net.h index dda2cc939a53..ebeb48c92005 100644 --- a/include/linux/net.h +++ b/include/linux/net.h @@ -37,7 +37,7 @@ struct net; /* Historically, SOCKWQ_ASYNC_NOSPACE & SOCKWQ_ASYNC_WAITDATA were located * in sock->flags, but moved into sk->sk_wq->flags to be RCU protected. - * Eventually all flags will be in sk->sk_wq_flags. + * Eventually all flags will be in sk->sk_wq->flags. */ #define SOCKWQ_ASYNC_NOSPACE 0 #define SOCKWQ_ASYNC_WAITDATA 1 -- cgit v1.2.3-71-gd317 From 81fbfe8adaf38d4f5a98c19bebfd41c5d6acaee8 Mon Sep 17 00:00:00 2001 From: Eric Dumazet Date: Wed, 16 Aug 2017 10:36:47 -0700 Subject: ptr_ring: use kmalloc_array() As found by syzkaller, malicious users can set whatever tx_queue_len on a tun device and eventually crash the kernel. Lets remove the ALIGN(XXX, SMP_CACHE_BYTES) thing since a small ring buffer is not fast anyway. Fixes: 2e0ab8ca83c1 ("ptr_ring: array based FIFO for pointers") Signed-off-by: Eric Dumazet Reported-by: Dmitry Vyukov Cc: Michael S. Tsirkin Cc: Jason Wang Signed-off-by: David S. Miller --- include/linux/ptr_ring.h | 9 +++++---- include/linux/skb_array.h | 3 ++- 2 files changed, 7 insertions(+), 5 deletions(-) (limited to 'include') diff --git a/include/linux/ptr_ring.h b/include/linux/ptr_ring.h index d8c97ec8a8e6..37b4bb2545b3 100644 --- a/include/linux/ptr_ring.h +++ b/include/linux/ptr_ring.h @@ -436,9 +436,9 @@ static inline int ptr_ring_consume_batched_bh(struct ptr_ring *r, __PTR_RING_PEEK_CALL_v; \ }) -static inline void **__ptr_ring_init_queue_alloc(int size, gfp_t gfp) +static inline void **__ptr_ring_init_queue_alloc(unsigned int size, gfp_t gfp) { - return kzalloc(ALIGN(size * sizeof(void *), SMP_CACHE_BYTES), gfp); + return kcalloc(size, sizeof(void *), gfp); } static inline void __ptr_ring_set_size(struct ptr_ring *r, int size) @@ -582,7 +582,8 @@ static inline int ptr_ring_resize(struct ptr_ring *r, int size, gfp_t gfp, * In particular if you consume ring in interrupt or BH context, you must * disable interrupts/BH when doing so. */ -static inline int ptr_ring_resize_multiple(struct ptr_ring **rings, int nrings, +static inline int ptr_ring_resize_multiple(struct ptr_ring **rings, + unsigned int nrings, int size, gfp_t gfp, void (*destroy)(void *)) { @@ -590,7 +591,7 @@ static inline int ptr_ring_resize_multiple(struct ptr_ring **rings, int nrings, void ***queues; int i; - queues = kmalloc(nrings * sizeof *queues, gfp); + queues = kmalloc_array(nrings, sizeof(*queues), gfp); if (!queues) goto noqueues; diff --git a/include/linux/skb_array.h b/include/linux/skb_array.h index 35226cd4efb0..8621ffdeecbf 100644 --- a/include/linux/skb_array.h +++ b/include/linux/skb_array.h @@ -193,7 +193,8 @@ static inline int skb_array_resize(struct skb_array *a, int size, gfp_t gfp) } static inline int skb_array_resize_multiple(struct skb_array **rings, - int nrings, int size, gfp_t gfp) + int nrings, unsigned int size, + gfp_t gfp) { BUILD_BUG_ON(offsetof(struct skb_array, ring)); return ptr_ring_resize_multiple((struct ptr_ring **)rings, -- cgit v1.2.3-71-gd317 From c780a049f9bf442314335372c9abc4548bfe3e44 Mon Sep 17 00:00:00 2001 From: Eric Dumazet Date: Wed, 16 Aug 2017 11:09:12 -0700 Subject: ipv4: better IP_MAX_MTU enforcement While working on yet another syzkaller report, I found that our IP_MAX_MTU enforcements were not properly done. gcc seems to reload dev->mtu for min(dev->mtu, IP_MAX_MTU), and final result can be bigger than IP_MAX_MTU :/ This is a problem because device mtu can be changed on other cpus or threads. While this patch does not fix the issue I am working on, it is probably worth addressing it. Signed-off-by: Eric Dumazet Signed-off-by: David S. Miller --- include/net/ip.h | 4 ++-- net/ipv4/route.c | 2 +- 2 files changed, 3 insertions(+), 3 deletions(-) (limited to 'include') diff --git a/include/net/ip.h b/include/net/ip.h index 821cedcc8e73..0cf7f5a65fe6 100644 --- a/include/net/ip.h +++ b/include/net/ip.h @@ -352,7 +352,7 @@ static inline unsigned int ip_dst_mtu_maybe_forward(const struct dst_entry *dst, !forwarding) return dst_mtu(dst); - return min(dst->dev->mtu, IP_MAX_MTU); + return min(READ_ONCE(dst->dev->mtu), IP_MAX_MTU); } static inline unsigned int ip_skb_dst_mtu(struct sock *sk, @@ -364,7 +364,7 @@ static inline unsigned int ip_skb_dst_mtu(struct sock *sk, return ip_dst_mtu_maybe_forward(skb_dst(skb), forwarding); } - return min(skb_dst(skb)->dev->mtu, IP_MAX_MTU); + return min(READ_ONCE(skb_dst(skb)->dev->mtu), IP_MAX_MTU); } u32 ip_idents_reserve(u32 hash, int segs); diff --git a/net/ipv4/route.c b/net/ipv4/route.c index 7effa62beed3..fe877a4a72b1 100644 --- a/net/ipv4/route.c +++ b/net/ipv4/route.c @@ -1267,7 +1267,7 @@ static unsigned int ipv4_mtu(const struct dst_entry *dst) if (mtu) return mtu; - mtu = dst->dev->mtu; + mtu = READ_ONCE(dst->dev->mtu); if (unlikely(dst_metric_locked(dst, RTAX_MTU))) { if (rt->rt_uses_gateway && mtu > 576) -- cgit v1.2.3-71-gd317 From 70e42fd02c46e2aa9ab07b766d418637e3a51de7 Mon Sep 17 00:00:00 2001 From: Damien Le Moal Date: Wed, 9 Aug 2017 12:00:22 +0900 Subject: scsi: sd_zbc: Write unlock zone from sd_uninit_cmnd() Releasing a zone write lock only when the write commnand that acquired the lock completes can cause deadlocks due to potential command reordering if the lock owning request is requeued and not executed. This problem exists only with the scsi-mq path as, unlike the legacy path, requests are moved out of the dispatch queue before being prepared and so before locking a zone for a write command. Since sd_uninit_cmnd() is now always called when a request is requeued, call sd_zbc_write_unlock_zone() from that function for write requests that acquired a zone lock instead of from sd_done(). Acquisition of a zone lock by a write command is indicated using the new command flag SCMD_ZONE_WRITE_LOCK. Signed-off-by: Damien Le Moal Reviewed-by: Christoph Hellwig Reviewed-by: Bart Van Assche Signed-off-by: Martin K. Petersen --- drivers/scsi/sd.c | 3 +++ drivers/scsi/sd_zbc.c | 9 +++++---- include/scsi/scsi_cmnd.h | 1 + 3 files changed, 9 insertions(+), 4 deletions(-) (limited to 'include') diff --git a/drivers/scsi/sd.c b/drivers/scsi/sd.c index bea36adeee17..e2647f2d4430 100644 --- a/drivers/scsi/sd.c +++ b/drivers/scsi/sd.c @@ -1277,6 +1277,9 @@ static void sd_uninit_command(struct scsi_cmnd *SCpnt) { struct request *rq = SCpnt->request; + if (SCpnt->flags & SCMD_ZONE_WRITE_LOCK) + sd_zbc_write_unlock_zone(SCpnt); + if (rq->rq_flags & RQF_SPECIAL_PAYLOAD) __free_page(rq->special_vec.bv_page); diff --git a/drivers/scsi/sd_zbc.c b/drivers/scsi/sd_zbc.c index 96855df9f49d..8aa54779aac1 100644 --- a/drivers/scsi/sd_zbc.c +++ b/drivers/scsi/sd_zbc.c @@ -294,6 +294,9 @@ int sd_zbc_write_lock_zone(struct scsi_cmnd *cmd) test_and_set_bit(zno, sdkp->zones_wlock)) return BLKPREP_DEFER; + WARN_ON_ONCE(cmd->flags & SCMD_ZONE_WRITE_LOCK); + cmd->flags |= SCMD_ZONE_WRITE_LOCK; + return BLKPREP_OK; } @@ -302,9 +305,10 @@ void sd_zbc_write_unlock_zone(struct scsi_cmnd *cmd) struct request *rq = cmd->request; struct scsi_disk *sdkp = scsi_disk(rq->rq_disk); - if (sdkp->zones_wlock) { + if (sdkp->zones_wlock && cmd->flags & SCMD_ZONE_WRITE_LOCK) { unsigned int zno = sd_zbc_zone_no(sdkp, blk_rq_pos(rq)); WARN_ON_ONCE(!test_bit(zno, sdkp->zones_wlock)); + cmd->flags &= ~SCMD_ZONE_WRITE_LOCK; clear_bit_unlock(zno, sdkp->zones_wlock); smp_mb__after_atomic(); } @@ -335,9 +339,6 @@ void sd_zbc_complete(struct scsi_cmnd *cmd, case REQ_OP_WRITE_ZEROES: case REQ_OP_WRITE_SAME: - /* Unlock the zone */ - sd_zbc_write_unlock_zone(cmd); - if (result && sshdr->sense_key == ILLEGAL_REQUEST && sshdr->asc == 0x21) diff --git a/include/scsi/scsi_cmnd.h b/include/scsi/scsi_cmnd.h index a1266d318c85..6af198d8120b 100644 --- a/include/scsi/scsi_cmnd.h +++ b/include/scsi/scsi_cmnd.h @@ -57,6 +57,7 @@ struct scsi_pointer { /* for scmd->flags */ #define SCMD_TAGGED (1 << 0) #define SCMD_UNCHECKED_ISA_DMA (1 << 1) +#define SCMD_ZONE_WRITE_LOCK (1 << 2) struct scsi_cmnd { struct scsi_request req; -- cgit v1.2.3-71-gd317 From c8c03f1858331e85d397bacccd34ef409aae993c Mon Sep 17 00:00:00 2001 From: Linus Torvalds Date: Wed, 16 Aug 2017 17:08:07 -0700 Subject: pty: fix the cached path of the pty slave file descriptor in the master Christian Brauner reported that if you use the TIOCGPTPEER ioctl() to get a slave pty file descriptor, the resulting file descriptor doesn't look right in /proc//fd/. In particular, he wanted to use readlink() on /proc/self/fd/ to get the pathname of the slave pty (basically implementing "ptsname{_r}()"). The reason for that was that we had generated the wrong 'struct path' when we create the pty in ptmx_open(). In particular, the dentry was correct, but the vfsmount pointed to the mount of the ptmx node. That _can_ be correct - in case you use "/dev/pts/ptmx" to open the master - but usually is not. The normal case is to use /dev/ptmx, which then looks up the pts/ directory, and then the vfsmount of the ptmx node is obviously the /dev directory, not the /dev/pts/ directory. We actually did have the right vfsmount available, but in the wrong place (it gets looked up in 'devpts_acquire()' when we get a reference to the pts filesystem), and so ptmx_open() used the wrong mnt pointer. The end result of this confusion was that the pty worked fine, but when if you did TIOCGPTPEER to get the slave side of the pty, end end result would also work, but have that dodgy 'struct path'. And then when doing "d_path()" on to get the pathname, the vfsmount would not match the root of the pts directory, and d_path() would return an empty pathname thinking that the entry had escaped a bind mount into another mount. This fixes the problem by making devpts_acquire() return the vfsmount for the pts filesystem, allowing ptmx_open() to trivially just use the right mount for the pts dentry, and create the proper 'struct path'. Reported-by: Christian Brauner Cc: Al Viro Acked-by: Eric Biederman Signed-off-by: Linus Torvalds --- drivers/tty/pty.c | 7 +++++-- fs/devpts/inode.c | 4 +++- include/linux/devpts_fs.h | 2 +- 3 files changed, 9 insertions(+), 4 deletions(-) (limited to 'include') diff --git a/drivers/tty/pty.c b/drivers/tty/pty.c index 284749fb0f6b..1fc80ea87c13 100644 --- a/drivers/tty/pty.c +++ b/drivers/tty/pty.c @@ -793,6 +793,7 @@ static int ptmx_open(struct inode *inode, struct file *filp) struct tty_struct *tty; struct path *pts_path; struct dentry *dentry; + struct vfsmount *mnt; int retval; int index; @@ -805,7 +806,7 @@ static int ptmx_open(struct inode *inode, struct file *filp) if (retval) return retval; - fsi = devpts_acquire(filp); + fsi = devpts_acquire(filp, &mnt); if (IS_ERR(fsi)) { retval = PTR_ERR(fsi); goto out_free_file; @@ -849,7 +850,7 @@ static int ptmx_open(struct inode *inode, struct file *filp) pts_path = kmalloc(sizeof(struct path), GFP_KERNEL); if (!pts_path) goto err_release; - pts_path->mnt = filp->f_path.mnt; + pts_path->mnt = mnt; pts_path->dentry = dentry; path_get(pts_path); tty->link->driver_data = pts_path; @@ -866,6 +867,7 @@ err_path_put: path_put(pts_path); kfree(pts_path); err_release: + mntput(mnt); tty_unlock(tty); // This will also put-ref the fsi tty_release(inode, filp); @@ -874,6 +876,7 @@ out: devpts_kill_index(fsi, index); out_put_fsi: devpts_release(fsi); + mntput(mnt); out_free_file: tty_free_file(filp); return retval; diff --git a/fs/devpts/inode.c b/fs/devpts/inode.c index 108df2e3602c..44dfbca9306f 100644 --- a/fs/devpts/inode.c +++ b/fs/devpts/inode.c @@ -133,7 +133,7 @@ static inline struct pts_fs_info *DEVPTS_SB(struct super_block *sb) return sb->s_fs_info; } -struct pts_fs_info *devpts_acquire(struct file *filp) +struct pts_fs_info *devpts_acquire(struct file *filp, struct vfsmount **ptsmnt) { struct pts_fs_info *result; struct path path; @@ -142,6 +142,7 @@ struct pts_fs_info *devpts_acquire(struct file *filp) path = filp->f_path; path_get(&path); + *ptsmnt = NULL; /* Has the devpts filesystem already been found? */ sb = path.mnt->mnt_sb; @@ -165,6 +166,7 @@ struct pts_fs_info *devpts_acquire(struct file *filp) * pty code needs to hold extra references in case of last /dev/tty close */ atomic_inc(&sb->s_active); + *ptsmnt = mntget(path.mnt); result = DEVPTS_SB(sb); out: diff --git a/include/linux/devpts_fs.h b/include/linux/devpts_fs.h index 277ab9af9ac2..7883e901f65c 100644 --- a/include/linux/devpts_fs.h +++ b/include/linux/devpts_fs.h @@ -19,7 +19,7 @@ struct pts_fs_info; -struct pts_fs_info *devpts_acquire(struct file *); +struct pts_fs_info *devpts_acquire(struct file *, struct vfsmount **ptsmnt); void devpts_release(struct pts_fs_info *); int devpts_new_index(struct pts_fs_info *); -- cgit v1.2.3-71-gd317 From 7edaeb6841dfb27e362288ab8466ebdc4972e867 Mon Sep 17 00:00:00 2001 From: Thomas Gleixner Date: Tue, 15 Aug 2017 09:50:13 +0200 Subject: kernel/watchdog: Prevent false positives with turbo modes The hardlockup detector on x86 uses a performance counter based on unhalted CPU cycles and a periodic hrtimer. The hrtimer period is about 2/5 of the performance counter period, so the hrtimer should fire 2-3 times before the performance counter NMI fires. The NMI code checks whether the hrtimer fired since the last invocation. If not, it assumess a hard lockup. The calculation of those periods is based on the nominal CPU frequency. Turbo modes increase the CPU clock frequency and therefore shorten the period of the perf/NMI watchdog. With extreme Turbo-modes (3x nominal frequency) the perf/NMI period is shorter than the hrtimer period which leads to false positives. A simple fix would be to shorten the hrtimer period, but that comes with the side effect of more frequent hrtimer and softlockup thread wakeups, which is not desired. Implement a low pass filter, which checks the perf/NMI period against kernel time. If the perf/NMI fires before 4/5 of the watchdog period has elapsed then the event is ignored and postponed to the next perf/NMI. That solves the problem and avoids the overhead of shorter hrtimer periods and more frequent softlockup thread wakeups. Fixes: 58687acba592 ("lockup_detector: Combine nmi_watchdog and softlockup detector") Reported-and-tested-by: Kan Liang Signed-off-by: Thomas Gleixner Cc: dzickus@redhat.com Cc: prarit@redhat.com Cc: ak@linux.intel.com Cc: babu.moger@oracle.com Cc: peterz@infradead.org Cc: eranian@google.com Cc: acme@redhat.com Cc: stable@vger.kernel.org Cc: atomlin@redhat.com Cc: akpm@linux-foundation.org Cc: torvalds@linux-foundation.org Link: http://lkml.kernel.org/r/alpine.DEB.2.20.1708150931310.1886@nanos --- arch/x86/Kconfig | 1 + include/linux/nmi.h | 8 +++++++ kernel/watchdog.c | 1 + kernel/watchdog_hld.c | 59 +++++++++++++++++++++++++++++++++++++++++++++++++++ lib/Kconfig.debug | 7 ++++++ 5 files changed, 76 insertions(+) (limited to 'include') diff --git a/arch/x86/Kconfig b/arch/x86/Kconfig index 781521b7cf9e..9101bfc85539 100644 --- a/arch/x86/Kconfig +++ b/arch/x86/Kconfig @@ -100,6 +100,7 @@ config X86 select GENERIC_STRNCPY_FROM_USER select GENERIC_STRNLEN_USER select GENERIC_TIME_VSYSCALL + select HARDLOCKUP_CHECK_TIMESTAMP if X86_64 select HAVE_ACPI_APEI if ACPI select HAVE_ACPI_APEI_NMI if ACPI select HAVE_ALIGNED_STRUCT_PAGE if SLUB diff --git a/include/linux/nmi.h b/include/linux/nmi.h index 8aa01fd859fb..a36abe2da13e 100644 --- a/include/linux/nmi.h +++ b/include/linux/nmi.h @@ -168,6 +168,14 @@ extern int sysctl_hardlockup_all_cpu_backtrace; #define sysctl_softlockup_all_cpu_backtrace 0 #define sysctl_hardlockup_all_cpu_backtrace 0 #endif + +#if defined(CONFIG_HARDLOCKUP_CHECK_TIMESTAMP) && \ + defined(CONFIG_HARDLOCKUP_DETECTOR) +void watchdog_update_hrtimer_threshold(u64 period); +#else +static inline void watchdog_update_hrtimer_threshold(u64 period) { } +#endif + extern bool is_hardlockup(void); struct ctl_table; extern int proc_watchdog(struct ctl_table *, int , diff --git a/kernel/watchdog.c b/kernel/watchdog.c index 06d3389bca0d..f5d52024f6b7 100644 --- a/kernel/watchdog.c +++ b/kernel/watchdog.c @@ -240,6 +240,7 @@ static void set_sample_period(void) * hardlockup detector generates a warning */ sample_period = get_softlockup_thresh() * ((u64)NSEC_PER_SEC / 5); + watchdog_update_hrtimer_threshold(sample_period); } /* Commands for resetting the watchdog */ diff --git a/kernel/watchdog_hld.c b/kernel/watchdog_hld.c index 295a0d84934c..3a09ea1b1d3d 100644 --- a/kernel/watchdog_hld.c +++ b/kernel/watchdog_hld.c @@ -37,6 +37,62 @@ void arch_touch_nmi_watchdog(void) } EXPORT_SYMBOL(arch_touch_nmi_watchdog); +#ifdef CONFIG_HARDLOCKUP_CHECK_TIMESTAMP +static DEFINE_PER_CPU(ktime_t, last_timestamp); +static DEFINE_PER_CPU(unsigned int, nmi_rearmed); +static ktime_t watchdog_hrtimer_sample_threshold __read_mostly; + +void watchdog_update_hrtimer_threshold(u64 period) +{ + /* + * The hrtimer runs with a period of (watchdog_threshold * 2) / 5 + * + * So it runs effectively with 2.5 times the rate of the NMI + * watchdog. That means the hrtimer should fire 2-3 times before + * the NMI watchdog expires. The NMI watchdog on x86 is based on + * unhalted CPU cycles, so if Turbo-Mode is enabled the CPU cycles + * might run way faster than expected and the NMI fires in a + * smaller period than the one deduced from the nominal CPU + * frequency. Depending on the Turbo-Mode factor this might be fast + * enough to get the NMI period smaller than the hrtimer watchdog + * period and trigger false positives. + * + * The sample threshold is used to check in the NMI handler whether + * the minimum time between two NMI samples has elapsed. That + * prevents false positives. + * + * Set this to 4/5 of the actual watchdog threshold period so the + * hrtimer is guaranteed to fire at least once within the real + * watchdog threshold. + */ + watchdog_hrtimer_sample_threshold = period * 2; +} + +static bool watchdog_check_timestamp(void) +{ + ktime_t delta, now = ktime_get_mono_fast_ns(); + + delta = now - __this_cpu_read(last_timestamp); + if (delta < watchdog_hrtimer_sample_threshold) { + /* + * If ktime is jiffies based, a stalled timer would prevent + * jiffies from being incremented and the filter would look + * at a stale timestamp and never trigger. + */ + if (__this_cpu_inc_return(nmi_rearmed) < 10) + return false; + } + __this_cpu_write(nmi_rearmed, 0); + __this_cpu_write(last_timestamp, now); + return true; +} +#else +static inline bool watchdog_check_timestamp(void) +{ + return true; +} +#endif + static struct perf_event_attr wd_hw_attr = { .type = PERF_TYPE_HARDWARE, .config = PERF_COUNT_HW_CPU_CYCLES, @@ -61,6 +117,9 @@ static void watchdog_overflow_callback(struct perf_event *event, return; } + if (!watchdog_check_timestamp()) + return; + /* check for a hardlockup * This is done by making sure our timer interrupt * is incrementing. The timer interrupt should have diff --git a/lib/Kconfig.debug b/lib/Kconfig.debug index 98fe715522e8..c617b9d1d6cb 100644 --- a/lib/Kconfig.debug +++ b/lib/Kconfig.debug @@ -797,6 +797,13 @@ config HARDLOCKUP_DETECTOR_PERF bool select SOFTLOCKUP_DETECTOR +# +# Enables a timestamp based low pass filter to compensate for perf based +# hard lockup detection which runs too fast due to turbo modes. +# +config HARDLOCKUP_CHECK_TIMESTAMP + bool + # # arch/ can define HAVE_HARDLOCKUP_DETECTOR_ARCH to provide their own hard # lockup detector rather than the perf based detector. -- cgit v1.2.3-71-gd317 From a0917e0bc6efc05834c0c1eafebd579a9c75e6e9 Mon Sep 17 00:00:00 2001 From: Matthew Dawson Date: Fri, 18 Aug 2017 15:04:54 -0400 Subject: datagram: When peeking datagrams with offset < 0 don't skip empty skbs Due to commit e6afc8ace6dd5cef5e812f26c72579da8806f5ac ("udp: remove headers from UDP packets before queueing"), when udp packets are being peeked the requested extra offset is always 0 as there is no need to skip the udp header. However, when the offset is 0 and the next skb is of length 0, it is only returned once. The behaviour can be seen with the following python script: from socket import *; f=socket(AF_INET6, SOCK_DGRAM | SOCK_NONBLOCK, 0); g=socket(AF_INET6, SOCK_DGRAM | SOCK_NONBLOCK, 0); f.bind(('::', 0)); addr=('::1', f.getsockname()[1]); g.sendto(b'', addr) g.sendto(b'b', addr) print(f.recvfrom(10, MSG_PEEK)); print(f.recvfrom(10, MSG_PEEK)); Where the expected output should be the empty string twice. Instead, make sk_peek_offset return negative values, and pass those values to __skb_try_recv_datagram/__skb_try_recv_from_queue. If the passed offset to __skb_try_recv_from_queue is negative, the checked skb is never skipped. __skb_try_recv_from_queue will then ensure the offset is reset back to 0 if a peek is requested without an offset, unless no packets are found. Also simplify the if condition in __skb_try_recv_from_queue. If _off is greater then 0, and off is greater then or equal to skb->len, then (_off || skb->len) must always be true assuming skb->len >= 0 is always true. Also remove a redundant check around a call to sk_peek_offset in af_unix.c, as it double checked if MSG_PEEK was set in the flags. V2: - Moved the negative fixup into __skb_try_recv_from_queue, and remove now redundant checks - Fix peeking in udp{,v6}_recvmsg to report the right value when the offset is 0 V3: - Marked new branch in __skb_try_recv_from_queue as unlikely. Signed-off-by: Matthew Dawson Acked-by: Willem de Bruijn Signed-off-by: David S. Miller --- include/net/sock.h | 4 +--- net/core/datagram.c | 12 +++++++++--- net/ipv4/udp.c | 3 ++- net/ipv6/udp.c | 3 ++- net/unix/af_unix.c | 5 +---- 5 files changed, 15 insertions(+), 12 deletions(-) (limited to 'include') diff --git a/include/net/sock.h b/include/net/sock.h index 7c0632c7e870..aeeec62992ca 100644 --- a/include/net/sock.h +++ b/include/net/sock.h @@ -507,9 +507,7 @@ int sk_set_peek_off(struct sock *sk, int val); static inline int sk_peek_offset(struct sock *sk, int flags) { if (unlikely(flags & MSG_PEEK)) { - s32 off = READ_ONCE(sk->sk_peek_off); - if (off >= 0) - return off; + return READ_ONCE(sk->sk_peek_off); } return 0; diff --git a/net/core/datagram.c b/net/core/datagram.c index ee5647bd91b3..a21ca8dee5ea 100644 --- a/net/core/datagram.c +++ b/net/core/datagram.c @@ -169,14 +169,20 @@ struct sk_buff *__skb_try_recv_from_queue(struct sock *sk, int *peeked, int *off, int *err, struct sk_buff **last) { + bool peek_at_off = false; struct sk_buff *skb; - int _off = *off; + int _off = 0; + + if (unlikely(flags & MSG_PEEK && *off >= 0)) { + peek_at_off = true; + _off = *off; + } *last = queue->prev; skb_queue_walk(queue, skb) { if (flags & MSG_PEEK) { - if (_off >= skb->len && (skb->len || _off || - skb->peeked)) { + if (peek_at_off && _off >= skb->len && + (_off || skb->peeked)) { _off -= skb->len; continue; } diff --git a/net/ipv4/udp.c b/net/ipv4/udp.c index a7c804f73990..cd1d044a7fa5 100644 --- a/net/ipv4/udp.c +++ b/net/ipv4/udp.c @@ -1574,7 +1574,8 @@ int udp_recvmsg(struct sock *sk, struct msghdr *msg, size_t len, int noblock, return ip_recv_error(sk, msg, len, addr_len); try_again: - peeking = off = sk_peek_offset(sk, flags); + peeking = flags & MSG_PEEK; + off = sk_peek_offset(sk, flags); skb = __skb_recv_udp(sk, flags, noblock, &peeked, &off, &err); if (!skb) return err; diff --git a/net/ipv6/udp.c b/net/ipv6/udp.c index 578142b7ca3e..20039c8501eb 100644 --- a/net/ipv6/udp.c +++ b/net/ipv6/udp.c @@ -362,7 +362,8 @@ int udpv6_recvmsg(struct sock *sk, struct msghdr *msg, size_t len, return ipv6_recv_rxpmtu(sk, msg, len, addr_len); try_again: - peeking = off = sk_peek_offset(sk, flags); + peeking = flags & MSG_PEEK; + off = sk_peek_offset(sk, flags); skb = __skb_recv_udp(sk, flags, noblock, &peeked, &off, &err); if (!skb) return err; diff --git a/net/unix/af_unix.c b/net/unix/af_unix.c index 7b52a380d710..be8982b4f8c0 100644 --- a/net/unix/af_unix.c +++ b/net/unix/af_unix.c @@ -2304,10 +2304,7 @@ static int unix_stream_read_generic(struct unix_stream_read_state *state, */ mutex_lock(&u->iolock); - if (flags & MSG_PEEK) - skip = sk_peek_offset(sk, flags); - else - skip = 0; + skip = max(sk_peek_offset(sk, flags), 0); do { int chunk; -- cgit v1.2.3-71-gd317 From 739f79fc9db1b38f96b5a5109b247a650fbebf6d Mon Sep 17 00:00:00 2001 From: Johannes Weiner Date: Fri, 18 Aug 2017 15:15:48 -0700 Subject: mm: memcontrol: fix NULL pointer crash in test_clear_page_writeback() Jaegeuk and Brad report a NULL pointer crash when writeback ending tries to update the memcg stats: BUG: unable to handle kernel NULL pointer dereference at 00000000000003b0 IP: test_clear_page_writeback+0x12e/0x2c0 [...] RIP: 0010:test_clear_page_writeback+0x12e/0x2c0 Call Trace: end_page_writeback+0x47/0x70 f2fs_write_end_io+0x76/0x180 [f2fs] bio_endio+0x9f/0x120 blk_update_request+0xa8/0x2f0 scsi_end_request+0x39/0x1d0 scsi_io_completion+0x211/0x690 scsi_finish_command+0xd9/0x120 scsi_softirq_done+0x127/0x150 __blk_mq_complete_request_remote+0x13/0x20 flush_smp_call_function_queue+0x56/0x110 generic_smp_call_function_single_interrupt+0x13/0x30 smp_call_function_single_interrupt+0x27/0x40 call_function_single_interrupt+0x89/0x90 RIP: 0010:native_safe_halt+0x6/0x10 (gdb) l *(test_clear_page_writeback+0x12e) 0xffffffff811bae3e is in test_clear_page_writeback (./include/linux/memcontrol.h:619). 614 mod_node_page_state(page_pgdat(page), idx, val); 615 if (mem_cgroup_disabled() || !page->mem_cgroup) 616 return; 617 mod_memcg_state(page->mem_cgroup, idx, val); 618 pn = page->mem_cgroup->nodeinfo[page_to_nid(page)]; 619 this_cpu_add(pn->lruvec_stat->count[idx], val); 620 } 621 622 unsigned long mem_cgroup_soft_limit_reclaim(pg_data_t *pgdat, int order, 623 gfp_t gfp_mask, The issue is that writeback doesn't hold a page reference and the page might get freed after PG_writeback is cleared (and the mapping is unlocked) in test_clear_page_writeback(). The stat functions looking up the page's node or zone are safe, as those attributes are static across allocation and free cycles. But page->mem_cgroup is not, and it will get cleared if we race with truncation or migration. It appears this race window has been around for a while, but less likely to trigger when the memcg stats were updated first thing after PG_writeback is cleared. Recent changes reshuffled this code to update the global node stats before the memcg ones, though, stretching the race window out to an extent where people can reproduce the problem. Update test_clear_page_writeback() to look up and pin page->mem_cgroup before clearing PG_writeback, then not use that pointer afterward. It is a partial revert of 62cccb8c8e7a ("mm: simplify lock_page_memcg()") but leaves the pageref-holding callsites that aren't affected alone. Link: http://lkml.kernel.org/r/20170809183825.GA26387@cmpxchg.org Fixes: 62cccb8c8e7a ("mm: simplify lock_page_memcg()") Signed-off-by: Johannes Weiner Reported-by: Jaegeuk Kim Tested-by: Jaegeuk Kim Reported-by: Bradley Bolen Tested-by: Brad Bolen Cc: Vladimir Davydov Cc: Michal Hocko Cc: [4.6+] Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/linux/memcontrol.h | 10 ++++++++-- mm/memcontrol.c | 43 +++++++++++++++++++++++++++++++------------ mm/page-writeback.c | 15 ++++++++++++--- 3 files changed, 51 insertions(+), 17 deletions(-) (limited to 'include') diff --git a/include/linux/memcontrol.h b/include/linux/memcontrol.h index 3914e3dd6168..9b15a4bcfa77 100644 --- a/include/linux/memcontrol.h +++ b/include/linux/memcontrol.h @@ -484,7 +484,8 @@ bool mem_cgroup_oom_synchronize(bool wait); extern int do_swap_account; #endif -void lock_page_memcg(struct page *page); +struct mem_cgroup *lock_page_memcg(struct page *page); +void __unlock_page_memcg(struct mem_cgroup *memcg); void unlock_page_memcg(struct page *page); static inline unsigned long memcg_page_state(struct mem_cgroup *memcg, @@ -809,7 +810,12 @@ mem_cgroup_print_oom_info(struct mem_cgroup *memcg, struct task_struct *p) { } -static inline void lock_page_memcg(struct page *page) +static inline struct mem_cgroup *lock_page_memcg(struct page *page) +{ + return NULL; +} + +static inline void __unlock_page_memcg(struct mem_cgroup *memcg) { } diff --git a/mm/memcontrol.c b/mm/memcontrol.c index 3df3c04d73ab..e09741af816f 100644 --- a/mm/memcontrol.c +++ b/mm/memcontrol.c @@ -1611,9 +1611,13 @@ cleanup: * @page: the page * * This function protects unlocked LRU pages from being moved to - * another cgroup and stabilizes their page->mem_cgroup binding. + * another cgroup. + * + * It ensures lifetime of the returned memcg. Caller is responsible + * for the lifetime of the page; __unlock_page_memcg() is available + * when @page might get freed inside the locked section. */ -void lock_page_memcg(struct page *page) +struct mem_cgroup *lock_page_memcg(struct page *page) { struct mem_cgroup *memcg; unsigned long flags; @@ -1622,18 +1626,24 @@ void lock_page_memcg(struct page *page) * The RCU lock is held throughout the transaction. The fast * path can get away without acquiring the memcg->move_lock * because page moving starts with an RCU grace period. - */ + * + * The RCU lock also protects the memcg from being freed when + * the page state that is going to change is the only thing + * preventing the page itself from being freed. E.g. writeback + * doesn't hold a page reference and relies on PG_writeback to + * keep off truncation, migration and so forth. + */ rcu_read_lock(); if (mem_cgroup_disabled()) - return; + return NULL; again: memcg = page->mem_cgroup; if (unlikely(!memcg)) - return; + return NULL; if (atomic_read(&memcg->moving_account) <= 0) - return; + return memcg; spin_lock_irqsave(&memcg->move_lock, flags); if (memcg != page->mem_cgroup) { @@ -1649,18 +1659,18 @@ again: memcg->move_lock_task = current; memcg->move_lock_flags = flags; - return; + return memcg; } EXPORT_SYMBOL(lock_page_memcg); /** - * unlock_page_memcg - unlock a page->mem_cgroup binding - * @page: the page + * __unlock_page_memcg - unlock and unpin a memcg + * @memcg: the memcg + * + * Unlock and unpin a memcg returned by lock_page_memcg(). */ -void unlock_page_memcg(struct page *page) +void __unlock_page_memcg(struct mem_cgroup *memcg) { - struct mem_cgroup *memcg = page->mem_cgroup; - if (memcg && memcg->move_lock_task == current) { unsigned long flags = memcg->move_lock_flags; @@ -1672,6 +1682,15 @@ void unlock_page_memcg(struct page *page) rcu_read_unlock(); } + +/** + * unlock_page_memcg - unlock a page->mem_cgroup binding + * @page: the page + */ +void unlock_page_memcg(struct page *page) +{ + __unlock_page_memcg(page->mem_cgroup); +} EXPORT_SYMBOL(unlock_page_memcg); /* diff --git a/mm/page-writeback.c b/mm/page-writeback.c index 96e93b214d31..bf050ab025b7 100644 --- a/mm/page-writeback.c +++ b/mm/page-writeback.c @@ -2724,9 +2724,12 @@ EXPORT_SYMBOL(clear_page_dirty_for_io); int test_clear_page_writeback(struct page *page) { struct address_space *mapping = page_mapping(page); + struct mem_cgroup *memcg; + struct lruvec *lruvec; int ret; - lock_page_memcg(page); + memcg = lock_page_memcg(page); + lruvec = mem_cgroup_page_lruvec(page, page_pgdat(page)); if (mapping && mapping_use_writeback_tags(mapping)) { struct inode *inode = mapping->host; struct backing_dev_info *bdi = inode_to_bdi(inode); @@ -2754,12 +2757,18 @@ int test_clear_page_writeback(struct page *page) } else { ret = TestClearPageWriteback(page); } + /* + * NOTE: Page might be free now! Writeback doesn't hold a page + * reference on its own, it relies on truncation to wait for + * the clearing of PG_writeback. The below can only access + * page state that is static across allocation cycles. + */ if (ret) { - dec_lruvec_page_state(page, NR_WRITEBACK); + dec_lruvec_state(lruvec, NR_WRITEBACK); dec_zone_page_state(page, NR_ZONE_WRITE_PENDING); inc_node_page_state(page, NR_WRITTEN); } - unlock_page_memcg(page); + __unlock_page_memcg(memcg); return ret; } -- cgit v1.2.3-71-gd317 From 8ada92799ec4de00f4bc0f10b1ededa256c1ab22 Mon Sep 17 00:00:00 2001 From: "Luis R. Rodriguez" Date: Fri, 18 Aug 2017 15:15:55 -0700 Subject: wait: add wait_event_killable_timeout() These are the few pending fixes I have queued up for v4.13-final. One is a a generic regression fix for recursive loops on kmod and the other one is a trivial print out correction. During the v4.13 development we assumed that recursive kmod loops were no longer possible. Clearly that is not true. The regression fix makes use of a new killable wait. We use a killable wait to be paranoid in how signals might be sent to modprobe and only accept a proper SIGKILL. The signal will only be available to userspace to issue *iff* a thread has already entered a wait state, and that happens only if we've already throttled after 50 kmod threads have been hit. Note that although it may seem excessive to trigger a failure afer 5 seconds if all kmod thread remain busy, prior to the series of changes that went into v4.13 we would actually *always* fatally fail any request which came in if the limit was already reached. The new waiting implemented in v4.13 actually gives us *more* breathing room -- the wait for 5 seconds is a wait for *any* kmod thread to finish. We give up and fail *iff* no kmod thread has finished and they're *all* running straight for 5 consecutive seconds. If 50 kmod threads are running consecutively for 5 seconds something else must be really bad. Recursive loops with kmod are bad but they're also hard to implement properly as a selftest without currently fooling current userspace tools like kmod [1]. For instance kmod will complain when you run depmod if it finds a recursive loop with symbol dependency between modules as such this type of recursive loop cannot go upstream as the modules_install target will fail after running depmod. These tests already exist on userspace kmod upstream though (refer to the testsuite/module-playground/mod-loop-*.c files). The same is not true if request_module() is used though, or worst if aliases are used. Likewise the issue with 64-bit kernels booting 32-bit userspace without a binfmt handler built-in is also currently not detected and proactively avoided by userspace kmod tools, or kconfig for all architectures. Although we could complain in the kernel when some of these individual recursive issues creep up, proactively avoiding these situations in userspace at build time is what we should keep striving for. Lastly, since recursive loops could happen with kmod it may mean recursive loops may also be possible with other kernel usermode helpers, this should be investigated and long term if we can come up with a more sensible generic solution even better! [0] https://git.kernel.org/pub/scm/linux/kernel/git/mcgrof/linux.git/log/?h=20170809-kmod-for-v4.13-final [1] https://git.kernel.org/pub/scm/utils/kernel/kmod/kmod.git This patch (of 3): This wait is similar to wait_event_interruptible_timeout() but only accepts SIGKILL interrupt signal. Other signals are ignored. Link: http://lkml.kernel.org/r/20170809234635.13443-2-mcgrof@kernel.org Signed-off-by: Luis R. Rodriguez Acked-by: Peter Zijlstra (Intel) Cc: Ingo Molnar Cc: Kees Cook Cc: Dmitry Torokhov Cc: Jessica Yu Cc: Rusty Russell Cc: Michal Marek Cc: Petr Mladek Cc: Miroslav Benes Cc: Josh Poimboeuf Cc: "Eric W. Biederman" Cc: Shuah Khan Cc: Matt Redfearn Cc: Dan Carpenter Cc: Colin Ian King Cc: Daniel Mentz Cc: David Binderman Cc: Matt Redfearn Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/linux/wait.h | 37 +++++++++++++++++++++++++++++++++++++ 1 file changed, 37 insertions(+) (limited to 'include') diff --git a/include/linux/wait.h b/include/linux/wait.h index 5b74e36c0ca8..dc19880c02f5 100644 --- a/include/linux/wait.h +++ b/include/linux/wait.h @@ -757,6 +757,43 @@ extern int do_wait_intr_irq(wait_queue_head_t *, wait_queue_entry_t *); __ret; \ }) +#define __wait_event_killable_timeout(wq_head, condition, timeout) \ + ___wait_event(wq_head, ___wait_cond_timeout(condition), \ + TASK_KILLABLE, 0, timeout, \ + __ret = schedule_timeout(__ret)) + +/** + * wait_event_killable_timeout - sleep until a condition gets true or a timeout elapses + * @wq_head: the waitqueue to wait on + * @condition: a C expression for the event to wait for + * @timeout: timeout, in jiffies + * + * The process is put to sleep (TASK_KILLABLE) until the + * @condition evaluates to true or a kill signal is received. + * The @condition is checked each time the waitqueue @wq_head is woken up. + * + * wake_up() has to be called after changing any variable that could + * change the result of the wait condition. + * + * Returns: + * 0 if the @condition evaluated to %false after the @timeout elapsed, + * 1 if the @condition evaluated to %true after the @timeout elapsed, + * the remaining jiffies (at least 1) if the @condition evaluated + * to %true before the @timeout elapsed, or -%ERESTARTSYS if it was + * interrupted by a kill signal. + * + * Only kill signals interrupt this process. + */ +#define wait_event_killable_timeout(wq_head, condition, timeout) \ +({ \ + long __ret = timeout; \ + might_sleep(); \ + if (!___wait_cond_timeout(condition)) \ + __ret = __wait_event_killable_timeout(wq_head, \ + condition, timeout); \ + __ret; \ +}) + #define __wait_event_lock_irq(wq_head, condition, lock, cmd) \ (void)___wait_event(wq_head, condition, TASK_UNINTERRUPTIBLE, 0, 0, \ -- cgit v1.2.3-71-gd317 From 3010f876500f9ba921afaeccec30c45ca6584dc8 Mon Sep 17 00:00:00 2001 From: Pavel Tatashin Date: Fri, 18 Aug 2017 15:16:05 -0700 Subject: mm: discard memblock data later There is existing use after free bug when deferred struct pages are enabled: The memblock_add() allocates memory for the memory array if more than 128 entries are needed. See comment in e820__memblock_setup(): * The bootstrap memblock region count maximum is 128 entries * (INIT_MEMBLOCK_REGIONS), but EFI might pass us more E820 entries * than that - so allow memblock resizing. This memblock memory is freed here: free_low_memory_core_early() We access the freed memblock.memory later in boot when deferred pages are initialized in this path: deferred_init_memmap() for_each_mem_pfn_range() __next_mem_pfn_range() type = &memblock.memory; One possible explanation for why this use-after-free hasn't been hit before is that the limit of INIT_MEMBLOCK_REGIONS has never been exceeded at least on systems where deferred struct pages were enabled. Tested by reducing INIT_MEMBLOCK_REGIONS down to 4 from the current 128, and verifying in qemu that this code is getting excuted and that the freed pages are sane. Link: http://lkml.kernel.org/r/1502485554-318703-2-git-send-email-pasha.tatashin@oracle.com Fixes: 7e18adb4f80b ("mm: meminit: initialise remaining struct pages in parallel with kswapd") Signed-off-by: Pavel Tatashin Reviewed-by: Steven Sistare Reviewed-by: Daniel Jordan Reviewed-by: Bob Picco Acked-by: Michal Hocko Cc: Mel Gorman Cc: Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/linux/memblock.h | 6 ++++-- mm/memblock.c | 38 +++++++++++++++++--------------------- mm/nobootmem.c | 16 ---------------- mm/page_alloc.c | 4 ++++ 4 files changed, 25 insertions(+), 39 deletions(-) (limited to 'include') diff --git a/include/linux/memblock.h b/include/linux/memblock.h index 77d427974f57..bae11c7e7bf3 100644 --- a/include/linux/memblock.h +++ b/include/linux/memblock.h @@ -61,6 +61,7 @@ extern int memblock_debug; #ifdef CONFIG_ARCH_DISCARD_MEMBLOCK #define __init_memblock __meminit #define __initdata_memblock __meminitdata +void memblock_discard(void); #else #define __init_memblock #define __initdata_memblock @@ -74,8 +75,6 @@ phys_addr_t memblock_find_in_range_node(phys_addr_t size, phys_addr_t align, int nid, ulong flags); phys_addr_t memblock_find_in_range(phys_addr_t start, phys_addr_t end, phys_addr_t size, phys_addr_t align); -phys_addr_t get_allocated_memblock_reserved_regions_info(phys_addr_t *addr); -phys_addr_t get_allocated_memblock_memory_regions_info(phys_addr_t *addr); void memblock_allow_resize(void); int memblock_add_node(phys_addr_t base, phys_addr_t size, int nid); int memblock_add(phys_addr_t base, phys_addr_t size); @@ -110,6 +109,9 @@ void __next_mem_range_rev(u64 *idx, int nid, ulong flags, void __next_reserved_mem_region(u64 *idx, phys_addr_t *out_start, phys_addr_t *out_end); +void __memblock_free_early(phys_addr_t base, phys_addr_t size); +void __memblock_free_late(phys_addr_t base, phys_addr_t size); + /** * for_each_mem_range - iterate through memblock areas from type_a and not * included in type_b. Or just type_a if type_b is NULL. diff --git a/mm/memblock.c b/mm/memblock.c index 2cb25fe4452c..bf14aea6ab70 100644 --- a/mm/memblock.c +++ b/mm/memblock.c @@ -285,31 +285,27 @@ static void __init_memblock memblock_remove_region(struct memblock_type *type, u } #ifdef CONFIG_ARCH_DISCARD_MEMBLOCK - -phys_addr_t __init_memblock get_allocated_memblock_reserved_regions_info( - phys_addr_t *addr) -{ - if (memblock.reserved.regions == memblock_reserved_init_regions) - return 0; - - *addr = __pa(memblock.reserved.regions); - - return PAGE_ALIGN(sizeof(struct memblock_region) * - memblock.reserved.max); -} - -phys_addr_t __init_memblock get_allocated_memblock_memory_regions_info( - phys_addr_t *addr) +/** + * Discard memory and reserved arrays if they were allocated + */ +void __init memblock_discard(void) { - if (memblock.memory.regions == memblock_memory_init_regions) - return 0; + phys_addr_t addr, size; - *addr = __pa(memblock.memory.regions); + if (memblock.reserved.regions != memblock_reserved_init_regions) { + addr = __pa(memblock.reserved.regions); + size = PAGE_ALIGN(sizeof(struct memblock_region) * + memblock.reserved.max); + __memblock_free_late(addr, size); + } - return PAGE_ALIGN(sizeof(struct memblock_region) * - memblock.memory.max); + if (memblock.memory.regions == memblock_memory_init_regions) { + addr = __pa(memblock.memory.regions); + size = PAGE_ALIGN(sizeof(struct memblock_region) * + memblock.memory.max); + __memblock_free_late(addr, size); + } } - #endif /** diff --git a/mm/nobootmem.c b/mm/nobootmem.c index 36454d0f96ee..3637809a18d0 100644 --- a/mm/nobootmem.c +++ b/mm/nobootmem.c @@ -146,22 +146,6 @@ static unsigned long __init free_low_memory_core_early(void) NULL) count += __free_memory_core(start, end); -#ifdef CONFIG_ARCH_DISCARD_MEMBLOCK - { - phys_addr_t size; - - /* Free memblock.reserved array if it was allocated */ - size = get_allocated_memblock_reserved_regions_info(&start); - if (size) - count += __free_memory_core(start, start + size); - - /* Free memblock.memory array if it was allocated */ - size = get_allocated_memblock_memory_regions_info(&start); - if (size) - count += __free_memory_core(start, start + size); - } -#endif - return count; } diff --git a/mm/page_alloc.c b/mm/page_alloc.c index 6d00f746c2fd..1bad301820c7 100644 --- a/mm/page_alloc.c +++ b/mm/page_alloc.c @@ -1584,6 +1584,10 @@ void __init page_alloc_init_late(void) /* Reinit limits that are based on free pages after the kernel is up */ files_maxfiles_init(); #endif +#ifdef CONFIG_ARCH_DISCARD_MEMBLOCK + /* Discard memblock private memory */ + memblock_discard(); +#endif for_each_populated_zone(zone) set_zone_contiguous(zone); -- cgit v1.2.3-71-gd317 From 6b31d5955cb29a51c5baffee382f213d75e98fb8 Mon Sep 17 00:00:00 2001 From: Michal Hocko Date: Fri, 18 Aug 2017 15:16:15 -0700 Subject: mm, oom: fix potential data corruption when oom_reaper races with writer Wenwei Tao has noticed that our current assumption that the oom victim is dying and never doing any visible changes after it dies, and so the oom_reaper can tear it down, is not entirely true. __task_will_free_mem consider a task dying when SIGNAL_GROUP_EXIT is set but do_group_exit sends SIGKILL to all threads _after_ the flag is set. So there is a race window when some threads won't have fatal_signal_pending while the oom_reaper could start unmapping the address space. Moreover some paths might not check for fatal signals before each PF/g-u-p/copy_from_user. We already have a protection for oom_reaper vs. PF races by checking MMF_UNSTABLE. This has been, however, checked only for kernel threads (use_mm users) which can outlive the oom victim. A simple fix would be to extend the current check in handle_mm_fault for all tasks but that wouldn't be sufficient because the current check assumes that a kernel thread would bail out after EFAULT from get_user*/copy_from_user and never re-read the same address which would succeed because the PF path has established page tables already. This seems to be the case for the only existing use_mm user currently (virtio driver) but it is rather fragile in general. This is even more fragile in general for more complex paths such as generic_perform_write which can re-read the same address more times (e.g. iov_iter_copy_from_user_atomic to fail and then iov_iter_fault_in_readable on retry). Therefore we have to implement MMF_UNSTABLE protection in a robust way and never make a potentially corrupted content visible. That requires to hook deeper into the PF path and check for the flag _every time_ before a pte for anonymous memory is established (that means all !VM_SHARED mappings). The corruption can be triggered artificially (http://lkml.kernel.org/r/201708040646.v746kkhC024636@www262.sakura.ne.jp) but there doesn't seem to be any real life bug report. The race window should be quite tight to trigger most of the time. Link: http://lkml.kernel.org/r/20170807113839.16695-3-mhocko@kernel.org Fixes: aac453635549 ("mm, oom: introduce oom reaper") Signed-off-by: Michal Hocko Reported-by: Wenwei Tao Tested-by: Tetsuo Handa Cc: "Kirill A. Shutemov" Cc: Andrea Argangeli Cc: David Rientjes Cc: Oleg Nesterov Cc: Tetsuo Handa Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/linux/oom.h | 22 ++++++++++++++++++++++ mm/huge_memory.c | 30 ++++++++++++++++++++++-------- mm/memory.c | 46 ++++++++++++++++++++-------------------------- 3 files changed, 64 insertions(+), 34 deletions(-) (limited to 'include') diff --git a/include/linux/oom.h b/include/linux/oom.h index 8a266e2be5a6..76aac4ce39bc 100644 --- a/include/linux/oom.h +++ b/include/linux/oom.h @@ -6,6 +6,8 @@ #include #include #include +#include /* MMF_* */ +#include /* VM_FAULT* */ struct zonelist; struct notifier_block; @@ -63,6 +65,26 @@ static inline bool tsk_is_oom_victim(struct task_struct * tsk) return tsk->signal->oom_mm; } +/* + * Checks whether a page fault on the given mm is still reliable. + * This is no longer true if the oom reaper started to reap the + * address space which is reflected by MMF_UNSTABLE flag set in + * the mm. At that moment any !shared mapping would lose the content + * and could cause a memory corruption (zero pages instead of the + * original content). + * + * User should call this before establishing a page table entry for + * a !shared mapping and under the proper page table lock. + * + * Return 0 when the PF is safe VM_FAULT_SIGBUS otherwise. + */ +static inline int check_stable_address_space(struct mm_struct *mm) +{ + if (unlikely(test_bit(MMF_UNSTABLE, &mm->flags))) + return VM_FAULT_SIGBUS; + return 0; +} + extern unsigned long oom_badness(struct task_struct *p, struct mem_cgroup *memcg, const nodemask_t *nodemask, unsigned long totalpages); diff --git a/mm/huge_memory.c b/mm/huge_memory.c index 216114f6ef0b..90731e3b7e58 100644 --- a/mm/huge_memory.c +++ b/mm/huge_memory.c @@ -32,6 +32,7 @@ #include #include #include +#include #include #include @@ -550,6 +551,7 @@ static int __do_huge_pmd_anonymous_page(struct vm_fault *vmf, struct page *page, struct mem_cgroup *memcg; pgtable_t pgtable; unsigned long haddr = vmf->address & HPAGE_PMD_MASK; + int ret = 0; VM_BUG_ON_PAGE(!PageCompound(page), page); @@ -561,9 +563,8 @@ static int __do_huge_pmd_anonymous_page(struct vm_fault *vmf, struct page *page, pgtable = pte_alloc_one(vma->vm_mm, haddr); if (unlikely(!pgtable)) { - mem_cgroup_cancel_charge(page, memcg, true); - put_page(page); - return VM_FAULT_OOM; + ret = VM_FAULT_OOM; + goto release; } clear_huge_page(page, haddr, HPAGE_PMD_NR); @@ -576,13 +577,14 @@ static int __do_huge_pmd_anonymous_page(struct vm_fault *vmf, struct page *page, vmf->ptl = pmd_lock(vma->vm_mm, vmf->pmd); if (unlikely(!pmd_none(*vmf->pmd))) { - spin_unlock(vmf->ptl); - mem_cgroup_cancel_charge(page, memcg, true); - put_page(page); - pte_free(vma->vm_mm, pgtable); + goto unlock_release; } else { pmd_t entry; + ret = check_stable_address_space(vma->vm_mm); + if (ret) + goto unlock_release; + /* Deliver the page fault to userland */ if (userfaultfd_missing(vma)) { int ret; @@ -610,6 +612,15 @@ static int __do_huge_pmd_anonymous_page(struct vm_fault *vmf, struct page *page, } return 0; +unlock_release: + spin_unlock(vmf->ptl); +release: + if (pgtable) + pte_free(vma->vm_mm, pgtable); + mem_cgroup_cancel_charge(page, memcg, true); + put_page(page); + return ret; + } /* @@ -688,7 +699,10 @@ int do_huge_pmd_anonymous_page(struct vm_fault *vmf) ret = 0; set = false; if (pmd_none(*vmf->pmd)) { - if (userfaultfd_missing(vma)) { + ret = check_stable_address_space(vma->vm_mm); + if (ret) { + spin_unlock(vmf->ptl); + } else if (userfaultfd_missing(vma)) { spin_unlock(vmf->ptl); ret = handle_userfault(vmf, VM_UFFD_MISSING); VM_BUG_ON(ret & VM_FAULT_FALLBACK); diff --git a/mm/memory.c b/mm/memory.c index c717b5bcc80e..fe2fba27ded2 100644 --- a/mm/memory.c +++ b/mm/memory.c @@ -68,6 +68,7 @@ #include #include #include +#include #include #include @@ -2893,6 +2894,7 @@ static int do_anonymous_page(struct vm_fault *vmf) struct vm_area_struct *vma = vmf->vma; struct mem_cgroup *memcg; struct page *page; + int ret = 0; pte_t entry; /* File mapping without ->vm_ops ? */ @@ -2925,6 +2927,9 @@ static int do_anonymous_page(struct vm_fault *vmf) vmf->address, &vmf->ptl); if (!pte_none(*vmf->pte)) goto unlock; + ret = check_stable_address_space(vma->vm_mm); + if (ret) + goto unlock; /* Deliver the page fault to userland, check inside PT lock */ if (userfaultfd_missing(vma)) { pte_unmap_unlock(vmf->pte, vmf->ptl); @@ -2959,6 +2964,10 @@ static int do_anonymous_page(struct vm_fault *vmf) if (!pte_none(*vmf->pte)) goto release; + ret = check_stable_address_space(vma->vm_mm); + if (ret) + goto release; + /* Deliver the page fault to userland, check inside PT lock */ if (userfaultfd_missing(vma)) { pte_unmap_unlock(vmf->pte, vmf->ptl); @@ -2978,7 +2987,7 @@ setpte: update_mmu_cache(vma, vmf->address, vmf->pte); unlock: pte_unmap_unlock(vmf->pte, vmf->ptl); - return 0; + return ret; release: mem_cgroup_cancel_charge(page, memcg, false); put_page(page); @@ -3252,7 +3261,7 @@ int alloc_set_pte(struct vm_fault *vmf, struct mem_cgroup *memcg, int finish_fault(struct vm_fault *vmf) { struct page *page; - int ret; + int ret = 0; /* Did we COW the page? */ if ((vmf->flags & FAULT_FLAG_WRITE) && @@ -3260,7 +3269,15 @@ int finish_fault(struct vm_fault *vmf) page = vmf->cow_page; else page = vmf->page; - ret = alloc_set_pte(vmf, vmf->memcg, page); + + /* + * check even for read faults because we might have lost our CoWed + * page + */ + if (!(vmf->vma->vm_flags & VM_SHARED)) + ret = check_stable_address_space(vmf->vma->vm_mm); + if (!ret) + ret = alloc_set_pte(vmf, vmf->memcg, page); if (vmf->pte) pte_unmap_unlock(vmf->pte, vmf->ptl); return ret; @@ -3900,29 +3917,6 @@ int handle_mm_fault(struct vm_area_struct *vma, unsigned long address, mem_cgroup_oom_synchronize(false); } - /* - * This mm has been already reaped by the oom reaper and so the - * refault cannot be trusted in general. Anonymous refaults would - * lose data and give a zero page instead e.g. This is especially - * problem for use_mm() because regular tasks will just die and - * the corrupted data will not be visible anywhere while kthread - * will outlive the oom victim and potentially propagate the date - * further. - */ - if (unlikely((current->flags & PF_KTHREAD) && !(ret & VM_FAULT_ERROR) - && test_bit(MMF_UNSTABLE, &vma->vm_mm->flags))) { - - /* - * We are going to enforce SIGBUS but the PF path might have - * dropped the mmap_sem already so take it again so that - * we do not break expectations of all arch specific PF paths - * and g-u-p - */ - if (ret & VM_FAULT_RETRY) - down_read(&vma->vm_mm->mmap_sem); - ret = VM_FAULT_SIGBUS; - } - return ret; } EXPORT_SYMBOL_GPL(handle_mm_fault); -- cgit v1.2.3-71-gd317 From f00fd7ae4f409abb7b2e5d099248832548199f0c Mon Sep 17 00:00:00 2001 From: Jonathan Corbet Date: Sun, 30 Jul 2017 16:14:42 -0600 Subject: PATCH] iio: Fix some documentation warnings The kerneldoc description for the trig_readonly field of struct iio_dev lacked a colon, leading to this doc build warning: ./include/linux/iio/iio.h:603: warning: No description found for parameter 'trig_readonly' A similar issue for iio_trigger_set_immutable() in trigger.h yielded: ./include/linux/iio/trigger.h:151: warning: No description found for parameter 'indio_dev' ./include/linux/iio/trigger.h:151: warning: No description found for parameter 'trig' Fix the formatting and silence the warnings. Signed-off-by: Jonathan Corbet Signed-off-by: Jonathan Cameron --- include/linux/iio/iio.h | 2 +- include/linux/iio/trigger.h | 4 ++-- 2 files changed, 3 insertions(+), 3 deletions(-) (limited to 'include') diff --git a/include/linux/iio/iio.h b/include/linux/iio/iio.h index d68bec297a45..c380daa40c0e 100644 --- a/include/linux/iio/iio.h +++ b/include/linux/iio/iio.h @@ -535,7 +535,7 @@ struct iio_buffer_setup_ops { * @scan_timestamp: [INTERN] set if any buffers have requested timestamp * @scan_index_timestamp:[INTERN] cache of the index to the timestamp * @trig: [INTERN] current device trigger (buffer modes) - * @trig_readonly [INTERN] mark the current trigger immutable + * @trig_readonly: [INTERN] mark the current trigger immutable * @pollfunc: [DRIVER] function run on trigger being received * @pollfunc_event: [DRIVER] function run on events trigger being received * @channels: [DRIVER] channel specification structure table diff --git a/include/linux/iio/trigger.h b/include/linux/iio/trigger.h index ea08302f2d7b..7142d8d6e470 100644 --- a/include/linux/iio/trigger.h +++ b/include/linux/iio/trigger.h @@ -144,8 +144,8 @@ void devm_iio_trigger_unregister(struct device *dev, /** * iio_trigger_set_immutable() - set an immutable trigger on destination * - * @indio_dev - IIO device structure containing the device - * @trig - trigger to assign to device + * @indio_dev: IIO device structure containing the device + * @trig: trigger to assign to device * **/ int iio_trigger_set_immutable(struct iio_dev *indio_dev, struct iio_trigger *trig); -- cgit v1.2.3-71-gd317 From 68a66d149a8c78ec6720f268597302883e48e9fa Mon Sep 17 00:00:00 2001 From: Konstantin Khlebnikov Date: Sat, 19 Aug 2017 15:37:07 +0300 Subject: net_sched: fix order of queue length updates in qdisc_replace() This important to call qdisc_tree_reduce_backlog() after changing queue length. Parent qdisc should deactivate class in ->qlen_notify() called from qdisc_tree_reduce_backlog() but this happens only if qdisc->q.qlen in zero. Missed class deactivations leads to crashes/warnings at picking packets from empty qdisc and corrupting state at reactivating this class in future. Signed-off-by: Konstantin Khlebnikov Fixes: 86a7996cc8a0 ("net_sched: introduce qdisc_replace() helper") Acked-by: Cong Wang Signed-off-by: David S. Miller --- include/net/sch_generic.h | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) (limited to 'include') diff --git a/include/net/sch_generic.h b/include/net/sch_generic.h index 1c123e2b2415..67f815e5d525 100644 --- a/include/net/sch_generic.h +++ b/include/net/sch_generic.h @@ -806,8 +806,11 @@ static inline struct Qdisc *qdisc_replace(struct Qdisc *sch, struct Qdisc *new, old = *pold; *pold = new; if (old != NULL) { - qdisc_tree_reduce_backlog(old, old->q.qlen, old->qstats.backlog); + unsigned int qlen = old->q.qlen; + unsigned int backlog = old->qstats.backlog; + qdisc_reset(old); + qdisc_tree_reduce_backlog(old, qlen, backlog); } sch_tree_unlock(sch); -- cgit v1.2.3-71-gd317 From dd1c1f2f2028a7b851f701fc6a8ebe39dcb95e7c Mon Sep 17 00:00:00 2001 From: Oleg Nesterov Date: Mon, 21 Aug 2017 17:35:02 +0200 Subject: pids: make task_tgid_nr_ns() safe This was reported many times, and this was even mentioned in commit 52ee2dfdd4f5 ("pids: refactor vnr/nr_ns helpers to make them safe") but somehow nobody bothered to fix the obvious problem: task_tgid_nr_ns() is not safe because task->group_leader points to nowhere after the exiting task passes exit_notify(), rcu_read_lock() can not help. We really need to change __unhash_process() to nullify group_leader, parent, and real_parent, but this needs some cleanups. Until then we can turn task_tgid_nr_ns() into another user of __task_pid_nr_ns() and fix the problem. Reported-by: Troy Kensinger Signed-off-by: Oleg Nesterov Signed-off-by: Linus Torvalds --- include/linux/pid.h | 4 +++- include/linux/sched.h | 51 +++++++++++++++++++++++++++------------------------ kernel/pid.c | 11 ++++------- 3 files changed, 34 insertions(+), 32 deletions(-) (limited to 'include') diff --git a/include/linux/pid.h b/include/linux/pid.h index 4d179316e431..719582744a2e 100644 --- a/include/linux/pid.h +++ b/include/linux/pid.h @@ -8,7 +8,9 @@ enum pid_type PIDTYPE_PID, PIDTYPE_PGID, PIDTYPE_SID, - PIDTYPE_MAX + PIDTYPE_MAX, + /* only valid to __task_pid_nr_ns() */ + __PIDTYPE_TGID }; /* diff --git a/include/linux/sched.h b/include/linux/sched.h index 8337e2db0bb2..c05ac5f5aa03 100644 --- a/include/linux/sched.h +++ b/include/linux/sched.h @@ -1163,13 +1163,6 @@ static inline pid_t task_tgid_nr(struct task_struct *tsk) return tsk->tgid; } -extern pid_t task_tgid_nr_ns(struct task_struct *tsk, struct pid_namespace *ns); - -static inline pid_t task_tgid_vnr(struct task_struct *tsk) -{ - return pid_vnr(task_tgid(tsk)); -} - /** * pid_alive - check that a task structure is not stale * @p: Task structure to be checked. @@ -1185,23 +1178,6 @@ static inline int pid_alive(const struct task_struct *p) return p->pids[PIDTYPE_PID].pid != NULL; } -static inline pid_t task_ppid_nr_ns(const struct task_struct *tsk, struct pid_namespace *ns) -{ - pid_t pid = 0; - - rcu_read_lock(); - if (pid_alive(tsk)) - pid = task_tgid_nr_ns(rcu_dereference(tsk->real_parent), ns); - rcu_read_unlock(); - - return pid; -} - -static inline pid_t task_ppid_nr(const struct task_struct *tsk) -{ - return task_ppid_nr_ns(tsk, &init_pid_ns); -} - static inline pid_t task_pgrp_nr_ns(struct task_struct *tsk, struct pid_namespace *ns) { return __task_pid_nr_ns(tsk, PIDTYPE_PGID, ns); @@ -1223,6 +1199,33 @@ static inline pid_t task_session_vnr(struct task_struct *tsk) return __task_pid_nr_ns(tsk, PIDTYPE_SID, NULL); } +static inline pid_t task_tgid_nr_ns(struct task_struct *tsk, struct pid_namespace *ns) +{ + return __task_pid_nr_ns(tsk, __PIDTYPE_TGID, ns); +} + +static inline pid_t task_tgid_vnr(struct task_struct *tsk) +{ + return __task_pid_nr_ns(tsk, __PIDTYPE_TGID, NULL); +} + +static inline pid_t task_ppid_nr_ns(const struct task_struct *tsk, struct pid_namespace *ns) +{ + pid_t pid = 0; + + rcu_read_lock(); + if (pid_alive(tsk)) + pid = task_tgid_nr_ns(rcu_dereference(tsk->real_parent), ns); + rcu_read_unlock(); + + return pid; +} + +static inline pid_t task_ppid_nr(const struct task_struct *tsk) +{ + return task_ppid_nr_ns(tsk, &init_pid_ns); +} + /* Obsolete, do not use: */ static inline pid_t task_pgrp_nr(struct task_struct *tsk) { diff --git a/kernel/pid.c b/kernel/pid.c index c69c30d827e5..020dedbdf066 100644 --- a/kernel/pid.c +++ b/kernel/pid.c @@ -527,8 +527,11 @@ pid_t __task_pid_nr_ns(struct task_struct *task, enum pid_type type, if (!ns) ns = task_active_pid_ns(current); if (likely(pid_alive(task))) { - if (type != PIDTYPE_PID) + if (type != PIDTYPE_PID) { + if (type == __PIDTYPE_TGID) + type = PIDTYPE_PID; task = task->group_leader; + } nr = pid_nr_ns(rcu_dereference(task->pids[type].pid), ns); } rcu_read_unlock(); @@ -537,12 +540,6 @@ pid_t __task_pid_nr_ns(struct task_struct *task, enum pid_type type, } EXPORT_SYMBOL(__task_pid_nr_ns); -pid_t task_tgid_nr_ns(struct task_struct *tsk, struct pid_namespace *ns) -{ - return pid_nr_ns(task_tgid(tsk), ns); -} -EXPORT_SYMBOL(task_tgid_nr_ns); - struct pid_namespace *task_active_pid_ns(struct task_struct *tsk) { return ns_of_pid(task_pid(tsk)); -- cgit v1.2.3-71-gd317 From 1e6ec9ea89d30739b9447c1860fcb07fc29f3aef Mon Sep 17 00:00:00 2001 From: Omar Sandoval Date: Wed, 23 Aug 2017 14:54:59 -0700 Subject: Revert "loop: support 4k physical blocksize" There's some stuff still up in the air, let's not get stuck with a subpar ABI. I'll follow up with something better for 4.14. Signed-off-by: Omar Sandoval Signed-off-by: Jens Axboe --- drivers/block/loop.c | 42 ++++++------------------------------------ drivers/block/loop.h | 1 - include/uapi/linux/loop.h | 3 --- 3 files changed, 6 insertions(+), 40 deletions(-) (limited to 'include') diff --git a/drivers/block/loop.c b/drivers/block/loop.c index ef8334949b42..f321b96405f5 100644 --- a/drivers/block/loop.c +++ b/drivers/block/loop.c @@ -221,8 +221,7 @@ static void __loop_update_dio(struct loop_device *lo, bool dio) } static int -figure_loop_size(struct loop_device *lo, loff_t offset, loff_t sizelimit, - loff_t logical_blocksize) +figure_loop_size(struct loop_device *lo, loff_t offset, loff_t sizelimit) { loff_t size = get_size(offset, sizelimit, lo->lo_backing_file); sector_t x = (sector_t)size; @@ -234,12 +233,6 @@ figure_loop_size(struct loop_device *lo, loff_t offset, loff_t sizelimit, lo->lo_offset = offset; if (lo->lo_sizelimit != sizelimit) lo->lo_sizelimit = sizelimit; - if (lo->lo_flags & LO_FLAGS_BLOCKSIZE) { - lo->lo_logical_blocksize = logical_blocksize; - blk_queue_physical_block_size(lo->lo_queue, lo->lo_blocksize); - blk_queue_logical_block_size(lo->lo_queue, - lo->lo_logical_blocksize); - } set_capacity(lo->lo_disk, x); bd_set_size(bdev, (loff_t)get_capacity(bdev->bd_disk) << 9); /* let user-space know about the new size */ @@ -820,7 +813,6 @@ static void loop_config_discard(struct loop_device *lo) struct file *file = lo->lo_backing_file; struct inode *inode = file->f_mapping->host; struct request_queue *q = lo->lo_queue; - int lo_bits = 9; /* * We use punch hole to reclaim the free space used by the @@ -840,11 +832,9 @@ static void loop_config_discard(struct loop_device *lo) q->limits.discard_granularity = inode->i_sb->s_blocksize; q->limits.discard_alignment = 0; - if (lo->lo_flags & LO_FLAGS_BLOCKSIZE) - lo_bits = blksize_bits(lo->lo_logical_blocksize); - blk_queue_max_discard_sectors(q, UINT_MAX >> lo_bits); - blk_queue_max_write_zeroes_sectors(q, UINT_MAX >> lo_bits); + blk_queue_max_discard_sectors(q, UINT_MAX >> 9); + blk_queue_max_write_zeroes_sectors(q, UINT_MAX >> 9); queue_flag_set_unlocked(QUEUE_FLAG_DISCARD, q); } @@ -938,7 +928,6 @@ static int loop_set_fd(struct loop_device *lo, fmode_t mode, lo->use_dio = false; lo->lo_blocksize = lo_blocksize; - lo->lo_logical_blocksize = 512; lo->lo_device = bdev; lo->lo_flags = lo_flags; lo->lo_backing_file = file; @@ -1104,7 +1093,6 @@ loop_set_status(struct loop_device *lo, const struct loop_info64 *info) int err; struct loop_func_table *xfer; kuid_t uid = current_uid(); - int lo_flags = lo->lo_flags; if (lo->lo_encrypt_key_size && !uid_eq(lo->lo_key_owner, uid) && @@ -1137,26 +1125,9 @@ loop_set_status(struct loop_device *lo, const struct loop_info64 *info) if (err) goto exit; - if (info->lo_flags & LO_FLAGS_BLOCKSIZE) { - if (!(lo->lo_flags & LO_FLAGS_BLOCKSIZE)) - lo->lo_logical_blocksize = 512; - lo->lo_flags |= LO_FLAGS_BLOCKSIZE; - if (LO_INFO_BLOCKSIZE(info) != 512 && - LO_INFO_BLOCKSIZE(info) != 1024 && - LO_INFO_BLOCKSIZE(info) != 2048 && - LO_INFO_BLOCKSIZE(info) != 4096) - return -EINVAL; - if (LO_INFO_BLOCKSIZE(info) > lo->lo_blocksize) - return -EINVAL; - } - if (lo->lo_offset != info->lo_offset || - lo->lo_sizelimit != info->lo_sizelimit || - lo->lo_flags != lo_flags || - ((lo->lo_flags & LO_FLAGS_BLOCKSIZE) && - lo->lo_logical_blocksize != LO_INFO_BLOCKSIZE(info))) { - if (figure_loop_size(lo, info->lo_offset, info->lo_sizelimit, - LO_INFO_BLOCKSIZE(info))) { + lo->lo_sizelimit != info->lo_sizelimit) { + if (figure_loop_size(lo, info->lo_offset, info->lo_sizelimit)) { err = -EFBIG; goto exit; } @@ -1348,8 +1319,7 @@ static int loop_set_capacity(struct loop_device *lo) if (unlikely(lo->lo_state != Lo_bound)) return -ENXIO; - return figure_loop_size(lo, lo->lo_offset, lo->lo_sizelimit, - lo->lo_logical_blocksize); + return figure_loop_size(lo, lo->lo_offset, lo->lo_sizelimit); } static int loop_set_dio(struct loop_device *lo, unsigned long arg) diff --git a/drivers/block/loop.h b/drivers/block/loop.h index 2c096b9a17b8..fecd3f97ef8c 100644 --- a/drivers/block/loop.h +++ b/drivers/block/loop.h @@ -49,7 +49,6 @@ struct loop_device { struct file * lo_backing_file; struct block_device *lo_device; unsigned lo_blocksize; - unsigned lo_logical_blocksize; void *key_data; gfp_t old_gfp_mask; diff --git a/include/uapi/linux/loop.h b/include/uapi/linux/loop.h index a3960f98679c..c8125ec1f4f2 100644 --- a/include/uapi/linux/loop.h +++ b/include/uapi/linux/loop.h @@ -22,7 +22,6 @@ enum { LO_FLAGS_AUTOCLEAR = 4, LO_FLAGS_PARTSCAN = 8, LO_FLAGS_DIRECT_IO = 16, - LO_FLAGS_BLOCKSIZE = 32, }; #include /* for __kernel_old_dev_t */ @@ -60,8 +59,6 @@ struct loop_info64 { __u64 lo_init[2]; }; -#define LO_INFO_BLOCKSIZE(l) (l)->lo_init[0] - /* * Loop filter types */ -- cgit v1.2.3-71-gd317 From 143c97cc652949893c8056c679012f0aeccb80e5 Mon Sep 17 00:00:00 2001 From: Linus Torvalds Date: Wed, 23 Aug 2017 18:16:11 -0700 Subject: Revert "pty: fix the cached path of the pty slave file descriptor in the master" This reverts commit c8c03f1858331e85d397bacccd34ef409aae993c. It turns out that while fixing the ptmx file descriptor to have the correct 'struct path' to the associated slave pty is a really good thing, it breaks some user space tools for a very annoying reason. The problem is that /dev/ptmx and its associated slave pty (/dev/pts/X) are on different mounts. That was what caused us to have the wrong path in the first place (we would mix up the vfsmount of the 'ptmx' node, with the dentry of the pty slave node), but it also means that now while we use the right vfsmount, having the pty master open also keeps the pts mount busy. And it turn sout that that makes 'pbuilder' very unhappy, as noted by Stefan Lippers-Hollmann: "This patch introduces a regression for me when using pbuilder 0.228.7[2] (a helper to build Debian packages in a chroot and to create and update its chroots) when trying to umount /dev/ptmx (inside the chroot) on Debian/ unstable (full log and pbuilder configuration file[3] attached). [...] Setting up build-essential (12.3) ... Processing triggers for libc-bin (2.24-15) ... I: unmounting dev/ptmx filesystem W: Could not unmount dev/ptmx: umount: /var/cache/pbuilder/build/1340/dev/ptmx: target is busy (In some cases useful info about processes that use the device is found by lsof(8) or fuser(1).)" apparently pbuilder tries to unmount the /dev/pts filesystem while still holding at least one master node open, which is arguably not very nice, but we don't break user space even when fixing other bugs. So this commit has to be reverted. I'll try to figure out a way to avoid caching the path to the slave pty in the master pty. The only thing that actually wants that slave pty path is the "TIOCGPTPEER" ioctl, and I think we could just recreate the path at that time. Reported-by: Stefan Lippers-Hollmann Cc: Eric W Biederman Cc: Christian Brauner Cc: Al Viro Signed-off-by: Linus Torvalds --- drivers/tty/pty.c | 7 ++----- fs/devpts/inode.c | 4 +--- include/linux/devpts_fs.h | 2 +- 3 files changed, 4 insertions(+), 9 deletions(-) (limited to 'include') diff --git a/drivers/tty/pty.c b/drivers/tty/pty.c index 1fc80ea87c13..284749fb0f6b 100644 --- a/drivers/tty/pty.c +++ b/drivers/tty/pty.c @@ -793,7 +793,6 @@ static int ptmx_open(struct inode *inode, struct file *filp) struct tty_struct *tty; struct path *pts_path; struct dentry *dentry; - struct vfsmount *mnt; int retval; int index; @@ -806,7 +805,7 @@ static int ptmx_open(struct inode *inode, struct file *filp) if (retval) return retval; - fsi = devpts_acquire(filp, &mnt); + fsi = devpts_acquire(filp); if (IS_ERR(fsi)) { retval = PTR_ERR(fsi); goto out_free_file; @@ -850,7 +849,7 @@ static int ptmx_open(struct inode *inode, struct file *filp) pts_path = kmalloc(sizeof(struct path), GFP_KERNEL); if (!pts_path) goto err_release; - pts_path->mnt = mnt; + pts_path->mnt = filp->f_path.mnt; pts_path->dentry = dentry; path_get(pts_path); tty->link->driver_data = pts_path; @@ -867,7 +866,6 @@ err_path_put: path_put(pts_path); kfree(pts_path); err_release: - mntput(mnt); tty_unlock(tty); // This will also put-ref the fsi tty_release(inode, filp); @@ -876,7 +874,6 @@ out: devpts_kill_index(fsi, index); out_put_fsi: devpts_release(fsi); - mntput(mnt); out_free_file: tty_free_file(filp); return retval; diff --git a/fs/devpts/inode.c b/fs/devpts/inode.c index 44dfbca9306f..108df2e3602c 100644 --- a/fs/devpts/inode.c +++ b/fs/devpts/inode.c @@ -133,7 +133,7 @@ static inline struct pts_fs_info *DEVPTS_SB(struct super_block *sb) return sb->s_fs_info; } -struct pts_fs_info *devpts_acquire(struct file *filp, struct vfsmount **ptsmnt) +struct pts_fs_info *devpts_acquire(struct file *filp) { struct pts_fs_info *result; struct path path; @@ -142,7 +142,6 @@ struct pts_fs_info *devpts_acquire(struct file *filp, struct vfsmount **ptsmnt) path = filp->f_path; path_get(&path); - *ptsmnt = NULL; /* Has the devpts filesystem already been found? */ sb = path.mnt->mnt_sb; @@ -166,7 +165,6 @@ struct pts_fs_info *devpts_acquire(struct file *filp, struct vfsmount **ptsmnt) * pty code needs to hold extra references in case of last /dev/tty close */ atomic_inc(&sb->s_active); - *ptsmnt = mntget(path.mnt); result = DEVPTS_SB(sb); out: diff --git a/include/linux/devpts_fs.h b/include/linux/devpts_fs.h index 7883e901f65c..277ab9af9ac2 100644 --- a/include/linux/devpts_fs.h +++ b/include/linux/devpts_fs.h @@ -19,7 +19,7 @@ struct pts_fs_info; -struct pts_fs_info *devpts_acquire(struct file *, struct vfsmount **ptsmnt); +struct pts_fs_info *devpts_acquire(struct file *); void devpts_release(struct pts_fs_info *); int devpts_new_index(struct pts_fs_info *); -- cgit v1.2.3-71-gd317 From 50b4d485528d1dbe0bd249f2073140e3444f4a7b Mon Sep 17 00:00:00 2001 From: Benjamin Block Date: Thu, 24 Aug 2017 01:57:56 +0200 Subject: bsg-lib: fix kernel panic resulting from missing allocation of reply-buffer Since we split the scsi_request out of struct request bsg fails to provide a reply-buffer for the drivers. This was done via the pointer for sense-data, that is not preallocated anymore. Failing to allocate/assign it results in illegal dereferences because LLDs use this pointer unquestioned. An example panic on s390x, using the zFCP driver, looks like this (I had debugging on, otherwise NULL-pointer dereferences wouldn't even panic on s390x): Unable to handle kernel pointer dereference in virtual kernel address space Failing address: 6b6b6b6b6b6b6000 TEID: 6b6b6b6b6b6b6403 Fault in home space mode while using kernel ASCE. AS:0000000001590007 R3:0000000000000024 Oops: 0038 ilc:2 [#1] PREEMPT SMP DEBUG_PAGEALLOC Modules linked in: CPU: 2 PID: 0 Comm: swapper/2 Not tainted 4.12.0-bsg-regression+ #3 Hardware name: IBM 2964 N96 702 (z/VM 6.4.0) task: 0000000065cb0100 task.stack: 0000000065cb4000 Krnl PSW : 0704e00180000000 000003ff801e4156 (zfcp_fc_ct_els_job_handler+0x16/0x58 [zfcp]) R:0 T:1 IO:1 EX:1 Key:0 M:1 W:0 P:0 AS:3 CC:2 PM:0 RI:0 EA:3 Krnl GPRS: 0000000000000001 000000005fa9d0d0 000000005fa9d078 0000000000e16866 000003ff00000290 6b6b6b6b6b6b6b6b 0000000059f78f00 000000000000000f 00000000593a0958 00000000593a0958 0000000060d88800 000000005ddd4c38 0000000058b50100 07000000659cba08 000003ff801e8556 00000000659cb9a8 Krnl Code: 000003ff801e4146: e31020500004 lg %r1,80(%r2) 000003ff801e414c: 58402040 l %r4,64(%r2) #000003ff801e4150: e35020200004 lg %r5,32(%r2) >000003ff801e4156: 50405004 st %r4,4(%r5) 000003ff801e415a: e54c50080000 mvhi 8(%r5),0 000003ff801e4160: e33010280012 lt %r3,40(%r1) 000003ff801e4166: a718fffb lhi %r1,-5 000003ff801e416a: 1803 lr %r0,%r3 Call Trace: ([<000003ff801e8556>] zfcp_fsf_req_complete+0x726/0x768 [zfcp]) [<000003ff801ea82a>] zfcp_fsf_reqid_check+0x102/0x180 [zfcp] [<000003ff801eb980>] zfcp_qdio_int_resp+0x230/0x278 [zfcp] [<00000000009b91b6>] qdio_kick_handler+0x2ae/0x2c8 [<00000000009b9e3e>] __tiqdio_inbound_processing+0x406/0xc10 [<00000000001684c2>] tasklet_action+0x15a/0x1d8 [<0000000000bd28ec>] __do_softirq+0x3ec/0x848 [<00000000001675a4>] irq_exit+0x74/0xf8 [<000000000010dd6a>] do_IRQ+0xba/0xf0 [<0000000000bd19e8>] io_int_handler+0x104/0x2d4 [<00000000001033b6>] enabled_wait+0xb6/0x188 ([<000000000010339e>] enabled_wait+0x9e/0x188) [<000000000010396a>] arch_cpu_idle+0x32/0x50 [<0000000000bd0112>] default_idle_call+0x52/0x68 [<00000000001cd0fa>] do_idle+0x102/0x188 [<00000000001cd41e>] cpu_startup_entry+0x3e/0x48 [<0000000000118c64>] smp_start_secondary+0x11c/0x130 [<0000000000bd2016>] restart_int_handler+0x62/0x78 [<0000000000000000>] (null) INFO: lockdep is turned off. Last Breaking-Event-Address: [<000003ff801e41d6>] zfcp_fc_ct_job_handler+0x3e/0x48 [zfcp] Kernel panic - not syncing: Fatal exception in interrupt This patch moves bsg-lib to allocate and setup struct bsg_job ahead of time, including the allocation of a buffer for the reply-data. This means, struct bsg_job is not allocated separately anymore, but as part of struct request allocation - similar to struct scsi_cmd. Reflect this in the function names that used to handle creation/destruction of struct bsg_job. Reported-by: Steffen Maier Suggested-by: Christoph Hellwig Reviewed-by: Christoph Hellwig Signed-off-by: Benjamin Block Fixes: 82ed4db499b8 ("block: split scsi_request out of struct request") Cc: #4.11+ Signed-off-by: Jens Axboe --- block/bsg-lib.c | 74 +++++++++++++++++++++++++++++-------------------- include/linux/blkdev.h | 1 - include/linux/bsg-lib.h | 2 ++ 3 files changed, 46 insertions(+), 31 deletions(-) (limited to 'include') diff --git a/block/bsg-lib.c b/block/bsg-lib.c index c4513b23f57a..dd56d7460cb9 100644 --- a/block/bsg-lib.c +++ b/block/bsg-lib.c @@ -29,26 +29,25 @@ #include /** - * bsg_destroy_job - routine to teardown/delete a bsg job + * bsg_teardown_job - routine to teardown a bsg job * @job: bsg_job that is to be torn down */ -static void bsg_destroy_job(struct kref *kref) +static void bsg_teardown_job(struct kref *kref) { struct bsg_job *job = container_of(kref, struct bsg_job, kref); struct request *rq = job->req; - blk_end_request_all(rq, BLK_STS_OK); - put_device(job->dev); /* release reference for the request */ kfree(job->request_payload.sg_list); kfree(job->reply_payload.sg_list); - kfree(job); + + blk_end_request_all(rq, BLK_STS_OK); } void bsg_job_put(struct bsg_job *job) { - kref_put(&job->kref, bsg_destroy_job); + kref_put(&job->kref, bsg_teardown_job); } EXPORT_SYMBOL_GPL(bsg_job_put); @@ -100,7 +99,7 @@ EXPORT_SYMBOL_GPL(bsg_job_done); */ static void bsg_softirq_done(struct request *rq) { - struct bsg_job *job = rq->special; + struct bsg_job *job = blk_mq_rq_to_pdu(rq); bsg_job_put(job); } @@ -122,33 +121,20 @@ static int bsg_map_buffer(struct bsg_buffer *buf, struct request *req) } /** - * bsg_create_job - create the bsg_job structure for the bsg request + * bsg_prepare_job - create the bsg_job structure for the bsg request * @dev: device that is being sent the bsg request * @req: BSG request that needs a job structure */ -static int bsg_create_job(struct device *dev, struct request *req) +static int bsg_prepare_job(struct device *dev, struct request *req) { struct request *rsp = req->next_rq; - struct request_queue *q = req->q; struct scsi_request *rq = scsi_req(req); - struct bsg_job *job; + struct bsg_job *job = blk_mq_rq_to_pdu(req); int ret; - BUG_ON(req->special); - - job = kzalloc(sizeof(struct bsg_job) + q->bsg_job_size, GFP_KERNEL); - if (!job) - return -ENOMEM; - - req->special = job; - job->req = req; - if (q->bsg_job_size) - job->dd_data = (void *)&job[1]; job->request = rq->cmd; job->request_len = rq->cmd_len; - job->reply = rq->sense; - job->reply_len = SCSI_SENSE_BUFFERSIZE; /* Size of sense buffer - * allocated */ + if (req->bio) { ret = bsg_map_buffer(&job->request_payload, req); if (ret) @@ -187,7 +173,6 @@ static void bsg_request_fn(struct request_queue *q) { struct device *dev = q->queuedata; struct request *req; - struct bsg_job *job; int ret; if (!get_device(dev)) @@ -199,7 +184,7 @@ static void bsg_request_fn(struct request_queue *q) break; spin_unlock_irq(q->queue_lock); - ret = bsg_create_job(dev, req); + ret = bsg_prepare_job(dev, req); if (ret) { scsi_req(req)->result = ret; blk_end_request_all(req, BLK_STS_OK); @@ -207,8 +192,7 @@ static void bsg_request_fn(struct request_queue *q) continue; } - job = req->special; - ret = q->bsg_job_fn(job); + ret = q->bsg_job_fn(blk_mq_rq_to_pdu(req)); spin_lock_irq(q->queue_lock); if (ret) break; @@ -219,6 +203,35 @@ static void bsg_request_fn(struct request_queue *q) spin_lock_irq(q->queue_lock); } +static int bsg_init_rq(struct request_queue *q, struct request *req, gfp_t gfp) +{ + struct bsg_job *job = blk_mq_rq_to_pdu(req); + struct scsi_request *sreq = &job->sreq; + + memset(job, 0, sizeof(*job)); + + scsi_req_init(sreq); + sreq->sense_len = SCSI_SENSE_BUFFERSIZE; + sreq->sense = kzalloc(sreq->sense_len, gfp); + if (!sreq->sense) + return -ENOMEM; + + job->req = req; + job->reply = sreq->sense; + job->reply_len = sreq->sense_len; + job->dd_data = job + 1; + + return 0; +} + +static void bsg_exit_rq(struct request_queue *q, struct request *req) +{ + struct bsg_job *job = blk_mq_rq_to_pdu(req); + struct scsi_request *sreq = &job->sreq; + + kfree(sreq->sense); +} + /** * bsg_setup_queue - Create and add the bsg hooks so we can receive requests * @dev: device to attach bsg device to @@ -235,7 +248,9 @@ struct request_queue *bsg_setup_queue(struct device *dev, char *name, q = blk_alloc_queue(GFP_KERNEL); if (!q) return ERR_PTR(-ENOMEM); - q->cmd_size = sizeof(struct scsi_request); + q->cmd_size = sizeof(struct bsg_job) + dd_job_size; + q->init_rq_fn = bsg_init_rq; + q->exit_rq_fn = bsg_exit_rq; q->request_fn = bsg_request_fn; ret = blk_init_allocated_queue(q); @@ -243,7 +258,6 @@ struct request_queue *bsg_setup_queue(struct device *dev, char *name, goto out_cleanup_queue; q->queuedata = dev; - q->bsg_job_size = dd_job_size; q->bsg_job_fn = job_fn; queue_flag_set_unlocked(QUEUE_FLAG_BIDI, q); queue_flag_set_unlocked(QUEUE_FLAG_SCSI_PASSTHROUGH, q); diff --git a/include/linux/blkdev.h b/include/linux/blkdev.h index 25f6a0cb27d3..2a5d52fa90f5 100644 --- a/include/linux/blkdev.h +++ b/include/linux/blkdev.h @@ -568,7 +568,6 @@ struct request_queue { #if defined(CONFIG_BLK_DEV_BSG) bsg_job_fn *bsg_job_fn; - int bsg_job_size; struct bsg_class_device bsg_dev; #endif diff --git a/include/linux/bsg-lib.h b/include/linux/bsg-lib.h index e34dde2da0ef..637a20cfb237 100644 --- a/include/linux/bsg-lib.h +++ b/include/linux/bsg-lib.h @@ -24,6 +24,7 @@ #define _BLK_BSG_ #include +#include struct request; struct device; @@ -37,6 +38,7 @@ struct bsg_buffer { }; struct bsg_job { + struct scsi_request sreq; struct device *dev; struct request *req; -- cgit v1.2.3-71-gd317 From 498ca3c82a7b11e152a46c253f6b2087c929ce00 Mon Sep 17 00:00:00 2001 From: Noa Osherovich Date: Wed, 23 Aug 2017 08:35:40 +0300 Subject: IB/core: Avoid accessing non-allocated memory when inferring port type Commit 44c58487d51a ("IB/core: Define 'ib' and 'roce' rdma_ah_attr types") introduced the concept of type in ah_attr: * During ib_register_device, each port is checked for its type which is stored in ib_device's port_immutable array. * During uverbs' modify_qp, the type is inferred using the port number in ib_uverbs_qp_dest struct (address vector) by accessing the relevant port_immutable array and the type is passed on to providers. IB spec (version 1.3) enforces a valid port value only in Reset to Init. During Init to RTR, the address vector must be valid but port number is not mentioned as a field in the address vector, so its value is not validated, which leads to accesses to a non-allocated memory when inferring the port type. Save the real port number in ib_qp during modify to Init (when the comp_mask indicates that the port number is valid) and use this value to infer the port type. Avoid copying the address vector fields if the matching bit is not set in the attr_mask. Address vector can't be modified before the port, so no valid flow is affected. Fixes: 44c58487d51a ('IB/core: Define 'ib' and 'roce' rdma_ah_attr types') Signed-off-by: Noa Osherovich Reviewed-by: Yishai Hadas Signed-off-by: Leon Romanovsky Signed-off-by: Doug Ledford --- drivers/infiniband/core/uverbs_cmd.c | 11 +++++++---- drivers/infiniband/core/verbs.c | 7 ++++++- include/rdma/ib_verbs.h | 1 + 3 files changed, 14 insertions(+), 5 deletions(-) (limited to 'include') diff --git a/drivers/infiniband/core/uverbs_cmd.c b/drivers/infiniband/core/uverbs_cmd.c index 55822ae71955..739bd69ef1d4 100644 --- a/drivers/infiniband/core/uverbs_cmd.c +++ b/drivers/infiniband/core/uverbs_cmd.c @@ -1522,6 +1522,7 @@ static int create_qp(struct ib_uverbs_file *file, qp->qp_type = attr.qp_type; atomic_set(&qp->usecnt, 0); atomic_inc(&pd->usecnt); + qp->port = 0; if (attr.send_cq) atomic_inc(&attr.send_cq->usecnt); if (attr.recv_cq) @@ -1962,8 +1963,9 @@ static int modify_qp(struct ib_uverbs_file *file, attr->alt_timeout = cmd->base.alt_timeout; attr->rate_limit = cmd->rate_limit; - attr->ah_attr.type = rdma_ah_find_type(qp->device, - cmd->base.dest.port_num); + if (cmd->base.attr_mask & IB_QP_AV) + attr->ah_attr.type = rdma_ah_find_type(qp->device, + cmd->base.dest.port_num); if (cmd->base.dest.is_global) { rdma_ah_set_grh(&attr->ah_attr, NULL, cmd->base.dest.flow_label, @@ -1981,8 +1983,9 @@ static int modify_qp(struct ib_uverbs_file *file, rdma_ah_set_port_num(&attr->ah_attr, cmd->base.dest.port_num); - attr->alt_ah_attr.type = rdma_ah_find_type(qp->device, - cmd->base.dest.port_num); + if (cmd->base.attr_mask & IB_QP_ALT_PATH) + attr->alt_ah_attr.type = + rdma_ah_find_type(qp->device, cmd->base.dest.port_num); if (cmd->base.alt_dest.is_global) { rdma_ah_set_grh(&attr->alt_ah_attr, NULL, cmd->base.alt_dest.flow_label, diff --git a/drivers/infiniband/core/verbs.c b/drivers/infiniband/core/verbs.c index 7f8fe443df46..b456e3ca1876 100644 --- a/drivers/infiniband/core/verbs.c +++ b/drivers/infiniband/core/verbs.c @@ -838,6 +838,7 @@ struct ib_qp *ib_create_qp(struct ib_pd *pd, spin_lock_init(&qp->mr_lock); INIT_LIST_HEAD(&qp->rdma_mrs); INIT_LIST_HEAD(&qp->sig_mrs); + qp->port = 0; if (qp_init_attr->qp_type == IB_QPT_XRC_TGT) return ib_create_xrc_qp(qp, qp_init_attr); @@ -1297,7 +1298,11 @@ int ib_modify_qp_with_udata(struct ib_qp *qp, struct ib_qp_attr *attr, if (ret) return ret; } - return ib_security_modify_qp(qp, attr, attr_mask, udata); + ret = ib_security_modify_qp(qp, attr, attr_mask, udata); + if (!ret && (attr_mask & IB_QP_PORT)) + qp->port = attr->port_num; + + return ret; } EXPORT_SYMBOL(ib_modify_qp_with_udata); diff --git a/include/rdma/ib_verbs.h b/include/rdma/ib_verbs.h index b5732432bb29..88c32aba32f7 100644 --- a/include/rdma/ib_verbs.h +++ b/include/rdma/ib_verbs.h @@ -1683,6 +1683,7 @@ struct ib_qp { enum ib_qp_type qp_type; struct ib_rwq_ind_table *rwq_ind_tbl; struct ib_qp_security *qp_sec; + u8 port; }; struct ib_mr { -- cgit v1.2.3-71-gd317 From 311fc65c9fb9c966bca8e6f3ff8132ce57344ab9 Mon Sep 17 00:00:00 2001 From: "Eric W. Biederman" Date: Thu, 24 Aug 2017 15:13:29 -0500 Subject: pty: Repair TIOCGPTPEER The implementation of TIOCGPTPEER has two issues. When /dev/ptmx (as opposed to /dev/pts/ptmx) is opened the wrong vfsmount is passed to dentry_open. Which results in the kernel displaying the wrong pathname for the peer. The second is simply by caching the vfsmount and dentry of the peer it leaves them open, in a way they were not previously Which because of the inreased reference counts can cause unnecessary behaviour differences resulting in regressions. To fix these move the ioctl into tty_io.c at a generic level allowing the ioctl to have access to the struct file on which the ioctl is being called. This allows the path of the slave to be derived when opening the slave through TIOCGPTPEER instead of requiring the path to the slave be cached. Thus removing the need for caching the path. A new function devpts_ptmx_path is factored out of devpts_acquire and used to implement a function devpts_mntget. The new function devpts_mntget takes a filp to perform the lookup on and fsi so that it can confirm that the superblock that is found by devpts_ptmx_path is the proper superblock. v2: Lots of fixes to make the code actually work v3: Suggestions by Linus - Removed the unnecessary initialization of filp in ptm_open_peer - Simplified devpts_ptmx_path as gotos are no longer required [ This is the fix for the issue that was reverted in commit 143c97cc6529, but this time without breaking 'pbuilder' due to increased reference counts - Linus ] Fixes: 54ebbfb16034 ("tty: add TIOCGPTPEER ioctl") Reported-by: Christian Brauner Reported-and-tested-by: Stefan Lippers-Hollmann Signed-off-by: "Eric W. Biederman" Signed-off-by: Linus Torvalds --- drivers/tty/pty.c | 64 ++++++++++++++++++++-------------------------- drivers/tty/tty_io.c | 3 +++ fs/devpts/inode.c | 65 +++++++++++++++++++++++++++++++++++------------ include/linux/devpts_fs.h | 10 ++++++++ 4 files changed, 89 insertions(+), 53 deletions(-) (limited to 'include') diff --git a/drivers/tty/pty.c b/drivers/tty/pty.c index 284749fb0f6b..a6d5164c33a9 100644 --- a/drivers/tty/pty.c +++ b/drivers/tty/pty.c @@ -69,13 +69,8 @@ static void pty_close(struct tty_struct *tty, struct file *filp) #ifdef CONFIG_UNIX98_PTYS if (tty->driver == ptm_driver) { mutex_lock(&devpts_mutex); - if (tty->link->driver_data) { - struct path *path = tty->link->driver_data; - - devpts_pty_kill(path->dentry); - path_put(path); - kfree(path); - } + if (tty->link->driver_data) + devpts_pty_kill(tty->link->driver_data); mutex_unlock(&devpts_mutex); } #endif @@ -607,25 +602,24 @@ static inline void legacy_pty_init(void) { } static struct cdev ptmx_cdev; /** - * pty_open_peer - open the peer of a pty - * @tty: the peer of the pty being opened + * ptm_open_peer - open the peer of a pty + * @master: the open struct file of the ptmx device node + * @tty: the master of the pty being opened + * @flags: the flags for open * - * Open the cached dentry in tty->link, providing a safe way for userspace - * to get the slave end of a pty (where they have the master fd and cannot - * access or trust the mount namespace /dev/pts was mounted inside). + * Provide a race free way for userspace to open the slave end of a pty + * (where they have the master fd and cannot access or trust the mount + * namespace /dev/pts was mounted inside). */ -static struct file *pty_open_peer(struct tty_struct *tty, int flags) -{ - if (tty->driver->subtype != PTY_TYPE_MASTER) - return ERR_PTR(-EIO); - return dentry_open(tty->link->driver_data, flags, current_cred()); -} - -static int pty_get_peer(struct tty_struct *tty, int flags) +int ptm_open_peer(struct file *master, struct tty_struct *tty, int flags) { int fd = -1; - struct file *filp = NULL; + struct file *filp; int retval = -EINVAL; + struct path path; + + if (tty->driver != ptm_driver) + return -EIO; fd = get_unused_fd_flags(0); if (fd < 0) { @@ -633,7 +627,16 @@ static int pty_get_peer(struct tty_struct *tty, int flags) goto err; } - filp = pty_open_peer(tty, flags); + /* Compute the slave's path */ + path.mnt = devpts_mntget(master, tty->driver_data); + if (IS_ERR(path.mnt)) { + retval = PTR_ERR(path.mnt); + goto err_put; + } + path.dentry = tty->link->driver_data; + + filp = dentry_open(&path, flags, current_cred()); + mntput(path.mnt); if (IS_ERR(filp)) { retval = PTR_ERR(filp); goto err_put; @@ -662,8 +665,6 @@ static int pty_unix98_ioctl(struct tty_struct *tty, return pty_get_pktmode(tty, (int __user *)arg); case TIOCGPTN: /* Get PT Number */ return put_user(tty->index, (unsigned int __user *)arg); - case TIOCGPTPEER: /* Open the other end */ - return pty_get_peer(tty, (int) arg); case TIOCSIG: /* Send signal to other side of pty */ return pty_signal(tty, (int) arg); } @@ -791,7 +792,6 @@ static int ptmx_open(struct inode *inode, struct file *filp) { struct pts_fs_info *fsi; struct tty_struct *tty; - struct path *pts_path; struct dentry *dentry; int retval; int index; @@ -845,26 +845,16 @@ static int ptmx_open(struct inode *inode, struct file *filp) retval = PTR_ERR(dentry); goto err_release; } - /* We need to cache a fake path for TIOCGPTPEER. */ - pts_path = kmalloc(sizeof(struct path), GFP_KERNEL); - if (!pts_path) - goto err_release; - pts_path->mnt = filp->f_path.mnt; - pts_path->dentry = dentry; - path_get(pts_path); - tty->link->driver_data = pts_path; + tty->link->driver_data = dentry; retval = ptm_driver->ops->open(tty, filp); if (retval) - goto err_path_put; + goto err_release; tty_debug_hangup(tty, "opening (count=%d)\n", tty->count); tty_unlock(tty); return 0; -err_path_put: - path_put(pts_path); - kfree(pts_path); err_release: tty_unlock(tty); // This will also put-ref the fsi diff --git a/drivers/tty/tty_io.c b/drivers/tty/tty_io.c index 974b13d24401..10c4038c0e8d 100644 --- a/drivers/tty/tty_io.c +++ b/drivers/tty/tty_io.c @@ -2518,6 +2518,9 @@ long tty_ioctl(struct file *file, unsigned int cmd, unsigned long arg) case TIOCSSERIAL: tty_warn_deprecated_flags(p); break; + case TIOCGPTPEER: + /* Special because the struct file is needed */ + return ptm_open_peer(file, tty, (int)arg); default: retval = tty_jobctrl_ioctl(tty, real_tty, file, cmd, arg); if (retval != -ENOIOCTLCMD) diff --git a/fs/devpts/inode.c b/fs/devpts/inode.c index 108df2e3602c..7eae33ffa3fc 100644 --- a/fs/devpts/inode.c +++ b/fs/devpts/inode.c @@ -133,6 +133,50 @@ static inline struct pts_fs_info *DEVPTS_SB(struct super_block *sb) return sb->s_fs_info; } +static int devpts_ptmx_path(struct path *path) +{ + struct super_block *sb; + int err; + + /* Has the devpts filesystem already been found? */ + if (path->mnt->mnt_sb->s_magic == DEVPTS_SUPER_MAGIC) + return 0; + + /* Is a devpts filesystem at "pts" in the same directory? */ + err = path_pts(path); + if (err) + return err; + + /* Is the path the root of a devpts filesystem? */ + sb = path->mnt->mnt_sb; + if ((sb->s_magic != DEVPTS_SUPER_MAGIC) || + (path->mnt->mnt_root != sb->s_root)) + return -ENODEV; + + return 0; +} + +struct vfsmount *devpts_mntget(struct file *filp, struct pts_fs_info *fsi) +{ + struct path path; + int err; + + path = filp->f_path; + path_get(&path); + + err = devpts_ptmx_path(&path); + dput(path.dentry); + if (err) { + mntput(path.mnt); + path.mnt = ERR_PTR(err); + } + if (DEVPTS_SB(path.mnt->mnt_sb) != fsi) { + mntput(path.mnt); + path.mnt = ERR_PTR(-ENODEV); + } + return path.mnt; +} + struct pts_fs_info *devpts_acquire(struct file *filp) { struct pts_fs_info *result; @@ -143,27 +187,16 @@ struct pts_fs_info *devpts_acquire(struct file *filp) path = filp->f_path; path_get(&path); - /* Has the devpts filesystem already been found? */ - sb = path.mnt->mnt_sb; - if (sb->s_magic != DEVPTS_SUPER_MAGIC) { - /* Is a devpts filesystem at "pts" in the same directory? */ - err = path_pts(&path); - if (err) { - result = ERR_PTR(err); - goto out; - } - - /* Is the path the root of a devpts filesystem? */ - result = ERR_PTR(-ENODEV); - sb = path.mnt->mnt_sb; - if ((sb->s_magic != DEVPTS_SUPER_MAGIC) || - (path.mnt->mnt_root != sb->s_root)) - goto out; + err = devpts_ptmx_path(&path); + if (err) { + result = ERR_PTR(err); + goto out; } /* * pty code needs to hold extra references in case of last /dev/tty close */ + sb = path.mnt->mnt_sb; atomic_inc(&sb->s_active); result = DEVPTS_SB(sb); diff --git a/include/linux/devpts_fs.h b/include/linux/devpts_fs.h index 277ab9af9ac2..100cb4343763 100644 --- a/include/linux/devpts_fs.h +++ b/include/linux/devpts_fs.h @@ -19,6 +19,7 @@ struct pts_fs_info; +struct vfsmount *devpts_mntget(struct file *, struct pts_fs_info *); struct pts_fs_info *devpts_acquire(struct file *); void devpts_release(struct pts_fs_info *); @@ -32,6 +33,15 @@ void *devpts_get_priv(struct dentry *); /* unlink */ void devpts_pty_kill(struct dentry *); +/* in pty.c */ +int ptm_open_peer(struct file *master, struct tty_struct *tty, int flags); + +#else +static inline int +ptm_open_peer(struct file *master, struct tty_struct *tty, int flags) +{ + return -EIO; +} #endif -- cgit v1.2.3-71-gd317 From 0cc3b0ec23ce4c69e1e890ed2b8d2fa932b14aad Mon Sep 17 00:00:00 2001 From: Linus Torvalds Date: Sun, 27 Aug 2017 12:12:25 -0700 Subject: Clarify (and fix) MAX_LFS_FILESIZE macros We have a MAX_LFS_FILESIZE macro that is meant to be filled in by filesystems (and other IO targets) that know they are 64-bit clean and don't have any 32-bit limits in their IO path. It turns out that our 32-bit value for that limit was bogus. On 32-bit, the VM layer is limited by the page cache to only 32-bit index values, but our logic for that was confusing and actually wrong. We used to define that value to (((loff_t)PAGE_SIZE << (BITS_PER_LONG-1))-1) which is actually odd in several ways: it limits the index to 31 bits, and then it limits files so that they can't have data in that last byte of a page that has the highest 31-bit index (ie page index 0x7fffffff). Neither of those limitations make sense. The index is actually the full 32 bit unsigned value, and we can use that whole full page. So the maximum size of the file would logically be "PAGE_SIZE << BITS_PER_LONG". However, we do wan tto avoid the maximum index, because we have code that iterates over the page indexes, and we don't want that code to overflow. So the maximum size of a file on a 32-bit host should actually be one page less than the full 32-bit index. So the actual limit is ULONG_MAX << PAGE_SHIFT. That means that we will not actually be using the page of that last index (ULONG_MAX), but we can grow a file up to that limit. The wrong value of MAX_LFS_FILESIZE actually caused problems for Doug Nazar, who was still using a 32-bit host, but with a 9.7TB 2 x RAID5 volume. It turns out that our old MAX_LFS_FILESIZE was 8TiB (well, one byte less), but the actual true VM limit is one page less than 16TiB. This was invisible until commit c2a9737f45e2 ("vfs,mm: fix a dead loop in truncate_inode_pages_range()"), which started applying that MAX_LFS_FILESIZE limit to block devices too. NOTE! On 64-bit, the page index isn't a limiter at all, and the limit is actually just the offset type itself (loff_t), which is signed. But for clarity, on 64-bit, just use the maximum signed value, and don't make people have to count the number of 'f' characters in the hex constant. So just use LLONG_MAX for the 64-bit case. That was what the value had been before too, just written out as a hex constant. Fixes: c2a9737f45e2 ("vfs,mm: fix a dead loop in truncate_inode_pages_range()") Reported-and-tested-by: Doug Nazar Cc: Andreas Dilger Cc: Mark Fasheh Cc: Joel Becker Cc: Dave Kleikamp Cc: stable@kernel.org Signed-off-by: Linus Torvalds --- include/linux/fs.h | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) (limited to 'include') diff --git a/include/linux/fs.h b/include/linux/fs.h index 6e1fd5d21248..cbfe127bccf8 100644 --- a/include/linux/fs.h +++ b/include/linux/fs.h @@ -907,9 +907,9 @@ static inline struct file *get_file(struct file *f) /* Page cache limit. The filesystems should put that into their s_maxbytes limits, otherwise bad things can happen in VM. */ #if BITS_PER_LONG==32 -#define MAX_LFS_FILESIZE (((loff_t)PAGE_SIZE << (BITS_PER_LONG-1))-1) +#define MAX_LFS_FILESIZE ((loff_t)ULONG_MAX << PAGE_SHIFT) #elif BITS_PER_LONG==64 -#define MAX_LFS_FILESIZE ((loff_t)0x7fffffffffffffffLL) +#define MAX_LFS_FILESIZE ((loff_t)LLONG_MAX) #endif #define FL_POSIX 1 -- cgit v1.2.3-71-gd317