From afaef01c001537fa97a25092d7f54d764dc7d8c1 Mon Sep 17 00:00:00 2001
From: Alexander Popov
Date: Fri, 17 Aug 2018 01:16:58 +0300
Subject: x86/entry: Add STACKLEAK erasing the kernel stack at the end of syscalls

The STACKLEAK feature (initially developed by PaX Team) has the following
benefits:

1. Reduces the information that can be revealed through kernel stack leak
   bugs. The idea of erasing the thread stack at the end of syscalls is
   similar to CONFIG_PAGE_POISONING and memzero_explicit() in kernel
   crypto, which all comply with FDP_RIP.2 (Full Residual Information
   Protection) of the Common Criteria standard.

2. Blocks some uninitialized stack variable attacks (e.g. CVE-2017-17712,
   CVE-2010-2963). Those kinds of bugs should eventually be eliminated by
   improved C compilers, which might take a long time.

This commit introduces the code that fills the used part of the kernel
stack with a poison value before returning to userspace. The full
STACKLEAK feature also contains the gcc plugin, which comes in a separate
commit.

The STACKLEAK feature is ported from grsecurity/PaX. More information at:
  https://grsecurity.net/
  https://pax.grsecurity.net/

This code is modified from Brad Spengler/PaX Team's code in the last
public patch of grsecurity/PaX based on our understanding of the code.
Changes or omissions from the original code are ours and don't reflect
the original grsecurity/PaX code.

Performance impact:

  Hardware: Intel Core i7-4770, 16 GB RAM

  Test #1: building the Linux kernel on a single core
    0.91% slowdown

  Test #2: hackbench -s 4096 -l 2000 -g 15 -f 25 -P
    4.2% slowdown

So the STACKLEAK description in Kconfig includes: "The tradeoff is the
performance impact: on a single CPU system kernel compilation sees a 1%
slowdown, other systems and workloads may vary and you are advised to
test this feature on your expected workload before deploying it".

Signed-off-by: Alexander Popov
Acked-by: Thomas Gleixner
Reviewed-by: Dave Hansen
Acked-by: Ingo Molnar
Signed-off-by: Kees Cook
---
 include/linux/sched.h     |  4 ++++
 include/linux/stackleak.h | 26 ++++++++++++++++++++++++++
 2 files changed, 30 insertions(+)
 create mode 100644 include/linux/stackleak.h

(limited to 'include/linux')

diff --git a/include/linux/sched.h b/include/linux/sched.h
index 977cb57d7bc9..c1a23acd24e7 100644
--- a/include/linux/sched.h
+++ b/include/linux/sched.h
@@ -1192,6 +1192,10 @@ struct task_struct {
 	void				*security;
 #endif
 
+#ifdef CONFIG_GCC_PLUGIN_STACKLEAK
+	unsigned long			lowest_stack;
+#endif
+
 	/*
 	 * New fields for task_struct should be added above here, so that
 	 * they are included in the randomized portion of task_struct.
diff --git a/include/linux/stackleak.h b/include/linux/stackleak.h
new file mode 100644
index 000000000000..628c2b947b89
--- /dev/null
+++ b/include/linux/stackleak.h
@@ -0,0 +1,26 @@
+/* SPDX-License-Identifier: GPL-2.0 */
+#ifndef _LINUX_STACKLEAK_H
+#define _LINUX_STACKLEAK_H
+
+#include <linux/sched.h>
+#include <linux/sched/task_stack.h>
+
+/*
+ * Check that the poison value points to the unused hole in the
+ * virtual memory map for your platform.
+ */
+#define STACKLEAK_POISON -0xBEEF
+#define STACKLEAK_SEARCH_DEPTH 128
+
+#ifdef CONFIG_GCC_PLUGIN_STACKLEAK
+#include <asm/stacktrace.h>
+
+static inline void stackleak_task_init(struct task_struct *t)
+{
+	t->lowest_stack = (unsigned long)end_of_stack(t) + sizeof(unsigned long);
+}
+#else /* !CONFIG_GCC_PLUGIN_STACKLEAK */
+static inline void stackleak_task_init(struct task_struct *t) { }
+#endif
+
+#endif

From c8d126275a5fa59394fe17109bdb9812fed296b8 Mon Sep 17 00:00:00 2001
From: Alexander Popov
Date: Fri, 17 Aug 2018 01:17:01 +0300
Subject: fs/proc: Show STACKLEAK metrics in the /proc file system

Introduce CONFIG_STACKLEAK_METRICS providing STACKLEAK information about
tasks via the /proc file system. In particular, /proc/<pid>/stack_depth
shows the maximum kernel stack consumption for the current and previous
syscalls. Although this information is not precise, it can be useful for
estimating the STACKLEAK performance impact for your workloads.

Suggested-by: Ingo Molnar
Signed-off-by: Alexander Popov
Tested-by: Laura Abbott
Signed-off-by: Kees Cook
---
 fs/proc/base.c              | 18 ++++++++++++++++++
 include/linux/sched.h       |  1 +
 include/linux/stackleak.h   |  3 +++
 kernel/stackleak.c          |  4 ++++
 scripts/gcc-plugins/Kconfig | 12 ++++++++++++
 5 files changed, 38 insertions(+)

(limited to 'include/linux')

diff --git a/fs/proc/base.c b/fs/proc/base.c
index ccf86f16d9f0..2a238d68610e 100644
--- a/fs/proc/base.c
+++ b/fs/proc/base.c
@@ -2891,6 +2891,21 @@ static int proc_pid_patch_state(struct seq_file *m, struct pid_namespace *ns,
 }
 #endif /* CONFIG_LIVEPATCH */
 
+#ifdef CONFIG_STACKLEAK_METRICS
+static int proc_stack_depth(struct seq_file *m, struct pid_namespace *ns,
+			    struct pid *pid, struct task_struct *task)
+{
+	unsigned long prev_depth = THREAD_SIZE -
+			(task->prev_lowest_stack & (THREAD_SIZE - 1));
+	unsigned long depth = THREAD_SIZE -
+			(task->lowest_stack & (THREAD_SIZE - 1));
+
+	seq_printf(m, "previous stack depth: %lu\nstack depth: %lu\n",
+		   prev_depth, depth);
+	return 0;
+}
+#endif /* CONFIG_STACKLEAK_METRICS */
+
 /*
  * Thread groups
  */
@@ -2992,6 +3007,9 @@ static const struct pid_entry tgid_base_stuff[] = {
 #ifdef CONFIG_LIVEPATCH
 	ONE("patch_state",  S_IRUSR, proc_pid_patch_state),
 #endif
+#ifdef CONFIG_STACKLEAK_METRICS
+	ONE("stack_depth", S_IRUGO, proc_stack_depth),
+#endif
 };
 
 static int proc_tgid_base_readdir(struct file *file, struct dir_context *ctx)
diff --git a/include/linux/sched.h b/include/linux/sched.h
index c1a23acd24e7..ae9d10e14b82 100644
--- a/include/linux/sched.h
+++ b/include/linux/sched.h
@@ -1194,6 +1194,7 @@ struct task_struct {
 
 #ifdef CONFIG_GCC_PLUGIN_STACKLEAK
 	unsigned long			lowest_stack;
+	unsigned long			prev_lowest_stack;
 #endif
 
 	/*
diff --git a/include/linux/stackleak.h b/include/linux/stackleak.h
index 628c2b947b89..b911b973d328 100644
--- a/include/linux/stackleak.h
+++ b/include/linux/stackleak.h
@@ -18,6 +18,9 @@ static inline void stackleak_task_init(struct task_struct *t)
 {
 	t->lowest_stack = (unsigned long)end_of_stack(t) + sizeof(unsigned long);
+# ifdef CONFIG_STACKLEAK_METRICS
+	t->prev_lowest_stack = t->lowest_stack;
+# endif
 }
 #else /* !CONFIG_GCC_PLUGIN_STACKLEAK */
 static inline void stackleak_task_init(struct task_struct *t) { }
diff --git a/kernel/stackleak.c b/kernel/stackleak.c
index 628485db37ba..f66239572c89 100644
--- a/kernel/stackleak.c
+++ b/kernel/stackleak.c
@@ -41,6 +41,10 @@ asmlinkage void stackleak_erase(void)
 	if (kstack_ptr == boundary)
 		kstack_ptr += sizeof(unsigned long);
 
+#ifdef CONFIG_STACKLEAK_METRICS
+	current->prev_lowest_stack = kstack_ptr;
+#endif
+
 	/*
 	 * Now write the poison value to the kernel stack. Start from
 	 * 'kstack_ptr' and move up till the new 'boundary'. We assume that
diff --git a/scripts/gcc-plugins/Kconfig b/scripts/gcc-plugins/Kconfig
index c65fdd823591..b0a015ef5268 100644
--- a/scripts/gcc-plugins/Kconfig
+++ b/scripts/gcc-plugins/Kconfig
@@ -170,4 +170,16 @@ config STACKLEAK_TRACK_MIN_SIZE
 	  a stack frame size greater than or equal to this parameter.
 	  If unsure, leave the default value 100.
 
+config STACKLEAK_METRICS
+	bool "Show STACKLEAK metrics in the /proc file system"
+	depends on GCC_PLUGIN_STACKLEAK
+	depends on PROC_FS
+	help
+	  If this is set, STACKLEAK metrics for every task are available in
+	  the /proc file system. In particular, /proc/<pid>/stack_depth
+	  shows the maximum kernel stack consumption for the current and
+	  previous syscalls. Although this information is not precise, it
+	  can be useful for estimating the STACKLEAK performance impact for
+	  your workloads.
+
 endif

From 964c9dff0091893a9a74a88edf984c6da0b779f7 Mon Sep 17 00:00:00 2001
From: Alexander Popov
Date: Fri, 17 Aug 2018 01:17:03 +0300
Subject: stackleak: Allow runtime disabling of kernel stack erasing

Introduce the CONFIG_STACKLEAK_RUNTIME_DISABLE option, which provides the
'stack_erasing' sysctl. It can be used at runtime to control kernel stack
erasing for kernels built with CONFIG_GCC_PLUGIN_STACKLEAK.

Suggested-by: Ingo Molnar
Signed-off-by: Alexander Popov
Tested-by: Laura Abbott
Signed-off-by: Kees Cook
---
 Documentation/sysctl/kernel.txt | 18 ++++++++++++++++++
 include/linux/stackleak.h       |  6 ++++++
 kernel/stackleak.c              | 38 ++++++++++++++++++++++++++++++++++++++
 kernel/sysctl.c                 | 15 ++++++++++++++-
 scripts/gcc-plugins/Kconfig     |  8 ++++++++
 5 files changed, 84 insertions(+), 1 deletion(-)

(limited to 'include/linux')

diff --git a/Documentation/sysctl/kernel.txt b/Documentation/sysctl/kernel.txt
index 37a679501ddc..1b8775298cf7 100644
--- a/Documentation/sysctl/kernel.txt
+++ b/Documentation/sysctl/kernel.txt
@@ -89,6 +89,7 @@ show up in /proc/sys/kernel:
 - shmmni
 - softlockup_all_cpu_backtrace
 - soft_watchdog
+- stack_erasing
 - stop-a                      [ SPARC only ]
 - sysrq                       ==> Documentation/admin-guide/sysrq.rst
 - sysctl_writes_strict
@@ -987,6 +988,23 @@ detect a hard lockup condition.
 
 ==============================================================
 
+stack_erasing
+
+This parameter can be used to control kernel stack erasing at the end
+of syscalls for kernels built with CONFIG_GCC_PLUGIN_STACKLEAK.
+
+That erasing reduces the information which kernel stack leak bugs
+can reveal and blocks some uninitialized stack variable attacks.
+The tradeoff is the performance impact: on a single CPU system kernel
+compilation sees a 1% slowdown, other systems and workloads may vary.
+
+  0: kernel stack erasing is disabled, STACKLEAK_METRICS are not updated.
+
+  1: kernel stack erasing is enabled (default), it is performed before
+     returning to the userspace at the end of syscalls.
+
+==============================================================
+
 tainted:
 
 Non-zero if the kernel has been tainted. Numeric values, which can be
diff --git a/include/linux/stackleak.h b/include/linux/stackleak.h
index b911b973d328..3d5c3271a9a8 100644
--- a/include/linux/stackleak.h
+++ b/include/linux/stackleak.h
@@ -22,6 +22,12 @@ static inline void stackleak_task_init(struct task_struct *t)
 	t->prev_lowest_stack = t->lowest_stack;
 # endif
 }
+
+#ifdef CONFIG_STACKLEAK_RUNTIME_DISABLE
+int stack_erasing_sysctl(struct ctl_table *table, int write,
+			void __user *buffer, size_t *lenp, loff_t *ppos);
+#endif
+
 #else /* !CONFIG_GCC_PLUGIN_STACKLEAK */
 static inline void stackleak_task_init(struct task_struct *t) { }
 #endif
diff --git a/kernel/stackleak.c b/kernel/stackleak.c
index f66239572c89..e42892926244 100644
--- a/kernel/stackleak.c
+++ b/kernel/stackleak.c
@@ -12,6 +12,41 @@
 
 #include <linux/stackleak.h>
 
+#ifdef CONFIG_STACKLEAK_RUNTIME_DISABLE
+#include <linux/jump_label.h>
+#include <linux/sysctl.h>
+
+static DEFINE_STATIC_KEY_FALSE(stack_erasing_bypass);
+
+int stack_erasing_sysctl(struct ctl_table *table, int write,
+			void __user *buffer, size_t *lenp, loff_t *ppos)
+{
+	int ret = 0;
+	int state = !static_branch_unlikely(&stack_erasing_bypass);
+	int prev_state = state;
+
+	table->data = &state;
+	table->maxlen = sizeof(int);
+	ret = proc_dointvec_minmax(table, write, buffer, lenp, ppos);
+	state = !!state;
+	if (ret || !write || state == prev_state)
+		return ret;
+
+	if (state)
+		static_branch_disable(&stack_erasing_bypass);
+	else
+		static_branch_enable(&stack_erasing_bypass);
+
+	pr_warn("stackleak: kernel stack erasing is %s\n",
+					state ? "enabled" : "disabled");
+	return ret;
+}
+
+#define skip_erasing()	static_branch_unlikely(&stack_erasing_bypass)
+#else
+#define skip_erasing()	false
+#endif /* CONFIG_STACKLEAK_RUNTIME_DISABLE */
+
 asmlinkage void stackleak_erase(void)
 {
 	/* It would be nice not to have 'kstack_ptr' and 'boundary' on stack */
@@ -20,6 +55,9 @@ asmlinkage void stackleak_erase(void)
 	unsigned int poison_count = 0;
 	const unsigned int depth = STACKLEAK_SEARCH_DEPTH / sizeof(unsigned long);
 
+	if (skip_erasing())
+		return;
+
 	/* Check that 'lowest_stack' value is sane */
 	if (unlikely(kstack_ptr - boundary >= THREAD_SIZE))
 		kstack_ptr = boundary;
diff --git a/kernel/sysctl.c b/kernel/sysctl.c
index cc02050fd0c4..3ae223f7b5df 100644
--- a/kernel/sysctl.c
+++ b/kernel/sysctl.c
@@ -91,7 +91,9 @@
 #ifdef CONFIG_CHR_DEV_SG
 #include <scsi/sg.h>
 #endif
-
+#ifdef CONFIG_STACKLEAK_RUNTIME_DISABLE
+#include <linux/stackleak.h>
+#endif
 #ifdef CONFIG_LOCKUP_DETECTOR
 #include <linux/nmi.h>
 #endif
@@ -1232,6 +1234,17 @@ static struct ctl_table kern_table[] = {
 		.extra1		= &zero,
 		.extra2		= &one,
 	},
+#endif
+#ifdef CONFIG_STACKLEAK_RUNTIME_DISABLE
+	{
+		.procname	= "stack_erasing",
+		.data		= NULL,
+		.maxlen		= sizeof(int),
+		.mode		= 0600,
+		.proc_handler	= stack_erasing_sysctl,
+		.extra1		= &zero,
+		.extra2		= &one,
+	},
 #endif
 	{ }
 };
diff --git a/scripts/gcc-plugins/Kconfig b/scripts/gcc-plugins/Kconfig
index b0a015ef5268..0d5c799688f0 100644
--- a/scripts/gcc-plugins/Kconfig
+++ b/scripts/gcc-plugins/Kconfig
@@ -182,4 +182,12 @@ config STACKLEAK_METRICS
 	  can be useful for estimating the STACKLEAK performance impact for
 	  your workloads.
 
+config STACKLEAK_RUNTIME_DISABLE
+	bool "Allow runtime disabling of kernel stack erasing"
+	depends on GCC_PLUGIN_STACKLEAK
+	help
+	  This option provides 'stack_erasing' sysctl, which can be used in
+	  runtime to control kernel stack erasing for kernels built with
+	  CONFIG_GCC_PLUGIN_STACKLEAK.
+
 endif

From 29efbc6aea9d9bd9aa9870a9afc1882046303cf9 Mon Sep 17 00:00:00 2001
From: Miguel Ojeda
Date: Thu, 30 Aug 2018 19:07:27 +0200
Subject: Compiler Attributes: remove unused attributes

__optimize and __deprecate_for_modules are unused in the whole kernel
tree. Simply drop them.

Tested-by: Sedat Dilek # on top of v4.19-rc5, clang 7
Reviewed-by: Nick Desaulniers
Reviewed-by: Luc Van Oostenryck
Signed-off-by: Miguel Ojeda
---
 include/linux/compiler-gcc.h   | 2 --
 include/linux/compiler.h       | 4 ----
 include/linux/compiler_types.h | 1 -
 3 files changed, 7 deletions(-)

(limited to 'include/linux')

diff --git a/include/linux/compiler-gcc.h b/include/linux/compiler-gcc.h
index 4d36b27214fd..1302b425e625 100644
--- a/include/linux/compiler-gcc.h
+++ b/include/linux/compiler-gcc.h
@@ -81,8 +81,6 @@
 
 #define __UNIQUE_ID(prefix) __PASTE(__PASTE(__UNIQUE_ID_, prefix), __COUNTER__)
 
-#define __optimize(level) __attribute__((__optimize__(level)))
-
 #define __compiletime_object_size(obj) __builtin_object_size(obj, 0)
 
 #ifndef __CHECKER__
diff --git a/include/linux/compiler.h b/include/linux/compiler.h
index 681d866efb1e..7c0157d50964 100644
--- a/include/linux/compiler.h
+++ b/include/linux/compiler.h
@@ -301,10 +301,6 @@ static inline void *offset_to_ptr(const int *off)
 
 #endif /* __ASSEMBLY__ */
 
-#ifndef __optimize
-# define __optimize(level)
-#endif
-
 /* Compile time object size, -1 for unknown */
 #ifndef __compiletime_object_size
 # define __compiletime_object_size(obj) -1
diff --git a/include/linux/compiler_types.h b/include/linux/compiler_types.h
index db192becfec4..a19562cb047c 100644
--- a/include/linux/compiler_types.h
+++ b/include/linux/compiler_types.h
@@ -108,7 +108,6 @@ struct ftrace_likely_data {
 
 /* Don't. Just don't. */
 #define __deprecated
-#define __deprecated_for_modules
 
 #endif /* __KERNEL__ */
 

From 5c67a52f3da0f0d22764f2daec417702695a8112 Mon Sep 17 00:00:00 2001
From: Miguel Ojeda
Date: Thu, 30 Aug 2018 19:13:37 +0200
Subject: Compiler Attributes: always use the extra-underscores syntax

The attribute syntax optionally allows attribute names to be surrounded
with "__" in order to avoid collisions with macros of the same name (see
https://gcc.gnu.org/onlinedocs/gcc/Attribute-Syntax.html).

This homogenizes all attributes to use the syntax with underscores.

While there are currently only a handful of cases of some TUs defining
macros like "error" which may collide with the attributes, this should
prevent future surprises.

This has been done only for "standard" attributes supported by the major
compilers. In other words, those of third-party tools (e.g. sparse,
plugins...) have not been changed for the moment.
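As an illustration, here is a minimal sketch of the collision this syntax
guards against (the macro and function names below are hypothetical, not
taken from the kernel tree):

	/* Some TU defines a function-like macro named like an attribute: */
	#define error(fmt) log_failure(fmt)

	/*
	 * With the plain spelling, the preprocessor expands the macro inside
	 * the attribute list, producing the bogus
	 * __attribute__((log_failure("do not call"))):
	 *
	 *	void f(void) __attribute__((error("do not call")));
	 *
	 * The underscored spelling names the same attribute but cannot match
	 * the macro, so it keeps working:
	 */
	void f(void) __attribute__((__error__("do not call")));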
Tested-by: Sedat Dilek # on top of v4.19-rc5, clang 7
Reviewed-by: Nick Desaulniers
Reviewed-by: Luc Van Oostenryck
Signed-off-by: Miguel Ojeda
---
 include/linux/compiler-clang.h |  2 +-
 include/linux/compiler-gcc.h   | 12 ++++++------
 include/linux/compiler-intel.h |  2 +-
 include/linux/compiler.h       |  8 ++++----
 include/linux/compiler_types.h | 42 +++++++++++++++++++++---------------------
 5 files changed, 33 insertions(+), 33 deletions(-)

(limited to 'include/linux')

diff --git a/include/linux/compiler-clang.h b/include/linux/compiler-clang.h
index b1ce500fe8b3..d11cad61ba5c 100644
--- a/include/linux/compiler-clang.h
+++ b/include/linux/compiler-clang.h
@@ -21,7 +21,7 @@
 #define __SANITIZE_ADDRESS__
 #endif
 
-#define __no_sanitize_address __attribute__((no_sanitize("address")))
+#define __no_sanitize_address __attribute__((__no_sanitize__("address")))
 
 /*
  * Not all versions of clang implement the the type-generic versions
diff --git a/include/linux/compiler-gcc.h b/include/linux/compiler-gcc.h
index 1302b425e625..7a1de7683df5 100644
--- a/include/linux/compiler-gcc.h
+++ b/include/linux/compiler-gcc.h
@@ -76,7 +76,7 @@
 #endif
 
 #ifdef RETPOLINE
-#define __noretpoline __attribute__((indirect_branch("keep")))
+#define __noretpoline __attribute__((__indirect_branch__("keep")))
 #endif
 
 #define __UNIQUE_ID(prefix) __PASTE(__PASTE(__UNIQUE_ID_, prefix), __COUNTER__)
@@ -84,8 +84,8 @@
 #define __compiletime_object_size(obj) __builtin_object_size(obj, 0)
 
 #ifndef __CHECKER__
-#define __compiletime_warning(message) __attribute__((warning(message)))
-#define __compiletime_error(message) __attribute__((error(message)))
+#define __compiletime_warning(message) __attribute__((__warning__(message)))
+#define __compiletime_error(message) __attribute__((__error__(message)))
 
 #ifdef LATENT_ENTROPY_PLUGIN
 #define __latent_entropy __attribute__((latent_entropy))
@@ -134,7 +134,7 @@
 * optimizer that something else uses this function or variable, thus preventing
 * this.
 */
-#define __visible __attribute__((externally_visible))
+#define __visible __attribute__((__externally_visible__))
 
 /* gcc version specific checks */
 
@@ -191,7 +191,7 @@
 * should not be applied to that function.
 * Conflicts with inlining: https://gcc.gnu.org/bugzilla/show_bug.cgi?id=67368
 */
-#define __no_sanitize_address __attribute__((no_sanitize_address))
+#define __no_sanitize_address __attribute__((__no_sanitize_address__))
 #endif
 
@@ -199,7 +199,7 @@
 /*
 * Mark structures as requiring designated initializers.
 * https://gcc.gnu.org/onlinedocs/gcc/Designated-Inits.html
 */
-#define __designated_init __attribute__((designated_init))
+#define __designated_init __attribute__((__designated_init__))
 
 #define COMPILER_HAS_GENERIC_BUILTIN_OVERFLOW 1
 #endif
diff --git a/include/linux/compiler-intel.h b/include/linux/compiler-intel.h
index 4c7f9befa9f6..fef8bb3e53ef 100644
--- a/include/linux/compiler-intel.h
+++ b/include/linux/compiler-intel.h
@@ -42,4 +42,4 @@
 * and may be redefined here because they should not be shared with other
 * compilers, like clang.
 */
-#define __visible __attribute__((externally_visible))
+#define __visible __attribute__((__externally_visible__))
diff --git a/include/linux/compiler.h b/include/linux/compiler.h
index 7c0157d50964..ec4a28bad2c6 100644
--- a/include/linux/compiler.h
+++ b/include/linux/compiler.h
@@ -24,7 +24,7 @@ void ftrace_likely_update(struct ftrace_likely_data *f, int val,
 	long ______r;						\
 	static struct ftrace_likely_data		\
 		__attribute__((__aligned__(4)))		\
-		__attribute__((section("_ftrace_annotated_branch"))) \
+		__attribute__((__section__("_ftrace_annotated_branch"))) \
 	______f = {					\
 		.data.func = __func__,			\
 		.data.file = __FILE__,			\
@@ -60,7 +60,7 @@ void ftrace_likely_update(struct ftrace_likely_data *f, int val,
 	int ______r;					\
 	static struct ftrace_branch_data		\
 		__attribute__((__aligned__(4)))		\
-		__attribute__((section("_ftrace_branch")))	\
+		__attribute__((__section__("_ftrace_branch")))	\
 	______f = {					\
 		.func = __func__,			\
 		.file = __FILE__,			\
@@ -146,7 +146,7 @@ void ftrace_likely_update(struct ftrace_likely_data *f, int val,
 	extern typeof(sym) sym;				\
 	static const unsigned long __kentry_##sym	\
 	__used						\
-	__attribute__((section("___kentry" "+" #sym ), used))	\
+	__attribute__((__section__("___kentry" "+" #sym ), used))	\
 	= (unsigned long)&sym;
 #endif
 
@@ -287,7 +287,7 @@ unsigned long read_word_at_a_time(const void *addr)
 * visible to the compiler.
 */
 #define __ADDRESSABLE(sym) \
-	static void * __attribute__((section(".discard.addressable"), used)) \
+	static void * __attribute__((__section__(".discard.addressable"), used)) \
	__PASTE(__addressable_##sym, __LINE__) = (void *)&sym;
diff --git a/include/linux/compiler_types.h b/include/linux/compiler_types.h
index a19562cb047c..8fbdd47dd3d0 100644
--- a/include/linux/compiler_types.h
+++ b/include/linux/compiler_types.h
@@ -195,26 +195,26 @@ struct ftrace_likely_data {
 * would be.
 * [...]
 */
-#define __pure			__attribute__((pure))
-#define __aligned(x)		__attribute__((aligned(x)))
-#define __aligned_largest	__attribute__((aligned))
-#define __printf(a, b)		__attribute__((format(printf, a, b)))
-#define __scanf(a, b)		__attribute__((format(scanf, a, b)))
-#define __maybe_unused		__attribute__((unused))
-#define __always_unused		__attribute__((unused))
-#define __mode(x)		__attribute__((mode(x)))
+#define __pure			__attribute__((__pure__))
+#define __aligned(x)		__attribute__((__aligned__(x)))
+#define __aligned_largest	__attribute__((__aligned__))
+#define __printf(a, b)		__attribute__((__format__(printf, a, b)))
+#define __scanf(a, b)		__attribute__((__format__(scanf, a, b)))
+#define __maybe_unused		__attribute__((__unused__))
+#define __always_unused		__attribute__((__unused__))
+#define __mode(x)		__attribute__((__mode__(x)))
 #define __malloc		__attribute__((__malloc__))
 #define __used			__attribute__((__used__))
-#define __noreturn		__attribute__((noreturn))
-#define __packed		__attribute__((packed))
-#define __weak			__attribute__((weak))
-#define __alias(symbol)		__attribute__((alias(#symbol)))
-#define __cold			__attribute__((cold))
+#define __noreturn		__attribute__((__noreturn__))
+#define __packed		__attribute__((__packed__))
+#define __weak			__attribute__((__weak__))
+#define __alias(symbol)		__attribute__((__alias__(#symbol)))
+#define __cold			__attribute__((__cold__))
 #define __section(S)		__attribute__((__section__(#S)))
 
 #ifdef CONFIG_ENABLE_MUST_CHECK
-#define __must_check __attribute__((warn_unused_result))
+#define __must_check __attribute__((__warn_unused_result__))
 #else
 #define __must_check
 #endif
@@ -222,7 +222,7 @@ struct ftrace_likely_data {
 #if defined(CC_USING_HOTPATCH) && !defined(__CHECKER__)
 #define notrace __attribute__((hotpatch(0, 0)))
 #else
-#define notrace __attribute__((no_instrument_function))
+#define notrace __attribute__((__no_instrument_function__))
 #endif
 
 /*
@@ -231,7 +231,7 @@ struct ftrace_likely_data {
 * stack and frame pointer being set up and there is no chance to
 * restore the lr register to the value before mcount was called.
 */
-#define __naked __attribute__((naked)) notrace
+#define __naked __attribute__((__naked__)) notrace
 
 #define __compiler_offsetof(a, b) __builtin_offsetof(a, b)
 
@@ -242,7 +242,7 @@ struct ftrace_likely_data {
 * defined so the gnu89 semantics are the default.
 */
 #ifdef __GNUC_STDC_INLINE__
-# define __gnu_inline __attribute__((gnu_inline))
+# define __gnu_inline __attribute__((__gnu_inline__))
 #else
 # define __gnu_inline
 #endif
@@ -262,17 +262,17 @@ struct ftrace_likely_data {
 #if !defined(CONFIG_ARCH_SUPPORTS_OPTIMIZED_INLINING) || \
	!defined(CONFIG_OPTIMIZE_INLINING)
 #define inline \
-	inline __attribute__((always_inline, unused)) notrace __gnu_inline
+	inline __attribute__((__always_inline__, __unused__)) notrace __gnu_inline
 #else
-#define inline inline	__attribute__((unused)) notrace __gnu_inline
+#define inline inline	__attribute__((__unused__)) notrace __gnu_inline
 #endif
 
 #define __inline__ inline
 #define __inline inline
-#define noinline	__attribute__((noinline))
+#define noinline	__attribute__((__noinline__))
 
 #ifndef __always_inline
-#define __always_inline inline __attribute__((always_inline))
+#define __always_inline inline __attribute__((__always_inline__))
 #endif
 
 /*

From c2c640aa04cc4e6caf0ff17ff18b3784e0c99566 Mon Sep 17 00:00:00 2001
From: Miguel Ojeda
Date: Thu, 30 Aug 2018 19:45:01 +0200
Subject: Compiler Attributes: remove unneeded tests

Attributes const and always_inline have tests around them which are
unneeded, since they are supported by gcc >= 4.6, clang >= 3 and
icc >= 13.

  https://godbolt.org/z/DFPq37

In the case of gnu_inline, we do not need to test for
__GNUC_STDC_INLINE__ because, regardless of the current inlining
behavior, we can simply always force the old GCC inlining behavior
by using the attribute in all cases.

Tested-by: Sedat Dilek # on top of v4.19-rc5, clang 7
Reviewed-by: Nick Desaulniers
Reviewed-by: Luc Van Oostenryck
Signed-off-by: Miguel Ojeda
---
 include/linux/compiler_types.h | 23 +++--------------------
 1 file changed, 3 insertions(+), 20 deletions(-)

(limited to 'include/linux')

diff --git a/include/linux/compiler_types.h b/include/linux/compiler_types.h
index 8fbdd47dd3d0..5ff9cda893f4 100644
--- a/include/linux/compiler_types.h
+++ b/include/linux/compiler_types.h
@@ -158,10 +158,6 @@ struct ftrace_likely_data {
	(sizeof(t) == sizeof(char) || sizeof(t) == sizeof(short) || \
	 sizeof(t) == sizeof(int) || sizeof(t) == sizeof(long))
 
-#ifndef __attribute_const__
-#define __attribute_const__	__attribute__((__const__))
-#endif
-
 #ifndef __noclone
 #define __noclone
 #endif
@@ -196,6 +192,7 @@ struct ftrace_likely_data {
 * [...]
 */
 #define __pure			__attribute__((__pure__))
+#define __attribute_const__	__attribute__((__const__))
 #define __aligned(x)		__attribute__((__aligned__(x)))
 #define __aligned_largest	__attribute__((__aligned__))
 #define __printf(a, b)		__attribute__((__format__(printf, a, b)))
@@ -211,6 +208,8 @@ struct ftrace_likely_data {
 #define __alias(symbol)		__attribute__((__alias__(#symbol)))
 #define __cold			__attribute__((__cold__))
 #define __section(S)		__attribute__((__section__(#S)))
+#define __always_inline		inline __attribute__((__always_inline__))
+#define __gnu_inline		__attribute__((__gnu_inline__))
 
 #ifdef CONFIG_ENABLE_MUST_CHECK
@@ -235,18 +234,6 @@ struct ftrace_likely_data {
 
 #define __compiler_offsetof(a, b) __builtin_offsetof(a, b)
 
-/*
- * Feature detection for gnu_inline (gnu89 extern inline semantics). Either
- * __GNUC_STDC_INLINE__ is defined (not using gnu89 extern inline semantics,
- * and we opt in to the gnu89 semantics), or __GNUC_STDC_INLINE__ is not
- * defined so the gnu89 semantics are the default.
- */
-#ifdef __GNUC_STDC_INLINE__
-# define __gnu_inline __attribute__((__gnu_inline__))
-#else
-# define __gnu_inline
-#endif
-
 /*
 * Force always-inline if the user requests it so via the .config.
 * GCC does not warn about unused static inline functions for
@@ -271,10 +258,6 @@ struct ftrace_likely_data {
 #define __inline inline
 #define noinline	__attribute__((__noinline__))
 
-#ifndef __always_inline
-#define __always_inline inline __attribute__((__always_inline__))
-#endif
-
 /*
 * Rather then using noinline to prevent stack consumption, use
 * noinline_for_stack instead. For documentation reasons.

From ec0bbef66f867854691d5af18c2231d746958e0e Mon Sep 17 00:00:00 2001
From: Miguel Ojeda
Date: Thu, 30 Aug 2018 19:25:14 +0200
Subject: Compiler Attributes: homogenize __must_be_array

Different definitions of __must_be_array:

  * gcc: disabled for __CHECKER__

  * clang: same definition as gcc's, but without __CHECKER__

  * intel: the comment claims __builtin_types_compatible_p() is
    unsupported; but icc seems to support it since 13.0.1 (released
    in 2012). See https://godbolt.org/z/S0l6QQ

Therefore, we can remove all of them and have a single definition
in compiler.h

Tested-by: Sedat Dilek # on top of v4.19-rc5, clang 7
Reviewed-by: Nick Desaulniers
Reviewed-by: Luc Van Oostenryck
Signed-off-by: Miguel Ojeda
---
 include/linux/compiler-clang.h | 1 -
 include/linux/compiler-gcc.h   | 7 -------
 include/linux/compiler-intel.h | 3 ---
 include/linux/compiler.h       | 7 +++++++
 4 files changed, 7 insertions(+), 11 deletions(-)

(limited to 'include/linux')

diff --git a/include/linux/compiler-clang.h b/include/linux/compiler-clang.h
index d11cad61ba5c..fa9532f8d885 100644
--- a/include/linux/compiler-clang.h
+++ b/include/linux/compiler-clang.h
@@ -41,6 +41,5 @@
 * compilers, like ICC.
 */
 #define barrier() __asm__ __volatile__("" : : : "memory")
-#define __must_be_array(a) BUILD_BUG_ON_ZERO(__same_type((a), &(a)[0]))
 #define __assume_aligned(a, ...)	\
	__attribute__((__assume_aligned__(a, ## __VA_ARGS__)))
diff --git a/include/linux/compiler-gcc.h b/include/linux/compiler-gcc.h
index 7a1de7683df5..3b32bbfa5a49 100644
--- a/include/linux/compiler-gcc.h
+++ b/include/linux/compiler-gcc.h
@@ -68,13 +68,6 @@
 */
 #define uninitialized_var(x) x = x
 
-#ifdef __CHECKER__
-#define __must_be_array(a)	0
-#else
-/* &a[0] degrades to a pointer: a different type from an array */
-#define __must_be_array(a)	BUILD_BUG_ON_ZERO(__same_type((a), &(a)[0]))
-#endif
-
 #ifdef RETPOLINE
 #define __noretpoline __attribute__((__indirect_branch__("keep")))
 #endif
diff --git a/include/linux/compiler-intel.h b/include/linux/compiler-intel.h
index fef8bb3e53ef..6004b4588bd4 100644
--- a/include/linux/compiler-intel.h
+++ b/include/linux/compiler-intel.h
@@ -29,9 +29,6 @@
 */
 #define OPTIMIZER_HIDE_VAR(var) barrier()
 
-/* Intel ECC compiler doesn't support __builtin_types_compatible_p() */
-#define __must_be_array(a) 0
-
 #endif
 
 /* icc has this, but it's called _bswap16 */
diff --git a/include/linux/compiler.h b/include/linux/compiler.h
index ec4a28bad2c6..165b1d5683ed 100644
--- a/include/linux/compiler.h
+++ b/include/linux/compiler.h
@@ -357,4 +357,11 @@ static inline void *offset_to_ptr(const int *off)
	compiletime_assert(__native_word(t),			\
		"Need native word sized stores/loads for atomicity.")
 
+#ifdef __CHECKER__
+#define __must_be_array(a)	0
+#else
+/* &a[0] degrades to a pointer: a different type from an array */
+#define __must_be_array(a)	BUILD_BUG_ON_ZERO(__same_type((a), &(a)[0]))
+#endif
+
 #endif /* __LINUX_COMPILER_H */
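For reference, the mechanism behind __must_be_array can be exercised on
its own; this sketch copies the macro bodies from the patch above, while
the array name and the ARRAY_SIZE-style wrapper are illustrative:

	#define __same_type(a, b) __builtin_types_compatible_p(typeof(a), typeof(b))
	#define BUILD_BUG_ON_ZERO(e) (sizeof(struct { int:(-!!(e)); }))
	/* &a[0] degrades to a pointer: a different type from an array */
	#define __must_be_array(a) BUILD_BUG_ON_ZERO(__same_type((a), &(a)[0]))
	#define ARRAY_SIZE(arr) (sizeof(arr) / sizeof((arr)[0]) + __must_be_array(arr))

	static int nums[4];
	static unsigned long count = ARRAY_SIZE(nums);	/* OK: evaluates to 4 */

	/*
	 * With a pointer instead of an array, __same_type(p, &p[0]) is true,
	 * the unnamed bitfield gets a negative width and compilation fails:
	 *
	 *	int *p; ... ARRAY_SIZE(p);	// build error
	 */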
From 989bd5000f36052df604888ed12bb6ef390786b7 Mon Sep 17 00:00:00 2001
From: Miguel Ojeda
Date: Fri, 31 Aug 2018 18:00:16 +0200
Subject: Compiler Attributes: remove unneeded sparse (__CHECKER__) tests

Sparse knows about a few more attributes now, so we can remove the
__CHECKER__ conditions from them (which, in turn, allows us to move
some of them later on to compiler_attributes.h).

  * assume_aligned: since sparse's commit ffc860b ("sparse: ignore
    __assume_aligned__ attribute"), included in 0.5.1

  * error: since sparse's commit 0a04210 ("sparse: Add 'error' to
    ignored attributes"), included in 0.5.0

  * hotpatch: since sparse's commit 6043210 ("sparse/parse.c: ignore
    hotpatch attribute"), included in 0.5.1

  * warning: since sparse's commit 977365d ("Avoid "attribute
    'warning': unknown attribute" warning"), included in 0.4.2

On top of that, __must_be_array does not need it either because:

  * Even ancient versions of sparse do not have a problem

  * BUILD_BUG_ON_ZERO() is currently disabled for __CHECKER__

Tested-by: Sedat Dilek # on top of v4.19-rc5, clang 7
Reviewed-by: Nick Desaulniers
Reviewed-by: Luc Van Oostenryck
Signed-off-by: Miguel Ojeda
---
 include/linux/compiler-gcc.h   | 6 ++----
 include/linux/compiler.h       | 4 ----
 include/linux/compiler_types.h | 2 +-
 3 files changed, 3 insertions(+), 9 deletions(-)

(limited to 'include/linux')

diff --git a/include/linux/compiler-gcc.h b/include/linux/compiler-gcc.h
index 3b32bbfa5a49..1ca6a51cfaa9 100644
--- a/include/linux/compiler-gcc.h
+++ b/include/linux/compiler-gcc.h
@@ -76,14 +76,12 @@
 
 #define __compiletime_object_size(obj) __builtin_object_size(obj, 0)
 
-#ifndef __CHECKER__
 #define __compiletime_warning(message) __attribute__((__warning__(message)))
 #define __compiletime_error(message) __attribute__((__error__(message)))
 
-#ifdef LATENT_ENTROPY_PLUGIN
+#if defined(LATENT_ENTROPY_PLUGIN) && !defined(__CHECKER__)
 #define __latent_entropy __attribute__((latent_entropy))
 #endif
-#endif /* __CHECKER__ */
 
 /*
 * calling noreturn functions, __builtin_unreachable() and __builtin_trap()
@@ -131,7 +129,7 @@
 
 /* gcc version specific checks */
 
-#if GCC_VERSION >= 40900 && !defined(__CHECKER__)
+#if GCC_VERSION >= 40900
 /*
 * __assume_aligned(n, k): Tell the optimizer that the returned
 * pointer can be assumed to be k modulo n. The second argument is
diff --git a/include/linux/compiler.h b/include/linux/compiler.h
index 165b1d5683ed..4030a2940d6b 100644
--- a/include/linux/compiler.h
+++ b/include/linux/compiler.h
@@ -357,11 +357,7 @@ static inline void *offset_to_ptr(const int *off)
	compiletime_assert(__native_word(t),			\
		"Need native word sized stores/loads for atomicity.")
 
-#ifdef __CHECKER__
-#define __must_be_array(a)	0
-#else
 /* &a[0] degrades to a pointer: a different type from an array */
 #define __must_be_array(a)	BUILD_BUG_ON_ZERO(__same_type((a), &(a)[0]))
-#endif
 
 #endif /* __LINUX_COMPILER_H */
diff --git a/include/linux/compiler_types.h b/include/linux/compiler_types.h
index 5ff9cda893f4..a3eceb3ad1b3 100644
--- a/include/linux/compiler_types.h
+++ b/include/linux/compiler_types.h
@@ -218,7 +218,7 @@ struct ftrace_likely_data {
 #define __must_check
 #endif
 
-#if defined(CC_USING_HOTPATCH) && !defined(__CHECKER__)
+#if defined(CC_USING_HOTPATCH)
 #define notrace __attribute__((hotpatch(0, 0)))
 #else
 #define notrace __attribute__((__no_instrument_function__))

From 66dbeef915f275dad6c8656b31667ef9640f5639 Mon Sep 17 00:00:00 2001
From: Miguel Ojeda
Date: Mon, 3 Sep 2018 18:34:18 +0200
Subject: Compiler Attributes: add missing SPDX ID in compiler_types.h

Tested-by: Sedat Dilek # on top of v4.19-rc5, clang 7
Reviewed-by: Nick Desaulniers
Reviewed-by: Luc Van Oostenryck
Signed-off-by: Miguel Ojeda
---
 include/linux/compiler_types.h | 1 +
 1 file changed, 1 insertion(+)

(limited to 'include/linux')

diff --git a/include/linux/compiler_types.h b/include/linux/compiler_types.h
index a3eceb3ad1b3..ed7c0e4a180e 100644
--- a/include/linux/compiler_types.h
+++ b/include/linux/compiler_types.h
@@ -1,3 +1,4 @@
+/* SPDX-License-Identifier: GPL-2.0 */
 #ifndef __LINUX_COMPILER_TYPES_H
 #define __LINUX_COMPILER_TYPES_H
 

From a3f8a30f3f0079c7edfc72e329eee8594fb3e3cb Mon Sep 17 00:00:00 2001
From: Miguel Ojeda
Date: Thu, 30 Aug 2018 20:36:59 +0200
Subject: Compiler Attributes: use feature checks instead of version checks

Instead of using version checks per-compiler to define (or not) each
attribute, use __has_attribute to test for them, following the cleanup
started with commit 815f0ddb346c ("include/linux/compiler*.h: make
compiler-*.h mutually exclusive"), which is supported on gcc >= 5,
clang >= 2.9 and icc >= 17. In the meantime, to support
4.6 <= gcc < 5, we implement __has_attribute by hand.

All the attributes that can be unconditionally defined and directly
map to compiler attribute(s) (even if optional) have been moved to
a new file, include/linux/compiler_attributes.h.

In an effort to make the file as regular as possible, comments stating
the purpose of attributes have been removed. Instead, links to the
compiler docs have been added (i.e. to gcc and, if available, to clang
as well). In addition, the attributes have been sorted.

Finally, if an attribute is optional (i.e. if it is guarded by
__has_attribute), the reason has been stated for future reference.
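The same feature-check pattern also works for out-of-tree code. A minimal
sketch (the macro names are hypothetical, and it uses a simpler
all-or-nothing fallback than the per-attribute table the patch below adds):

	#ifndef __has_attribute
	# define __has_attribute(x) 0	/* pessimistic fallback for old compilers */
	#endif

	#if __has_attribute(__cold__)
	# define MY_COLD __attribute__((__cold__))
	#else
	# define MY_COLD
	#endif

	MY_COLD void report_fatal_error(const char *msg);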
Tested-by: Sedat Dilek # on top of v4.19-rc5, clang 7 Reviewed-by: Nick Desaulniers Reviewed-by: Luc Van Oostenryck Signed-off-by: Miguel Ojeda --- include/linux/compiler-clang.h | 4 - include/linux/compiler-gcc.h | 51 -------- include/linux/compiler-intel.h | 6 - include/linux/compiler_attributes.h | 244 ++++++++++++++++++++++++++++++++++++ include/linux/compiler_types.h | 74 ++--------- 5 files changed, 254 insertions(+), 125 deletions(-) create mode 100644 include/linux/compiler_attributes.h (limited to 'include/linux') diff --git a/include/linux/compiler-clang.h b/include/linux/compiler-clang.h index fa9532f8d885..3e7dafb3ea80 100644 --- a/include/linux/compiler-clang.h +++ b/include/linux/compiler-clang.h @@ -21,8 +21,6 @@ #define __SANITIZE_ADDRESS__ #endif -#define __no_sanitize_address __attribute__((__no_sanitize__("address"))) - /* * Not all versions of clang implement the the type-generic versions * of the builtin overflow checkers. Fortunately, clang implements @@ -41,5 +39,3 @@ * compilers, like ICC. */ #define barrier() __asm__ __volatile__("" : : : "memory") -#define __assume_aligned(a, ...) \ - __attribute__((__assume_aligned__(a, ## __VA_ARGS__))) diff --git a/include/linux/compiler-gcc.h b/include/linux/compiler-gcc.h index 1ca6a51cfaa9..cfac027e1625 100644 --- a/include/linux/compiler-gcc.h +++ b/include/linux/compiler-gcc.h @@ -108,9 +108,6 @@ __builtin_unreachable(); \ } while (0) -/* Mark a function definition as prohibited from being cloned. */ -#define __noclone __attribute__((__noclone__, __optimize__("no-tracer"))) - #if defined(RANDSTRUCT_PLUGIN) && !defined(__CHECKER__) #define __randomize_layout __attribute__((randomize_layout)) #define __no_randomize_layout __attribute__((no_randomize_layout)) @@ -119,32 +116,6 @@ #define randomized_struct_fields_end } __randomize_layout; #endif -/* - * When used with Link Time Optimization, gcc can optimize away C functions or - * variables which are referenced only from assembly code. __visible tells the - * optimizer that something else uses this function or variable, thus preventing - * this. - */ -#define __visible __attribute__((__externally_visible__)) - -/* gcc version specific checks */ - -#if GCC_VERSION >= 40900 -/* - * __assume_aligned(n, k): Tell the optimizer that the returned - * pointer can be assumed to be k modulo n. The second argument is - * optional (default 0), so we use a variadic macro to make the - * shorthand. - * - * Beware: Do not apply this to functions which may return - * ERR_PTRs. Also, it is probably unwise to apply it to functions - * returning extra information in the low bits (but in that case the - * compiler should see some alignment anyway, when the return value is - * massaged by 'flags = ptr & 3; ptr &= ~3;'). - */ -#define __assume_aligned(a, ...) __attribute__((__assume_aligned__(a, ## __VA_ARGS__))) -#endif - /* * GCC 'asm goto' miscompiles certain code sequences: * @@ -176,32 +147,10 @@ #define KASAN_ABI_VERSION 3 #endif -#if GCC_VERSION >= 40902 -/* - * Tell the compiler that address safety instrumentation (KASAN) - * should not be applied to that function. - * Conflicts with inlining: https://gcc.gnu.org/bugzilla/show_bug.cgi?id=67368 - */ -#define __no_sanitize_address __attribute__((__no_sanitize_address__)) -#endif - #if GCC_VERSION >= 50100 -/* - * Mark structures as requiring designated initializers. 
- * https://gcc.gnu.org/onlinedocs/gcc/Designated-Inits.html - */ -#define __designated_init __attribute__((__designated_init__)) #define COMPILER_HAS_GENERIC_BUILTIN_OVERFLOW 1 #endif -#if !defined(__noclone) -#define __noclone /* not needed */ -#endif - -#if !defined(__no_sanitize_address) -#define __no_sanitize_address -#endif - /* * Turn individual warnings and errors on and off locally, depending * on version. diff --git a/include/linux/compiler-intel.h b/include/linux/compiler-intel.h index 6004b4588bd4..517bd14e1222 100644 --- a/include/linux/compiler-intel.h +++ b/include/linux/compiler-intel.h @@ -34,9 +34,3 @@ /* icc has this, but it's called _bswap16 */ #define __HAVE_BUILTIN_BSWAP16__ #define __builtin_bswap16 _bswap16 - -/* The following are for compatibility with GCC, from compiler-gcc.h, - * and may be redefined here because they should not be shared with other - * compilers, like clang. - */ -#define __visible __attribute__((__externally_visible__)) diff --git a/include/linux/compiler_attributes.h b/include/linux/compiler_attributes.h new file mode 100644 index 000000000000..f0f9fc398440 --- /dev/null +++ b/include/linux/compiler_attributes.h @@ -0,0 +1,244 @@ +/* SPDX-License-Identifier: GPL-2.0 */ +#ifndef __LINUX_COMPILER_ATTRIBUTES_H +#define __LINUX_COMPILER_ATTRIBUTES_H + +/* + * The attributes in this file are unconditionally defined and they directly + * map to compiler attribute(s) -- except those that are optional. + * + * Any other "attributes" (i.e. those that depend on a configuration option, + * on a compiler, on an architecture, on plugins, on other attributes...) + * should be defined elsewhere (e.g. compiler_types.h or compiler-*.h). + * + * This file is meant to be sorted (by actual attribute name, + * not by #define identifier). Use the __attribute__((__name__)) syntax + * (i.e. with underscores) to avoid future collisions with other macros. + * If an attribute is optional, state the reason in the comment. + */ + +/* + * To check for optional attributes, we use __has_attribute, which is supported + * on gcc >= 5, clang >= 2.9 and icc >= 17. In the meantime, to support + * 4.6 <= gcc < 5, we implement __has_attribute by hand. + * + * sparse does not support __has_attribute (yet) and defines __GNUC_MINOR__ + * depending on the compiler used to build it; however, these attributes have + * no semantic effects for sparse, so it does not matter. Also note that, + * in order to avoid sparse's warnings, even the unsupported ones must be + * defined to 0. 
+ */ +#ifndef __has_attribute +# define __has_attribute(x) __GCC4_has_attribute_##x +# define __GCC4_has_attribute___assume_aligned__ (__GNUC_MINOR__ >= 9) +# define __GCC4_has_attribute___designated_init__ 0 +# define __GCC4_has_attribute___externally_visible__ 1 +# define __GCC4_has_attribute___noclone__ 1 +# define __GCC4_has_attribute___optimize__ 1 +# define __GCC4_has_attribute___no_sanitize_address__ (__GNUC_MINOR__ >= 8) +#endif + +/* + * gcc: https://gcc.gnu.org/onlinedocs/gcc/Common-Function-Attributes.html#index-alias-function-attribute + */ +#define __alias(symbol) __attribute__((__alias__(#symbol))) + +/* + * gcc: https://gcc.gnu.org/onlinedocs/gcc/Common-Function-Attributes.html#index-aligned-function-attribute + * gcc: https://gcc.gnu.org/onlinedocs/gcc/Common-Type-Attributes.html#index-aligned-type-attribute + * gcc: https://gcc.gnu.org/onlinedocs/gcc/Common-Variable-Attributes.html#index-aligned-variable-attribute + */ +#define __aligned(x) __attribute__((__aligned__(x))) +#define __aligned_largest __attribute__((__aligned__)) + +/* + * Note: users of __always_inline currently do not write "inline" themselves, + * which seems to be required by gcc to apply the attribute according + * to its docs (and also "warning: always_inline function might not be + * inlinable [-Wattributes]" is emitted). + * + * gcc: https://gcc.gnu.org/onlinedocs/gcc/Common-Function-Attributes.html#index-always_005finline-function-attribute + * clang: mentioned + */ +#define __always_inline inline __attribute__((__always_inline__)) + +/* + * The second argument is optional (default 0), so we use a variadic macro + * to make the shorthand. + * + * Beware: Do not apply this to functions which may return + * ERR_PTRs. Also, it is probably unwise to apply it to functions + * returning extra information in the low bits (but in that case the + * compiler should see some alignment anyway, when the return value is + * massaged by 'flags = ptr & 3; ptr &= ~3;'). + * + * Optional: only supported since gcc >= 4.9 + * Optional: not supported by icc + * + * gcc: https://gcc.gnu.org/onlinedocs/gcc/Common-Function-Attributes.html#index-assume_005faligned-function-attribute + * clang: https://clang.llvm.org/docs/AttributeReference.html#assume-aligned + */ +#if __has_attribute(__assume_aligned__) +# define __assume_aligned(a, ...) __attribute__((__assume_aligned__(a, ## __VA_ARGS__))) +#else +# define __assume_aligned(a, ...) +#endif + +/* + * gcc: https://gcc.gnu.org/onlinedocs/gcc/Common-Function-Attributes.html#index-cold-function-attribute + * gcc: https://gcc.gnu.org/onlinedocs/gcc/Label-Attributes.html#index-cold-label-attribute + */ +#define __cold __attribute__((__cold__)) + +/* + * Note the long name. + * + * gcc: https://gcc.gnu.org/onlinedocs/gcc/Common-Function-Attributes.html#index-const-function-attribute + */ +#define __attribute_const__ __attribute__((__const__)) + +/* + * Don't. Just don't. See commit 771c035372a0 ("deprecate the '__deprecated' + * attribute warnings entirely and for good") for more information. 
+ * + * gcc: https://gcc.gnu.org/onlinedocs/gcc/Common-Function-Attributes.html#index-deprecated-function-attribute + * gcc: https://gcc.gnu.org/onlinedocs/gcc/Common-Type-Attributes.html#index-deprecated-type-attribute + * gcc: https://gcc.gnu.org/onlinedocs/gcc/Common-Variable-Attributes.html#index-deprecated-variable-attribute + * gcc: https://gcc.gnu.org/onlinedocs/gcc/Enumerator-Attributes.html#index-deprecated-enumerator-attribute + * clang: https://clang.llvm.org/docs/AttributeReference.html#deprecated + */ +#define __deprecated + +/* + * Optional: only supported since gcc >= 5.1 + * Optional: not supported by clang + * Optional: not supported by icc + * + * gcc: https://gcc.gnu.org/onlinedocs/gcc/Common-Type-Attributes.html#index-designated_005finit-type-attribute + */ +#if __has_attribute(__designated_init__) +# define __designated_init __attribute__((__designated_init__)) +#else +# define __designated_init +#endif + +/* + * Optional: not supported by clang + * + * gcc: https://gcc.gnu.org/onlinedocs/gcc/Common-Function-Attributes.html#index-externally_005fvisible-function-attribute + */ +#if __has_attribute(__externally_visible__) +# define __visible __attribute__((__externally_visible__)) +#else +# define __visible +#endif + +/* + * gcc: https://gcc.gnu.org/onlinedocs/gcc/Common-Function-Attributes.html#index-format-function-attribute + * clang: https://clang.llvm.org/docs/AttributeReference.html#format + */ +#define __printf(a, b) __attribute__((__format__(printf, a, b))) +#define __scanf(a, b) __attribute__((__format__(scanf, a, b))) + +/* + * gcc: https://gcc.gnu.org/onlinedocs/gcc/Common-Function-Attributes.html#index-gnu_005finline-function-attribute + * clang: https://clang.llvm.org/docs/AttributeReference.html#gnu-inline + */ +#define __gnu_inline __attribute__((__gnu_inline__)) + +/* + * gcc: https://gcc.gnu.org/onlinedocs/gcc/Common-Function-Attributes.html#index-malloc-function-attribute + */ +#define __malloc __attribute__((__malloc__)) + +/* + * gcc: https://gcc.gnu.org/onlinedocs/gcc/Common-Type-Attributes.html#index-mode-type-attribute + * gcc: https://gcc.gnu.org/onlinedocs/gcc/Common-Variable-Attributes.html#index-mode-variable-attribute + */ +#define __mode(x) __attribute__((__mode__(x))) + +/* + * Optional: not supported by clang + * Note: icc does not recognize gcc's no-tracer + * + * gcc: https://gcc.gnu.org/onlinedocs/gcc/Common-Function-Attributes.html#index-noclone-function-attribute + * gcc: https://gcc.gnu.org/onlinedocs/gcc/Common-Function-Attributes.html#index-optimize-function-attribute + */ +#if __has_attribute(__noclone__) +# if __has_attribute(__optimize__) +# define __noclone __attribute__((__noclone__, __optimize__("no-tracer"))) +# else +# define __noclone __attribute__((__noclone__)) +# endif +#else +# define __noclone +#endif + +/* + * Note the missing underscores. 
+ * + * gcc: https://gcc.gnu.org/onlinedocs/gcc/Common-Function-Attributes.html#index-noinline-function-attribute + * clang: mentioned + */ +#define noinline __attribute__((__noinline__)) + +/* + * gcc: https://gcc.gnu.org/onlinedocs/gcc/Common-Function-Attributes.html#index-noreturn-function-attribute + * clang: https://clang.llvm.org/docs/AttributeReference.html#noreturn + * clang: https://clang.llvm.org/docs/AttributeReference.html#id1 + */ +#define __noreturn __attribute__((__noreturn__)) + +/* + * Optional: only supported since gcc >= 4.8 + * Optional: not supported by icc + * + * gcc: https://gcc.gnu.org/onlinedocs/gcc/Common-Function-Attributes.html#index-no_005fsanitize_005faddress-function-attribute + * clang: https://clang.llvm.org/docs/AttributeReference.html#no-sanitize-address-no-address-safety-analysis + */ +#if __has_attribute(__no_sanitize_address__) +# define __no_sanitize_address __attribute__((__no_sanitize_address__)) +#else +# define __no_sanitize_address +#endif + +/* + * gcc: https://gcc.gnu.org/onlinedocs/gcc/Common-Type-Attributes.html#index-packed-type-attribute + * clang: https://gcc.gnu.org/onlinedocs/gcc/Common-Variable-Attributes.html#index-packed-variable-attribute + */ +#define __packed __attribute__((__packed__)) + +/* + * gcc: https://gcc.gnu.org/onlinedocs/gcc/Common-Function-Attributes.html#index-pure-function-attribute + */ +#define __pure __attribute__((__pure__)) + +/* + * gcc: https://gcc.gnu.org/onlinedocs/gcc/Common-Function-Attributes.html#index-section-function-attribute + * gcc: https://gcc.gnu.org/onlinedocs/gcc/Common-Variable-Attributes.html#index-section-variable-attribute + * clang: https://clang.llvm.org/docs/AttributeReference.html#section-declspec-allocate + */ +#define __section(S) __attribute__((__section__(#S))) + +/* + * gcc: https://gcc.gnu.org/onlinedocs/gcc/Common-Function-Attributes.html#index-unused-function-attribute + * gcc: https://gcc.gnu.org/onlinedocs/gcc/Common-Type-Attributes.html#index-unused-type-attribute + * gcc: https://gcc.gnu.org/onlinedocs/gcc/Common-Variable-Attributes.html#index-unused-variable-attribute + * gcc: https://gcc.gnu.org/onlinedocs/gcc/Label-Attributes.html#index-unused-label-attribute + * clang: https://clang.llvm.org/docs/AttributeReference.html#maybe-unused-unused + */ +#define __always_unused __attribute__((__unused__)) +#define __maybe_unused __attribute__((__unused__)) + +/* + * gcc: https://gcc.gnu.org/onlinedocs/gcc/Common-Function-Attributes.html#index-used-function-attribute + * gcc: https://gcc.gnu.org/onlinedocs/gcc/Common-Variable-Attributes.html#index-used-variable-attribute + */ +#define __used __attribute__((__used__)) + +/* + * gcc: https://gcc.gnu.org/onlinedocs/gcc/Common-Function-Attributes.html#index-weak-function-attribute + * gcc: https://gcc.gnu.org/onlinedocs/gcc/Common-Variable-Attributes.html#index-weak-variable-attribute + */ +#define __weak __attribute__((__weak__)) + +#endif /* __LINUX_COMPILER_ATTRIBUTES_H */ diff --git a/include/linux/compiler_types.h b/include/linux/compiler_types.h index ed7c0e4a180e..3439d7d0249a 100644 --- a/include/linux/compiler_types.h +++ b/include/linux/compiler_types.h @@ -55,6 +55,9 @@ extern void __chk_io_ptr(const volatile void __iomem *); #ifdef __KERNEL__ +/* Attributes */ +#include + /* Compiler specific macros. */ #ifdef __clang__ #include @@ -79,12 +82,6 @@ extern void __chk_io_ptr(const volatile void __iomem *); #include #endif -/* - * Generic compiler-independent macros required for kernel - * build go below this comment. 
Actual compiler/compiler version - * specific implementations come from the above header files - */ - struct ftrace_branch_data { const char *func; const char *file; @@ -107,9 +104,6 @@ struct ftrace_likely_data { unsigned long constant; }; -/* Don't. Just don't. */ -#define __deprecated - #endif /* __KERNEL__ */ #endif /* __ASSEMBLY__ */ @@ -119,10 +113,6 @@ struct ftrace_likely_data { * compilers. We don't consider that to be an error, so set them to nothing. * For example, some of them are for compiler specific plugins. */ -#ifndef __designated_init -# define __designated_init -#endif - #ifndef __latent_entropy # define __latent_entropy #endif @@ -140,17 +130,6 @@ struct ftrace_likely_data { # define randomized_struct_fields_end #endif -#ifndef __visible -#define __visible -#endif - -/* - * Assume alignment of return value. - */ -#ifndef __assume_aligned -#define __assume_aligned(a, ...) -#endif - /* Are two types/vars the same type (ignoring qualifiers)? */ #define __same_type(a, b) __builtin_types_compatible_p(typeof(a), typeof(b)) @@ -159,10 +138,6 @@ struct ftrace_likely_data { (sizeof(t) == sizeof(char) || sizeof(t) == sizeof(short) || \ sizeof(t) == sizeof(int) || sizeof(t) == sizeof(long)) -#ifndef __noclone -#define __noclone -#endif - /* Helpers for emitting diagnostics in pragmas. */ #ifndef __diag #define __diag(string) @@ -182,37 +157,6 @@ struct ftrace_likely_data { #define __diag_error(compiler, version, option, comment) \ __diag_ ## compiler(version, error, option) -/* - * From the GCC manual: - * - * Many functions have no effects except the return value and their - * return value depends only on the parameters and/or global - * variables. Such a function can be subject to common subexpression - * elimination and loop optimization just as an arithmetic operator - * would be. - * [...] - */ -#define __pure __attribute__((__pure__)) -#define __attribute_const__ __attribute__((__const__)) -#define __aligned(x) __attribute__((__aligned__(x))) -#define __aligned_largest __attribute__((__aligned__)) -#define __printf(a, b) __attribute__((__format__(printf, a, b))) -#define __scanf(a, b) __attribute__((__format__(scanf, a, b))) -#define __maybe_unused __attribute__((__unused__)) -#define __always_unused __attribute__((__unused__)) -#define __mode(x) __attribute__((__mode__(x))) -#define __malloc __attribute__((__malloc__)) -#define __used __attribute__((__used__)) -#define __noreturn __attribute__((__noreturn__)) -#define __packed __attribute__((__packed__)) -#define __weak __attribute__((__weak__)) -#define __alias(symbol) __attribute__((__alias__(#symbol))) -#define __cold __attribute__((__cold__)) -#define __section(S) __attribute__((__section__(#S))) -#define __always_inline inline __attribute__((__always_inline__)) -#define __gnu_inline __attribute__((__gnu_inline__)) - - #ifdef CONFIG_ENABLE_MUST_CHECK #define __must_check __attribute__((__warn_unused_result__)) #else @@ -246,18 +190,20 @@ struct ftrace_likely_data { * semantics rather than c99. This prevents multiple symbol definition errors * of extern inline functions at link time. * A lot of inline functions can cause havoc with function tracing. + * Do not use __always_inline here, since currently it expands to inline again + * (which would break users of __always_inline). 
*/ #if !defined(CONFIG_ARCH_SUPPORTS_OPTIMIZED_INLINING) || \ !defined(CONFIG_OPTIMIZE_INLINING) -#define inline \ - inline __attribute__((__always_inline__, __unused__)) notrace __gnu_inline +#define inline inline __attribute__((__always_inline__)) __gnu_inline \ + __maybe_unused notrace #else -#define inline inline __attribute__((__unused__)) notrace __gnu_inline +#define inline inline __gnu_inline \ + __maybe_unused notrace #endif #define __inline__ inline -#define __inline inline -#define noinline __attribute__((__noinline__)) +#define __inline inline /* * Rather then using noinline to prevent stack consumption, use -- cgit v1.2.3-71-gd317 From 06e3727e02f9ee9cf571692cd5c74fc5a8a2af52 Mon Sep 17 00:00:00 2001 From: Miguel Ojeda Date: Mon, 3 Sep 2018 19:22:13 +0200 Subject: Compiler Attributes: KENTRY used twice the "used" attribute Tested-by: Sedat Dilek # on top of v4.19-rc5, clang 7 Reviewed-by: Nick Desaulniers Reviewed-by: Luc Van Oostenryck Signed-off-by: Miguel Ojeda --- include/linux/compiler.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'include/linux') diff --git a/include/linux/compiler.h b/include/linux/compiler.h index 4030a2940d6b..17ee9165ca51 100644 --- a/include/linux/compiler.h +++ b/include/linux/compiler.h @@ -146,7 +146,7 @@ void ftrace_likely_update(struct ftrace_likely_data *f, int val, extern typeof(sym) sym; \ static const unsigned long __kentry_##sym \ __used \ - __attribute__((__section__("___kentry" "+" #sym ), used)) \ + __attribute__((__section__("___kentry" "+" #sym ))) \ = (unsigned long)&sym; #endif -- cgit v1.2.3-71-gd317 From e04462fb82f8dd98288c0e7ab1eec79c92537d25 Mon Sep 17 00:00:00 2001 From: Miguel Ojeda Date: Mon, 3 Sep 2018 19:17:50 +0200 Subject: Compiler Attributes: remove uses of __attribute__ from compiler.h Suggested-by: Nick Desaulniers Tested-by: Sedat Dilek # on top of v4.19-rc5, clang 7 Reviewed-by: Nick Desaulniers Reviewed-by: Luc Van Oostenryck Signed-off-by: Miguel Ojeda --- include/linux/compiler.h | 12 ++++++------ 1 file changed, 6 insertions(+), 6 deletions(-) (limited to 'include/linux') diff --git a/include/linux/compiler.h b/include/linux/compiler.h index 17ee9165ca51..b5fb034fa6fa 100644 --- a/include/linux/compiler.h +++ b/include/linux/compiler.h @@ -23,8 +23,8 @@ void ftrace_likely_update(struct ftrace_likely_data *f, int val, #define __branch_check__(x, expect, is_constant) ({ \ long ______r; \ static struct ftrace_likely_data \ - __attribute__((__aligned__(4))) \ - __attribute__((__section__("_ftrace_annotated_branch"))) \ + __aligned(4) \ + __section("_ftrace_annotated_branch") \ ______f = { \ .data.func = __func__, \ .data.file = __FILE__, \ @@ -59,8 +59,8 @@ void ftrace_likely_update(struct ftrace_likely_data *f, int val, ({ \ int ______r; \ static struct ftrace_branch_data \ - __attribute__((__aligned__(4))) \ - __attribute__((__section__("_ftrace_branch"))) \ + __aligned(4) \ + __section("_ftrace_branch") \ ______f = { \ .func = __func__, \ .file = __FILE__, \ @@ -146,7 +146,7 @@ void ftrace_likely_update(struct ftrace_likely_data *f, int val, extern typeof(sym) sym; \ static const unsigned long __kentry_##sym \ __used \ - __attribute__((__section__("___kentry" "+" #sym ))) \ + __section("___kentry" "+" #sym ) \ = (unsigned long)&sym; #endif @@ -287,7 +287,7 @@ unsigned long read_word_at_a_time(const void *addr) * visible to the compiler. 
*/ #define __ADDRESSABLE(sym) \ - static void * __attribute__((__section__(".discard.addressable"), used)) \ + static void * __section(".discard.addressable") __used \ __PASTE(__addressable_##sym, __LINE__) = (void *)&sym; /** -- cgit v1.2.3-71-gd317 From 92676236917d8e2e0de1d8612686d3d04a6a245b Mon Sep 17 00:00:00 2001 From: Miguel Ojeda Date: Wed, 19 Sep 2018 01:06:24 +0200 Subject: Compiler Attributes: add support for __nonstring (gcc >= 8) From the GCC manual: nonstring The nonstring variable attribute specifies that an object or member declaration with type array of char, signed char, or unsigned char, or pointer to such a type is intended to store character arrays that do not necessarily contain a terminating NUL. This is useful in detecting uses of such arrays or pointers with functions that expect NUL-terminated strings, and to avoid warnings when such an array or pointer is used as an argument to a bounded string manipulation function such as strncpy. https://gcc.gnu.org/onlinedocs/gcc/Common-Variable-Attributes.html This attribute can be used for documentation purposes (i.e. replacing comments), but it is most helpful when the following warnings are enabled: -Wstringop-overflow Warn for calls to string manipulation functions such as memcpy and strcpy that are determined to overflow the destination buffer. [...] -Wstringop-truncation Warn for calls to bounded string manipulation functions such as strncat, strncpy, and stpncpy that may either truncate the copied string or leave the destination unchanged. [...] In situations where a character array is intended to store a sequence of bytes with no terminating NUL such an array may be annotated with attribute nonstring to avoid this warning. Such arrays, however, are not suitable arguments to functions that expect NUL-terminated strings. To help detect accidental misuses of such arrays GCC issues warnings unless it can prove that the use is safe. 
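To make the quoted semantics concrete, here is a minimal hedged sketch (the struct and all names are invented for illustration and are not part of this patch):

	#include <linux/string.h>

	/* A fixed-size on-disk tag that is deliberately not NUL-terminated;
	 * __nonstring is provided by the hunk added below. */
	struct record_hdr {
		char magic[4] __nonstring;
	};

	static void record_hdr_init(struct record_hdr *h)
	{
		/* Under -Wstringop-truncation, gcc >= 8 would warn that the
		 * terminating NUL of "KREC" is dropped; the __nonstring
		 * annotation on 'magic' documents that the truncation is
		 * intended, so the warning is suppressed. */
		strncpy(h->magic, "KREC", sizeof(h->magic));
	}

The quoted warning documentation: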
https://gcc.gnu.org/onlinedocs/gcc/Warning-Options.html Tested-by: Sedat Dilek # on top of v4.19-rc5, clang 7 Reviewed-by: Kees Cook Reviewed-by: Nick Desaulniers Reviewed-by: Luc Van Oostenryck Signed-off-by: Miguel Ojeda --- include/linux/compiler_attributes.h | 14 ++++++++++++++ 1 file changed, 14 insertions(+) (limited to 'include/linux') diff --git a/include/linux/compiler_attributes.h b/include/linux/compiler_attributes.h index f0f9fc398440..6b28c1b7310c 100644 --- a/include/linux/compiler_attributes.h +++ b/include/linux/compiler_attributes.h @@ -34,6 +34,7 @@ # define __GCC4_has_attribute___externally_visible__ 1 # define __GCC4_has_attribute___noclone__ 1 # define __GCC4_has_attribute___optimize__ 1 +# define __GCC4_has_attribute___nonstring__ 0 # define __GCC4_has_attribute___no_sanitize_address__ (__GNUC_MINOR__ >= 8) #endif @@ -181,6 +182,19 @@ */ #define noinline __attribute__((__noinline__)) +/* + * Optional: only supported since gcc >= 8 + * Optional: not supported by clang + * Optional: not supported by icc + * + * gcc: https://gcc.gnu.org/onlinedocs/gcc/Common-Variable-Attributes.html#index-nonstring-variable-attribute + */ +#if __has_attribute(__nonstring__) +# define __nonstring __attribute__((__nonstring__)) +#else +# define __nonstring +#endif + /* * gcc: https://gcc.gnu.org/onlinedocs/gcc/Common-Function-Attributes.html#index-noreturn-function-attribute * clang: https://clang.llvm.org/docs/AttributeReference.html#noreturn -- cgit v1.2.3-71-gd317 From fe0640eb30b7da261ae84d252ed9ed3c7e68dfd8 Mon Sep 17 00:00:00 2001 From: "ndesaulniers@google.com" Date: Mon, 15 Oct 2018 10:22:21 -0700 Subject: compiler.h: update definition of unreachable() Fixes the objtool warning seen with Clang: arch/x86/mm/fault.o: warning: objtool: no_context()+0x220: unreachable instruction Fixes commit 815f0ddb346c ("include/linux/compiler*.h: make compiler-*.h mutually exclusive") Josh noted that the fallback definition was meant to work around a pre-gcc-4.6 bug. GCC still needs to work around https://gcc.gnu.org/bugzilla/show_bug.cgi?id=82365, so compiler-gcc.h defines its own version of unreachable(). Clang and ICC can use this shared definition. Link: https://github.com/ClangBuiltLinux/linux/issues/204 Suggested-by: Andy Lutomirski Suggested-by: Josh Poimboeuf Tested-by: Nathan Chancellor Signed-off-by: Nick Desaulniers Signed-off-by: Miguel Ojeda --- include/linux/compiler.h | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) (limited to 'include/linux') diff --git a/include/linux/compiler.h b/include/linux/compiler.h index b5fb034fa6fa..2e0b6322588b 100644 --- a/include/linux/compiler.h +++ b/include/linux/compiler.h @@ -124,7 +124,10 @@ void ftrace_likely_update(struct ftrace_likely_data *f, int val, # define ASM_UNREACHABLE #endif #ifndef unreachable -# define unreachable() do { annotate_reachable(); do { } while (1); } while (0) +# define unreachable() do { \ + annotate_unreachable(); \ + __builtin_unreachable(); \ +} while (0) #endif /* -- cgit v1.2.3-71-gd317 From 1ff2fea5e30ca15752777441ecb64a169fe22e9e Mon Sep 17 00:00:00 2001 From: "ndesaulniers@google.com" Date: Mon, 15 Oct 2018 14:24:27 -0700 Subject: compiler-gcc: remove comment about gcc 4.5 from unreachable() Remove the comment about being unable to detect __builtin_unreachable. __builtin_unreachable was implemented in the GCC 4.5 timeframe. The kernel's minimum supported version of GCC is 4.6 since commit cafa0010cd51 ("Raise the minimum required gcc version to 4.6"). 
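Both of these changes concern the same usage pattern for unreachable(). As a hedged sketch of that pattern (a hypothetical function; the x86 "ud2" opcode merely stands in for any asm that transfers control and never returns):

	/* Assumes the definitions from <linux/compiler.h>. */
	static void __noreturn my_trap(void)
	{
		asm volatile("ud2");	/* traps; the compiler cannot tell */
		unreachable();		/* silences "'noreturn' function does
					 * return"; with the updated fallback it
					 * also emits __builtin_unreachable(),
					 * letting dead code past this point be
					 * discarded */
	}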
Commit cb984d101b30 ("compiler-gcc: integrate the various compiler-gcc[345].h files") shows that unreachable() had different guards based on GCC version. Suggested-by: Miguel Ojeda Signed-off-by: Nick Desaulniers Signed-off-by: Miguel Ojeda --- include/linux/compiler-gcc.h | 4 ---- 1 file changed, 4 deletions(-) (limited to 'include/linux') diff --git a/include/linux/compiler-gcc.h b/include/linux/compiler-gcc.h index cfac027e1625..2010493e1040 100644 --- a/include/linux/compiler-gcc.h +++ b/include/linux/compiler-gcc.h @@ -96,10 +96,6 @@ * Mark a position in code as unreachable. This can be used to * suppress control flow warnings after asm blocks that transfer * control elsewhere. - * - * Early snapshots of gcc 4.5 don't support this and we can't detect - * this in the preprocessor, but we can live with this because they're - * unreleased. Really, we need to have autoconf for the kernel. */ #define unreachable() \ do { \ -- cgit v1.2.3-71-gd317 From 00e23707442a75b404392cef1405ab4fd498de6b Mon Sep 17 00:00:00 2001 From: David Howells Date: Mon, 22 Oct 2018 13:07:28 +0100 Subject: iov_iter: Use accessor function Use accessor functions to access an iterator's type and direction. This allows for the possibility of using some other method of determining the type of iterator than if-chains with bitwise-AND conditions. Signed-off-by: David Howells --- block/bio.c | 2 +- fs/block_dev.c | 2 +- fs/ceph/file.c | 2 +- fs/cifs/file.c | 4 ++-- fs/cifs/misc.c | 2 +- fs/cifs/smbdirect.c | 17 ++++++++++++---- fs/direct-io.c | 2 +- fs/fuse/file.c | 2 +- fs/iomap.c | 2 +- include/linux/uio.h | 48 +++++++++++++++++++++++++++++-------------- lib/iov_iter.c | 56 +++++++++++++++++++++++++-------------------------- mm/filemap.c | 2 +- net/9p/trans_virtio.c | 2 +- net/tls/tls_sw.c | 4 ++-- 14 files changed, 87 insertions(+), 60 deletions(-) (limited to 'include/linux') diff --git a/block/bio.c b/block/bio.c index 0093bed81c0e..c55f36bbe12a 100644 --- a/block/bio.c +++ b/block/bio.c @@ -1255,7 +1255,7 @@ struct bio *bio_copy_user_iov(struct request_queue *q, /* * success */ - if (((iter->type & WRITE) && (!map_data || !map_data->null_mapped)) || + if ((iov_iter_rw(iter) == WRITE && (!map_data || !map_data->null_mapped)) || (map_data && map_data->from_user)) { ret = bio_copy_from_iter(bio, iter); if (ret) diff --git a/fs/block_dev.c b/fs/block_dev.c index 38b8ce05cbc7..a80b4f0ee7c4 100644 --- a/fs/block_dev.c +++ b/fs/block_dev.c @@ -349,7 +349,7 @@ __blkdev_direct_IO(struct kiocb *iocb, struct iov_iter *iter, int nr_pages) dio->size = 0; dio->multi_bio = false; - dio->should_dirty = is_read && (iter->type == ITER_IOVEC); + dio->should_dirty = is_read && iter_is_iovec(iter); blk_start_plug(&plug); for (;;) { diff --git a/fs/ceph/file.c b/fs/ceph/file.c index 92ab20433682..524ecc95b04d 100644 --- a/fs/ceph/file.c +++ b/fs/ceph/file.c @@ -658,7 +658,7 @@ static ssize_t ceph_sync_read(struct kiocb *iocb, struct iov_iter *to, if (ret < 0) return ret; - if (unlikely(to->type & ITER_PIPE)) { + if (unlikely(iov_iter_is_pipe(to))) { size_t page_off; ret = iov_iter_get_pages_alloc(to, &pages, len, &page_off); diff --git a/fs/cifs/file.c b/fs/cifs/file.c index 8d41ca7bfcf1..dcdbcb6f09f8 100644 --- a/fs/cifs/file.c +++ b/fs/cifs/file.c @@ -2990,7 +2990,7 @@ cifs_readdata_to_iov(struct cifs_readdata *rdata, struct iov_iter *iter) size_t copy = min_t(size_t, remaining, PAGE_SIZE); size_t written; - if (unlikely(iter->type & ITER_PIPE)) { + if (unlikely(iov_iter_is_pipe(iter))) { void *addr = kmap_atomic(page); written = 
copy_to_iter(addr, copy, iter); @@ -3302,7 +3302,7 @@ ssize_t cifs_user_readv(struct kiocb *iocb, struct iov_iter *to) if (!is_sync_kiocb(iocb)) ctx->iocb = iocb; - if (to->type == ITER_IOVEC) + if (iter_is_iovec(to)) ctx->should_dirty = true; rc = setup_aio_ctx_iter(ctx, to, READ); diff --git a/fs/cifs/misc.c b/fs/cifs/misc.c index 6926685e513c..7b5b960a04b8 100644 --- a/fs/cifs/misc.c +++ b/fs/cifs/misc.c @@ -786,7 +786,7 @@ setup_aio_ctx_iter(struct cifs_aio_ctx *ctx, struct iov_iter *iter, int rw) struct page **pages = NULL; struct bio_vec *bv = NULL; - if (iter->type & ITER_KVEC) { + if (iov_iter_is_kvec(iter)) { memcpy(&ctx->iter, iter, sizeof(struct iov_iter)); ctx->len = count; iov_iter_advance(iter, count); diff --git a/fs/cifs/smbdirect.c b/fs/cifs/smbdirect.c index 5fdb9a509a97..5b05bf255268 100644 --- a/fs/cifs/smbdirect.c +++ b/fs/cifs/smbdirect.c @@ -2054,14 +2054,22 @@ int smbd_recv(struct smbd_connection *info, struct msghdr *msg) info->smbd_recv_pending++; - switch (msg->msg_iter.type) { - case READ | ITER_KVEC: + if (iov_iter_rw(&msg->msg_iter) == WRITE) { + /* It's a bug in upper layer to get there */ + cifs_dbg(VFS, "CIFS: invalid msg iter dir %u\n", + iov_iter_rw(&msg->msg_iter)); + rc = -EINVAL; + goto out; + } + + switch (iov_iter_type(&msg->msg_iter)) { + case ITER_KVEC: buf = msg->msg_iter.kvec->iov_base; to_read = msg->msg_iter.kvec->iov_len; rc = smbd_recv_buf(info, buf, to_read); break; - case READ | ITER_BVEC: + case ITER_BVEC: page = msg->msg_iter.bvec->bv_page; page_offset = msg->msg_iter.bvec->bv_offset; to_read = msg->msg_iter.bvec->bv_len; @@ -2071,10 +2079,11 @@ int smbd_recv(struct smbd_connection *info, struct msghdr *msg) default: /* It's a bug in upper layer to get there */ cifs_dbg(VFS, "CIFS: invalid msg type %d\n", - msg->msg_iter.type); + iov_iter_type(&msg->msg_iter)); rc = -EINVAL; } +out: info->smbd_recv_pending--; wake_up(&info->wait_smbd_recv_pending); diff --git a/fs/direct-io.c b/fs/direct-io.c index 093fb54cd316..722d17c88edb 100644 --- a/fs/direct-io.c +++ b/fs/direct-io.c @@ -1313,7 +1313,7 @@ do_blockdev_direct_IO(struct kiocb *iocb, struct inode *inode, spin_lock_init(&dio->bio_lock); dio->refcount = 1; - dio->should_dirty = (iter->type == ITER_IOVEC); + dio->should_dirty = iter_is_iovec(iter) && iov_iter_rw(iter) == READ; sdio.iter = iter; sdio.final_block_in_request = end >> blkbits; diff --git a/fs/fuse/file.c b/fs/fuse/file.c index 32d0b883e74f..c9ccd45156dc 100644 --- a/fs/fuse/file.c +++ b/fs/fuse/file.c @@ -1271,7 +1271,7 @@ static int fuse_get_user_pages(struct fuse_req *req, struct iov_iter *ii, ssize_t ret = 0; /* Special case for kernel I/O: can copy directly into the buffer */ - if (ii->type & ITER_KVEC) { + if (iov_iter_is_kvec(ii)) { unsigned long user_addr = fuse_get_user_addr(ii); size_t frag_size = fuse_get_frag_size(ii, *nbytesp); diff --git a/fs/iomap.c b/fs/iomap.c index ec15cf2ec696..2c53400aa802 100644 --- a/fs/iomap.c +++ b/fs/iomap.c @@ -1795,7 +1795,7 @@ iomap_dio_rw(struct kiocb *iocb, struct iov_iter *iter, if (pos >= dio->i_size) goto out_free_dio; - if (iter->type == ITER_IOVEC) + if (iter_is_iovec(iter) && iov_iter_rw(iter) == READ) dio->flags |= IOMAP_DIO_DIRTY; } else { flags |= IOMAP_WRITE; diff --git a/include/linux/uio.h b/include/linux/uio.h index 422b1c01ee0d..fcabc959c794 100644 --- a/include/linux/uio.h +++ b/include/linux/uio.h @@ -21,7 +21,7 @@ struct kvec { size_t iov_len; }; -enum { +enum iter_type { ITER_IOVEC = 0, ITER_KVEC = 2, ITER_BVEC = 4, @@ -47,6 +47,36 @@ struct iov_iter { }; }; 
+static inline enum iter_type iov_iter_type(const struct iov_iter *i) +{ + return i->type & ~(READ | WRITE); +} + +static inline bool iter_is_iovec(const struct iov_iter *i) +{ + return iov_iter_type(i) == ITER_IOVEC; +} + +static inline bool iov_iter_is_kvec(const struct iov_iter *i) +{ + return iov_iter_type(i) == ITER_KVEC; +} + +static inline bool iov_iter_is_bvec(const struct iov_iter *i) +{ + return iov_iter_type(i) == ITER_BVEC; +} + +static inline bool iov_iter_is_pipe(const struct iov_iter *i) +{ + return iov_iter_type(i) == ITER_PIPE; +} + +static inline unsigned char iov_iter_rw(const struct iov_iter *i) +{ + return i->type & (READ | WRITE); +} + /* * Total number of bytes covered by an iovec. * @@ -74,7 +104,8 @@ static inline struct iovec iov_iter_iovec(const struct iov_iter *iter) } #define iov_for_each(iov, iter, start) \ - if (!((start).type & (ITER_BVEC | ITER_PIPE))) \ + if (iov_iter_type(start) == ITER_IOVEC || \ + iov_iter_type(start) == ITER_KVEC) \ for (iter = (start); \ (iter).count && \ ((iov = iov_iter_iovec(&(iter))), 1); \ @@ -202,19 +233,6 @@ static inline size_t iov_iter_count(const struct iov_iter *i) return i->count; } -static inline bool iter_is_iovec(const struct iov_iter *i) -{ - return !(i->type & (ITER_BVEC | ITER_KVEC | ITER_PIPE)); -} - -/* - * Get one of READ or WRITE out of iter->type without any other flags OR'd in - * with it. - * - * The ?: is just for type safety. - */ -#define iov_iter_rw(i) ((0 ? (struct iov_iter *)0 : (i))->type & (READ | WRITE)) - /* * Cap the iov_iter by given limit; note that the second argument is * *not* the new size - it's upper limit for such. Passing it a value diff --git a/lib/iov_iter.c b/lib/iov_iter.c index 8be175df3075..42d39116a556 100644 --- a/lib/iov_iter.c +++ b/lib/iov_iter.c @@ -558,7 +558,7 @@ static size_t copy_pipe_to_iter(const void *addr, size_t bytes, size_t _copy_to_iter(const void *addr, size_t bytes, struct iov_iter *i) { const char *from = addr; - if (unlikely(i->type & ITER_PIPE)) + if (unlikely(iov_iter_is_pipe(i))) return copy_pipe_to_iter(addr, bytes, i); if (iter_is_iovec(i)) might_fault(); @@ -658,7 +658,7 @@ size_t _copy_to_iter_mcsafe(const void *addr, size_t bytes, struct iov_iter *i) const char *from = addr; unsigned long rem, curr_addr, s_addr = (unsigned long) addr; - if (unlikely(i->type & ITER_PIPE)) + if (unlikely(iov_iter_is_pipe(i))) return copy_pipe_to_iter_mcsafe(addr, bytes, i); if (iter_is_iovec(i)) might_fault(); @@ -692,7 +692,7 @@ EXPORT_SYMBOL_GPL(_copy_to_iter_mcsafe); size_t _copy_from_iter(void *addr, size_t bytes, struct iov_iter *i) { char *to = addr; - if (unlikely(i->type & ITER_PIPE)) { + if (unlikely(iov_iter_is_pipe(i))) { WARN_ON(1); return 0; } @@ -712,7 +712,7 @@ EXPORT_SYMBOL(_copy_from_iter); bool _copy_from_iter_full(void *addr, size_t bytes, struct iov_iter *i) { char *to = addr; - if (unlikely(i->type & ITER_PIPE)) { + if (unlikely(iov_iter_is_pipe(i))) { WARN_ON(1); return false; } @@ -739,7 +739,7 @@ EXPORT_SYMBOL(_copy_from_iter_full); size_t _copy_from_iter_nocache(void *addr, size_t bytes, struct iov_iter *i) { char *to = addr; - if (unlikely(i->type & ITER_PIPE)) { + if (unlikely(iov_iter_is_pipe(i))) { WARN_ON(1); return 0; } @@ -773,7 +773,7 @@ EXPORT_SYMBOL(_copy_from_iter_nocache); size_t _copy_from_iter_flushcache(void *addr, size_t bytes, struct iov_iter *i) { char *to = addr; - if (unlikely(i->type & ITER_PIPE)) { + if (unlikely(iov_iter_is_pipe(i))) { WARN_ON(1); return 0; } @@ -794,7 +794,7 @@ EXPORT_SYMBOL_GPL(_copy_from_iter_flushcache); 
bool _copy_from_iter_full_nocache(void *addr, size_t bytes, struct iov_iter *i) { char *to = addr; - if (unlikely(i->type & ITER_PIPE)) { + if (unlikely(iov_iter_is_pipe(i))) { WARN_ON(1); return false; } @@ -836,7 +836,7 @@ size_t copy_page_to_iter(struct page *page, size_t offset, size_t bytes, size_t wanted = copy_to_iter(kaddr + offset, bytes, i); kunmap_atomic(kaddr); return wanted; - } else if (likely(!(i->type & ITER_PIPE))) + } else if (likely(!iov_iter_is_pipe(i))) return copy_page_to_iter_iovec(page, offset, bytes, i); else return copy_page_to_iter_pipe(page, offset, bytes, i); @@ -848,7 +848,7 @@ size_t copy_page_from_iter(struct page *page, size_t offset, size_t bytes, { if (unlikely(!page_copy_sane(page, offset, bytes))) return 0; - if (unlikely(i->type & ITER_PIPE)) { + if (unlikely(iov_iter_is_pipe(i))) { WARN_ON(1); return 0; } @@ -888,7 +888,7 @@ static size_t pipe_zero(size_t bytes, struct iov_iter *i) size_t iov_iter_zero(size_t bytes, struct iov_iter *i) { - if (unlikely(i->type & ITER_PIPE)) + if (unlikely(iov_iter_is_pipe(i))) return pipe_zero(bytes, i); iterate_and_advance(i, bytes, v, clear_user(v.iov_base, v.iov_len), @@ -908,7 +908,7 @@ size_t iov_iter_copy_from_user_atomic(struct page *page, kunmap_atomic(kaddr); return 0; } - if (unlikely(i->type & ITER_PIPE)) { + if (unlikely(iov_iter_is_pipe(i))) { kunmap_atomic(kaddr); WARN_ON(1); return 0; @@ -972,7 +972,7 @@ static void pipe_advance(struct iov_iter *i, size_t size) void iov_iter_advance(struct iov_iter *i, size_t size) { - if (unlikely(i->type & ITER_PIPE)) { + if (unlikely(iov_iter_is_pipe(i))) { pipe_advance(i, size); return; } @@ -987,7 +987,7 @@ void iov_iter_revert(struct iov_iter *i, size_t unroll) if (WARN_ON(unroll > MAX_RW_COUNT)) return; i->count += unroll; - if (unlikely(i->type & ITER_PIPE)) { + if (unlikely(iov_iter_is_pipe(i))) { struct pipe_inode_info *pipe = i->pipe; int idx = i->idx; size_t off = i->iov_offset; @@ -1016,7 +1016,7 @@ void iov_iter_revert(struct iov_iter *i, size_t unroll) return; } unroll -= i->iov_offset; - if (i->type & ITER_BVEC) { + if (iov_iter_is_bvec(i)) { const struct bio_vec *bvec = i->bvec; while (1) { size_t n = (--bvec)->bv_len; @@ -1049,11 +1049,11 @@ EXPORT_SYMBOL(iov_iter_revert); */ size_t iov_iter_single_seg_count(const struct iov_iter *i) { - if (unlikely(i->type & ITER_PIPE)) + if (unlikely(iov_iter_is_pipe(i))) return i->count; // it is a silly place, anyway if (i->nr_segs == 1) return i->count; - else if (i->type & ITER_BVEC) + else if (iov_iter_is_bvec(i)) return min(i->count, i->bvec->bv_len - i->iov_offset); else return min(i->count, i->iov->iov_len - i->iov_offset); @@ -1106,7 +1106,7 @@ unsigned long iov_iter_alignment(const struct iov_iter *i) unsigned long res = 0; size_t size = i->count; - if (unlikely(i->type & ITER_PIPE)) { + if (unlikely(iov_iter_is_pipe(i))) { if (size && i->iov_offset && allocated(&i->pipe->bufs[i->idx])) return size | i->iov_offset; return size; @@ -1125,7 +1125,7 @@ unsigned long iov_iter_gap_alignment(const struct iov_iter *i) unsigned long res = 0; size_t size = i->count; - if (unlikely(i->type & ITER_PIPE)) { + if (unlikely(iov_iter_is_pipe(i))) { WARN_ON(1); return ~0U; } @@ -1193,7 +1193,7 @@ ssize_t iov_iter_get_pages(struct iov_iter *i, if (maxsize > i->count) maxsize = i->count; - if (unlikely(i->type & ITER_PIPE)) + if (unlikely(iov_iter_is_pipe(i))) return pipe_get_pages(i, pages, maxsize, maxpages, start); iterate_all_kinds(i, maxsize, v, ({ unsigned long addr = (unsigned long)v.iov_base; @@ -1205,7 +1205,7 @@ 
ssize_t iov_iter_get_pages(struct iov_iter *i, len = maxpages * PAGE_SIZE; addr &= ~(PAGE_SIZE - 1); n = DIV_ROUND_UP(len, PAGE_SIZE); - res = get_user_pages_fast(addr, n, (i->type & WRITE) != WRITE, pages); + res = get_user_pages_fast(addr, n, iov_iter_rw(i) != WRITE, pages); if (unlikely(res < 0)) return res; return (res == n ? len : res * PAGE_SIZE) - *start; @@ -1270,7 +1270,7 @@ ssize_t iov_iter_get_pages_alloc(struct iov_iter *i, if (maxsize > i->count) maxsize = i->count; - if (unlikely(i->type & ITER_PIPE)) + if (unlikely(iov_iter_is_pipe(i))) return pipe_get_pages_alloc(i, pages, maxsize, start); iterate_all_kinds(i, maxsize, v, ({ unsigned long addr = (unsigned long)v.iov_base; @@ -1283,7 +1283,7 @@ ssize_t iov_iter_get_pages_alloc(struct iov_iter *i, p = get_pages_array(n); if (!p) return -ENOMEM; - res = get_user_pages_fast(addr, n, (i->type & WRITE) != WRITE, p); + res = get_user_pages_fast(addr, n, iov_iter_rw(i) != WRITE, p); if (unlikely(res < 0)) { kvfree(p); return res; @@ -1313,7 +1313,7 @@ size_t csum_and_copy_from_iter(void *addr, size_t bytes, __wsum *csum, __wsum sum, next; size_t off = 0; sum = *csum; - if (unlikely(i->type & ITER_PIPE)) { + if (unlikely(iov_iter_is_pipe(i))) { WARN_ON(1); return 0; } @@ -1355,7 +1355,7 @@ bool csum_and_copy_from_iter_full(void *addr, size_t bytes, __wsum *csum, __wsum sum, next; size_t off = 0; sum = *csum; - if (unlikely(i->type & ITER_PIPE)) { + if (unlikely(iov_iter_is_pipe(i))) { WARN_ON(1); return false; } @@ -1400,7 +1400,7 @@ size_t csum_and_copy_to_iter(const void *addr, size_t bytes, __wsum *csum, __wsum sum, next; size_t off = 0; sum = *csum; - if (unlikely(i->type & ITER_PIPE)) { + if (unlikely(iov_iter_is_pipe(i))) { WARN_ON(1); /* for now */ return 0; } @@ -1443,7 +1443,7 @@ int iov_iter_npages(const struct iov_iter *i, int maxpages) if (!size) return 0; - if (unlikely(i->type & ITER_PIPE)) { + if (unlikely(iov_iter_is_pipe(i))) { struct pipe_inode_info *pipe = i->pipe; size_t off; int idx; @@ -1481,11 +1481,11 @@ EXPORT_SYMBOL(iov_iter_npages); const void *dup_iter(struct iov_iter *new, struct iov_iter *old, gfp_t flags) { *new = *old; - if (unlikely(new->type & ITER_PIPE)) { + if (unlikely(iov_iter_is_pipe(new))) { WARN_ON(1); return NULL; } - if (new->type & ITER_BVEC) + if (iov_iter_is_bvec(new)) return new->bvec = kmemdup(new->bvec, new->nr_segs * sizeof(struct bio_vec), flags); diff --git a/mm/filemap.c b/mm/filemap.c index 52517f28e6f4..bdeee0168ea7 100644 --- a/mm/filemap.c +++ b/mm/filemap.c @@ -2122,7 +2122,7 @@ find_page: !mapping->a_ops->is_partially_uptodate) goto page_not_up_to_date; /* pipes can't handle partially uptodate pages */ - if (unlikely(iter->type & ITER_PIPE)) + if (unlikely(iov_iter_is_pipe(iter))) goto page_not_up_to_date; if (!trylock_page(page)) goto page_not_up_to_date; diff --git a/net/9p/trans_virtio.c b/net/9p/trans_virtio.c index 7728b0acde09..4d7d2070e9c8 100644 --- a/net/9p/trans_virtio.c +++ b/net/9p/trans_virtio.c @@ -322,7 +322,7 @@ static int p9_get_mapped_pages(struct virtio_chan *chan, if (!iov_iter_count(data)) return 0; - if (!(data->type & ITER_KVEC)) { + if (iov_iter_is_kvec(data)) { int n; /* * We allow only p9_max_pages pinned. 
We wait for the diff --git a/net/tls/tls_sw.c b/net/tls/tls_sw.c index a525fc4c2a4b..ad64b9c8b600 100644 --- a/net/tls/tls_sw.c +++ b/net/tls/tls_sw.c @@ -799,7 +799,7 @@ int tls_sw_sendmsg(struct sock *sk, struct msghdr *msg, size_t size) struct crypto_tfm *tfm = crypto_aead_tfm(ctx->aead_send); bool async_capable = tfm->__crt_alg->cra_flags & CRYPTO_ALG_ASYNC; unsigned char record_type = TLS_RECORD_TYPE_DATA; - bool is_kvec = msg->msg_iter.type & ITER_KVEC; + bool is_kvec = iov_iter_is_kvec(&msg->msg_iter); bool eor = !(msg->msg_flags & MSG_MORE); size_t try_to_copy, copied = 0; struct sk_msg *msg_pl, *msg_en; @@ -1457,7 +1457,7 @@ int tls_sw_recvmsg(struct sock *sk, bool cmsg = false; int target, err = 0; long timeo; - bool is_kvec = msg->msg_iter.type & ITER_KVEC; + bool is_kvec = iov_iter_is_kvec(&msg->msg_iter); int num_async = 0; flags |= nonblock; -- cgit v1.2.3-71-gd317 From aa563d7bca6e882ec2bdae24603c8f016401a144 Mon Sep 17 00:00:00 2001 From: David Howells Date: Sat, 20 Oct 2018 00:57:56 +0100 Subject: iov_iter: Separate type from direction and use accessor functions In the iov_iter struct, separate the iterator type from the iterator direction and use accessor functions to access them in most places. Convert a bunch of places to use switch-statements to access them rather than chains of bitwise-AND statements. This makes it easier to add further iterator types. Also, this can be more efficient since, to implement a switch over small contiguous integers, the compiler can use ~50% fewer compare instructions than it needs for chains of bitwise-AND tests. Further, cease passing the iterator type into the iterator setup function; the setup function can set that itself. Only the direction is required. Signed-off-by: David Howells --- drivers/block/drbd/drbd_main.c | 2 +- drivers/block/drbd/drbd_receiver.c | 2 +- drivers/block/loop.c | 9 ++++----- drivers/block/nbd.c | 12 +++++------- drivers/fsi/fsi-sbefifo.c | 4 ++-- drivers/isdn/mISDN/l1oip_core.c | 3 +-- drivers/misc/vmw_vmci/vmci_queue_pair.c | 6 +++--- drivers/nvme/target/io-cmd-file.c | 2 +- drivers/target/iscsi/iscsi_target_util.c | 6 ++---- drivers/target/target_core_file.c | 6 +++--- drivers/usb/usbip/usbip_common.c | 2 +- drivers/xen/pvcalls-back.c | 8 ++++---- fs/9p/vfs_addr.c | 4 ++-- fs/9p/vfs_dir.c | 2 +- fs/9p/xattr.c | 4 ++-- fs/afs/rxrpc.c | 15 +++++++-------- fs/ceph/file.c | 5 ++--- fs/cifs/connect.c | 4 ++-- fs/cifs/misc.c | 2 +- fs/cifs/smb2ops.c | 4 ++-- fs/cifs/transport.c | 8 +++----- fs/dlm/lowcomms.c | 2 +- fs/nfsd/vfs.c | 4 ++-- fs/ocfs2/cluster/tcp.c | 2 +- fs/orangefs/inode.c | 2 +- fs/splice.c | 7 +++---- include/linux/uio.h | 10 +++++----- lib/iov_iter.c | 28 +++++++++++++++------------- mm/page_io.c | 2 +- net/9p/client.c | 2 +- net/bluetooth/6lowpan.c | 2 +- net/bluetooth/a2mp.c | 2 +- net/bluetooth/smp.c | 2 +- net/ceph/messenger.c | 6 +++--- net/netfilter/ipvs/ip_vs_sync.c | 2 +- net/smc/smc_clc.c | 4 ++-- net/socket.c | 6 +++--- net/sunrpc/svcsock.c | 2 +- net/tipc/topsrv.c | 2 +- net/tls/tls_device.c | 4 ++-- 40 files changed, 96 insertions(+), 105 deletions(-) (limited to 'include/linux') diff --git a/drivers/block/drbd/drbd_main.c b/drivers/block/drbd/drbd_main.c index ef8212a4b73e..ded9735d44af 100644 --- a/drivers/block/drbd/drbd_main.c +++ b/drivers/block/drbd/drbd_main.c @@ -1856,7 +1856,7 @@ int drbd_send(struct drbd_connection *connection, struct socket *sock, /* THINK if (signal_pending) return ... ?
*/ - iov_iter_kvec(&msg.msg_iter, WRITE | ITER_KVEC, &iov, 1, size); + iov_iter_kvec(&msg.msg_iter, WRITE, &iov, 1, size); if (sock == connection->data.socket) { rcu_read_lock(); diff --git a/drivers/block/drbd/drbd_receiver.c b/drivers/block/drbd/drbd_receiver.c index 75f6b47169e6..fcc70642b004 100644 --- a/drivers/block/drbd/drbd_receiver.c +++ b/drivers/block/drbd/drbd_receiver.c @@ -516,7 +516,7 @@ static int drbd_recv_short(struct socket *sock, void *buf, size_t size, int flag struct msghdr msg = { .msg_flags = (flags ? flags : MSG_WAITALL | MSG_NOSIGNAL) }; - iov_iter_kvec(&msg.msg_iter, READ | ITER_KVEC, &iov, 1, size); + iov_iter_kvec(&msg.msg_iter, READ, &iov, 1, size); return sock_recvmsg(sock, &msg, msg.msg_flags); } diff --git a/drivers/block/loop.c b/drivers/block/loop.c index ea9debf59b22..cb0cc8685076 100644 --- a/drivers/block/loop.c +++ b/drivers/block/loop.c @@ -268,7 +268,7 @@ static int lo_write_bvec(struct file *file, struct bio_vec *bvec, loff_t *ppos) struct iov_iter i; ssize_t bw; - iov_iter_bvec(&i, ITER_BVEC | WRITE, bvec, 1, bvec->bv_len); + iov_iter_bvec(&i, WRITE, bvec, 1, bvec->bv_len); file_start_write(file); bw = vfs_iter_write(file, &i, ppos, 0); @@ -346,7 +346,7 @@ static int lo_read_simple(struct loop_device *lo, struct request *rq, ssize_t len; rq_for_each_segment(bvec, rq, iter) { - iov_iter_bvec(&i, ITER_BVEC, &bvec, 1, bvec.bv_len); + iov_iter_bvec(&i, READ, &bvec, 1, bvec.bv_len); len = vfs_iter_read(lo->lo_backing_file, &i, &pos, 0); if (len < 0) return len; @@ -387,7 +387,7 @@ static int lo_read_transfer(struct loop_device *lo, struct request *rq, b.bv_offset = 0; b.bv_len = bvec.bv_len; - iov_iter_bvec(&i, ITER_BVEC, &b, 1, b.bv_len); + iov_iter_bvec(&i, READ, &b, 1, b.bv_len); len = vfs_iter_read(lo->lo_backing_file, &i, &pos, 0); if (len < 0) { ret = len; @@ -554,8 +554,7 @@ static int lo_rw_aio(struct loop_device *lo, struct loop_cmd *cmd, } atomic_set(&cmd->ref, 2); - iov_iter_bvec(&iter, ITER_BVEC | rw, bvec, - segments, blk_rq_bytes(rq)); + iov_iter_bvec(&iter, rw, bvec, segments, blk_rq_bytes(rq)); iter.iov_offset = offset; cmd->iocb.ki_pos = pos; diff --git a/drivers/block/nbd.c b/drivers/block/nbd.c index 14a51254c3db..4d4d6129ff66 100644 --- a/drivers/block/nbd.c +++ b/drivers/block/nbd.c @@ -473,7 +473,7 @@ static int nbd_send_cmd(struct nbd_device *nbd, struct nbd_cmd *cmd, int index) u32 nbd_cmd_flags = 0; int sent = nsock->sent, skip = 0; - iov_iter_kvec(&from, WRITE | ITER_KVEC, &iov, 1, sizeof(request)); + iov_iter_kvec(&from, WRITE, &iov, 1, sizeof(request)); switch (req_op(req)) { case REQ_OP_DISCARD: @@ -564,8 +564,7 @@ send_pages: dev_dbg(nbd_to_dev(nbd), "request %p: sending %d bytes data\n", req, bvec.bv_len); - iov_iter_bvec(&from, ITER_BVEC | WRITE, - &bvec, 1, bvec.bv_len); + iov_iter_bvec(&from, WRITE, &bvec, 1, bvec.bv_len); if (skip) { if (skip >= iov_iter_count(&from)) { skip -= iov_iter_count(&from); @@ -624,7 +623,7 @@ static struct nbd_cmd *nbd_read_stat(struct nbd_device *nbd, int index) int ret = 0; reply.magic = 0; - iov_iter_kvec(&to, READ | ITER_KVEC, &iov, 1, sizeof(reply)); + iov_iter_kvec(&to, READ, &iov, 1, sizeof(reply)); result = sock_xmit(nbd, index, 0, &to, MSG_WAITALL, NULL); if (result <= 0) { if (!nbd_disconnected(config)) @@ -678,8 +677,7 @@ static struct nbd_cmd *nbd_read_stat(struct nbd_device *nbd, int index) struct bio_vec bvec; rq_for_each_segment(bvec, req, iter) { - iov_iter_bvec(&to, ITER_BVEC | READ, - &bvec, 1, bvec.bv_len); + iov_iter_bvec(&to, READ, &bvec, 1, bvec.bv_len); result = 
sock_xmit(nbd, index, 0, &to, MSG_WAITALL, NULL); if (result <= 0) { dev_err(disk_to_dev(nbd->disk), "Receive data failed (result %d)\n", @@ -1073,7 +1071,7 @@ static void send_disconnects(struct nbd_device *nbd) for (i = 0; i < config->num_connections; i++) { struct nbd_sock *nsock = config->socks[i]; - iov_iter_kvec(&from, WRITE | ITER_KVEC, &iov, 1, sizeof(request)); + iov_iter_kvec(&from, WRITE, &iov, 1, sizeof(request)); mutex_lock(&nsock->tx_lock); ret = sock_xmit(nbd, i, 1, &from, 0, NULL); if (ret <= 0) diff --git a/drivers/fsi/fsi-sbefifo.c b/drivers/fsi/fsi-sbefifo.c index ae861342626e..d92f5b87c251 100644 --- a/drivers/fsi/fsi-sbefifo.c +++ b/drivers/fsi/fsi-sbefifo.c @@ -638,7 +638,7 @@ static void sbefifo_collect_async_ffdc(struct sbefifo *sbefifo) } ffdc_iov.iov_base = ffdc; ffdc_iov.iov_len = SBEFIFO_MAX_FFDC_SIZE; - iov_iter_kvec(&ffdc_iter, WRITE | ITER_KVEC, &ffdc_iov, 1, SBEFIFO_MAX_FFDC_SIZE); + iov_iter_kvec(&ffdc_iter, WRITE, &ffdc_iov, 1, SBEFIFO_MAX_FFDC_SIZE); cmd[0] = cpu_to_be32(2); cmd[1] = cpu_to_be32(SBEFIFO_CMD_GET_SBE_FFDC); rc = sbefifo_do_command(sbefifo, cmd, 2, &ffdc_iter); @@ -735,7 +735,7 @@ int sbefifo_submit(struct device *dev, const __be32 *command, size_t cmd_len, rbytes = (*resp_len) * sizeof(__be32); resp_iov.iov_base = response; resp_iov.iov_len = rbytes; - iov_iter_kvec(&resp_iter, WRITE | ITER_KVEC, &resp_iov, 1, rbytes); + iov_iter_kvec(&resp_iter, WRITE, &resp_iov, 1, rbytes); /* Perform the command */ mutex_lock(&sbefifo->lock); diff --git a/drivers/isdn/mISDN/l1oip_core.c b/drivers/isdn/mISDN/l1oip_core.c index b05022f94f18..072bb5e36c18 100644 --- a/drivers/isdn/mISDN/l1oip_core.c +++ b/drivers/isdn/mISDN/l1oip_core.c @@ -718,8 +718,7 @@ l1oip_socket_thread(void *data) printk(KERN_DEBUG "%s: socket created and open\n", __func__); while (!signal_pending(current)) { - iov_iter_kvec(&msg.msg_iter, READ | ITER_KVEC, &iov, 1, - recvbuf_size); + iov_iter_kvec(&msg.msg_iter, READ, &iov, 1, recvbuf_size); recvlen = sock_recvmsg(socket, &msg, 0); if (recvlen > 0) { l1oip_socket_parse(hc, &sin_rx, recvbuf, recvlen); diff --git a/drivers/misc/vmw_vmci/vmci_queue_pair.c b/drivers/misc/vmw_vmci/vmci_queue_pair.c index bd52f29b4a4e..264f4ed8eef2 100644 --- a/drivers/misc/vmw_vmci/vmci_queue_pair.c +++ b/drivers/misc/vmw_vmci/vmci_queue_pair.c @@ -3030,7 +3030,7 @@ ssize_t vmci_qpair_enqueue(struct vmci_qp *qpair, if (!qpair || !buf) return VMCI_ERROR_INVALID_ARGS; - iov_iter_kvec(&from, WRITE | ITER_KVEC, &v, 1, buf_size); + iov_iter_kvec(&from, WRITE, &v, 1, buf_size); qp_lock(qpair); @@ -3074,7 +3074,7 @@ ssize_t vmci_qpair_dequeue(struct vmci_qp *qpair, if (!qpair || !buf) return VMCI_ERROR_INVALID_ARGS; - iov_iter_kvec(&to, READ | ITER_KVEC, &v, 1, buf_size); + iov_iter_kvec(&to, READ, &v, 1, buf_size); qp_lock(qpair); @@ -3119,7 +3119,7 @@ ssize_t vmci_qpair_peek(struct vmci_qp *qpair, if (!qpair || !buf) return VMCI_ERROR_INVALID_ARGS; - iov_iter_kvec(&to, READ | ITER_KVEC, &v, 1, buf_size); + iov_iter_kvec(&to, READ, &v, 1, buf_size); qp_lock(qpair); diff --git a/drivers/nvme/target/io-cmd-file.c b/drivers/nvme/target/io-cmd-file.c index 81a9dc5290a8..97f282eca3c2 100644 --- a/drivers/nvme/target/io-cmd-file.c +++ b/drivers/nvme/target/io-cmd-file.c @@ -101,7 +101,7 @@ static ssize_t nvmet_file_submit_bvec(struct nvmet_req *req, loff_t pos, rw = READ; } - iov_iter_bvec(&iter, ITER_BVEC | rw, req->f.bvec, nr_segs, count); + iov_iter_bvec(&iter, rw, req->f.bvec, nr_segs, count); iocb->ki_pos = pos; iocb->ki_filp = req->ns->file; diff --git 
a/drivers/target/iscsi/iscsi_target_util.c b/drivers/target/iscsi/iscsi_target_util.c index 49be1e41290c..991482576417 100644 --- a/drivers/target/iscsi/iscsi_target_util.c +++ b/drivers/target/iscsi/iscsi_target_util.c @@ -1258,8 +1258,7 @@ static int iscsit_do_rx_data( return -1; memset(&msg, 0, sizeof(struct msghdr)); - iov_iter_kvec(&msg.msg_iter, READ | ITER_KVEC, - count->iov, count->iov_count, data); + iov_iter_kvec(&msg.msg_iter, READ, count->iov, count->iov_count, data); while (msg_data_left(&msg)) { rx_loop = sock_recvmsg(conn->sock, &msg, MSG_WAITALL); @@ -1315,8 +1314,7 @@ int tx_data( memset(&msg, 0, sizeof(struct msghdr)); - iov_iter_kvec(&msg.msg_iter, WRITE | ITER_KVEC, - iov, iov_count, data); + iov_iter_kvec(&msg.msg_iter, WRITE, iov, iov_count, data); while (msg_data_left(&msg)) { int tx_loop = sock_sendmsg(conn->sock, &msg); diff --git a/drivers/target/target_core_file.c b/drivers/target/target_core_file.c index 16751ae55d7b..49b110d1b972 100644 --- a/drivers/target/target_core_file.c +++ b/drivers/target/target_core_file.c @@ -303,7 +303,7 @@ fd_execute_rw_aio(struct se_cmd *cmd, struct scatterlist *sgl, u32 sgl_nents, len += sg->length; } - iov_iter_bvec(&iter, ITER_BVEC | is_write, bvec, sgl_nents, len); + iov_iter_bvec(&iter, is_write, bvec, sgl_nents, len); aio_cmd->cmd = cmd; aio_cmd->len = len; @@ -353,7 +353,7 @@ static int fd_do_rw(struct se_cmd *cmd, struct file *fd, len += sg->length; } - iov_iter_bvec(&iter, ITER_BVEC, bvec, sgl_nents, len); + iov_iter_bvec(&iter, READ, bvec, sgl_nents, len); if (is_write) ret = vfs_iter_write(fd, &iter, &pos, 0); else @@ -490,7 +490,7 @@ fd_execute_write_same(struct se_cmd *cmd) len += se_dev->dev_attrib.block_size; } - iov_iter_bvec(&iter, ITER_BVEC, bvec, nolb, len); + iov_iter_bvec(&iter, READ, bvec, nolb, len); ret = vfs_iter_write(fd_dev->fd_file, &iter, &pos, 0); kfree(bvec); diff --git a/drivers/usb/usbip/usbip_common.c b/drivers/usb/usbip/usbip_common.c index 9756752c0681..45da3e01c7b0 100644 --- a/drivers/usb/usbip/usbip_common.c +++ b/drivers/usb/usbip/usbip_common.c @@ -309,7 +309,7 @@ int usbip_recv(struct socket *sock, void *buf, int size) if (!sock || !buf || !size) return -EINVAL; - iov_iter_kvec(&msg.msg_iter, READ|ITER_KVEC, &iov, 1, size); + iov_iter_kvec(&msg.msg_iter, READ, &iov, 1, size); usbip_dbg_xmit("enter\n"); diff --git a/drivers/xen/pvcalls-back.c b/drivers/xen/pvcalls-back.c index b1092fbefa63..2e5d845b5091 100644 --- a/drivers/xen/pvcalls-back.c +++ b/drivers/xen/pvcalls-back.c @@ -137,13 +137,13 @@ static void pvcalls_conn_back_read(void *opaque) if (masked_prod < masked_cons) { vec[0].iov_base = data->in + masked_prod; vec[0].iov_len = wanted; - iov_iter_kvec(&msg.msg_iter, ITER_KVEC|WRITE, vec, 1, wanted); + iov_iter_kvec(&msg.msg_iter, WRITE, vec, 1, wanted); } else { vec[0].iov_base = data->in + masked_prod; vec[0].iov_len = array_size - masked_prod; vec[1].iov_base = data->in; vec[1].iov_len = wanted - vec[0].iov_len; - iov_iter_kvec(&msg.msg_iter, ITER_KVEC|WRITE, vec, 2, wanted); + iov_iter_kvec(&msg.msg_iter, WRITE, vec, 2, wanted); } atomic_set(&map->read, 0); @@ -195,13 +195,13 @@ static void pvcalls_conn_back_write(struct sock_mapping *map) if (pvcalls_mask(prod, array_size) > pvcalls_mask(cons, array_size)) { vec[0].iov_base = data->out + pvcalls_mask(cons, array_size); vec[0].iov_len = size; - iov_iter_kvec(&msg.msg_iter, ITER_KVEC|READ, vec, 1, size); + iov_iter_kvec(&msg.msg_iter, READ, vec, 1, size); } else { vec[0].iov_base = data->out + pvcalls_mask(cons, array_size); 
vec[0].iov_len = array_size - pvcalls_mask(cons, array_size); vec[1].iov_base = data->out; vec[1].iov_len = size - vec[0].iov_len; - iov_iter_kvec(&msg.msg_iter, ITER_KVEC|READ, vec, 2, size); + iov_iter_kvec(&msg.msg_iter, READ, vec, 2, size); } atomic_set(&map->write, 0); diff --git a/fs/9p/vfs_addr.c b/fs/9p/vfs_addr.c index e1cbdfdb7c68..0bcbcc20f769 100644 --- a/fs/9p/vfs_addr.c +++ b/fs/9p/vfs_addr.c @@ -65,7 +65,7 @@ static int v9fs_fid_readpage(struct p9_fid *fid, struct page *page) if (retval == 0) return retval; - iov_iter_bvec(&to, ITER_BVEC | READ, &bvec, 1, PAGE_SIZE); + iov_iter_bvec(&to, READ, &bvec, 1, PAGE_SIZE); retval = p9_client_read(fid, page_offset(page), &to, &err); if (err) { @@ -175,7 +175,7 @@ static int v9fs_vfs_writepage_locked(struct page *page) bvec.bv_page = page; bvec.bv_offset = 0; bvec.bv_len = len; - iov_iter_bvec(&from, ITER_BVEC | WRITE, &bvec, 1, len); + iov_iter_bvec(&from, WRITE, &bvec, 1, len); /* We should have writeback_fid always set */ BUG_ON(!v9inode->writeback_fid); diff --git a/fs/9p/vfs_dir.c b/fs/9p/vfs_dir.c index b0405d6aac85..d5db3c968a03 100644 --- a/fs/9p/vfs_dir.c +++ b/fs/9p/vfs_dir.c @@ -133,7 +133,7 @@ static int v9fs_dir_readdir(struct file *file, struct dir_context *ctx) if (rdir->tail == rdir->head) { struct iov_iter to; int n; - iov_iter_kvec(&to, READ | ITER_KVEC, &kvec, 1, buflen); + iov_iter_kvec(&to, READ, &kvec, 1, buflen); n = p9_client_read(file->private_data, ctx->pos, &to, &err); if (err) diff --git a/fs/9p/xattr.c b/fs/9p/xattr.c index 352abc39e891..ac8ff8ca4c11 100644 --- a/fs/9p/xattr.c +++ b/fs/9p/xattr.c @@ -32,7 +32,7 @@ ssize_t v9fs_fid_xattr_get(struct p9_fid *fid, const char *name, struct iov_iter to; int err; - iov_iter_kvec(&to, READ | ITER_KVEC, &kvec, 1, buffer_size); + iov_iter_kvec(&to, READ, &kvec, 1, buffer_size); attr_fid = p9_client_xattrwalk(fid, name, &attr_size); if (IS_ERR(attr_fid)) { @@ -107,7 +107,7 @@ int v9fs_fid_xattr_set(struct p9_fid *fid, const char *name, struct iov_iter from; int retval, err; - iov_iter_kvec(&from, WRITE | ITER_KVEC, &kvec, 1, value_len); + iov_iter_kvec(&from, WRITE, &kvec, 1, value_len); p9_debug(P9_DEBUG_VFS, "name = %s value_len = %zu flags = %d\n", name, value_len, flags); diff --git a/fs/afs/rxrpc.c b/fs/afs/rxrpc.c index 77a83790a31f..639c16882e93 100644 --- a/fs/afs/rxrpc.c +++ b/fs/afs/rxrpc.c @@ -286,7 +286,7 @@ static void afs_load_bvec(struct afs_call *call, struct msghdr *msg, offset = 0; } - iov_iter_bvec(&msg->msg_iter, WRITE | ITER_BVEC, bv, nr, bytes); + iov_iter_bvec(&msg->msg_iter, WRITE, bv, nr, bytes); } /* @@ -401,8 +401,7 @@ long afs_make_call(struct afs_addr_cursor *ac, struct afs_call *call, msg.msg_name = NULL; msg.msg_namelen = 0; - iov_iter_kvec(&msg.msg_iter, WRITE | ITER_KVEC, iov, 1, - call->request_size); + iov_iter_kvec(&msg.msg_iter, WRITE, iov, 1, call->request_size); msg.msg_control = NULL; msg.msg_controllen = 0; msg.msg_flags = MSG_WAITALL | (call->send_pages ? 
MSG_MORE : 0); @@ -432,7 +431,7 @@ error_do_abort: rxrpc_kernel_abort_call(call->net->socket, rxcall, RX_USER_ABORT, ret, "KSD"); } else { - iov_iter_kvec(&msg.msg_iter, READ | ITER_KVEC, NULL, 0, 0); + iov_iter_kvec(&msg.msg_iter, READ, NULL, 0, 0); rxrpc_kernel_recv_data(call->net->socket, rxcall, &msg.msg_iter, false, &call->abort_code, &call->service_id); @@ -468,7 +467,7 @@ static void afs_deliver_to_call(struct afs_call *call) if (state == AFS_CALL_SV_AWAIT_ACK) { struct iov_iter iter; - iov_iter_kvec(&iter, READ | ITER_KVEC, NULL, 0, 0); + iov_iter_kvec(&iter, READ, NULL, 0, 0); ret = rxrpc_kernel_recv_data(call->net->socket, call->rxcall, &iter, false, &remote_abort, @@ -825,7 +824,7 @@ void afs_send_empty_reply(struct afs_call *call) msg.msg_name = NULL; msg.msg_namelen = 0; - iov_iter_kvec(&msg.msg_iter, WRITE | ITER_KVEC, NULL, 0, 0); + iov_iter_kvec(&msg.msg_iter, WRITE, NULL, 0, 0); msg.msg_control = NULL; msg.msg_controllen = 0; msg.msg_flags = 0; @@ -864,7 +863,7 @@ void afs_send_simple_reply(struct afs_call *call, const void *buf, size_t len) iov[0].iov_len = len; msg.msg_name = NULL; msg.msg_namelen = 0; - iov_iter_kvec(&msg.msg_iter, WRITE | ITER_KVEC, iov, 1, len); + iov_iter_kvec(&msg.msg_iter, WRITE, iov, 1, len); msg.msg_control = NULL; msg.msg_controllen = 0; msg.msg_flags = 0; @@ -905,7 +904,7 @@ int afs_extract_data(struct afs_call *call, void *buf, size_t count, iov.iov_base = buf + call->offset; iov.iov_len = count - call->offset; - iov_iter_kvec(&iter, ITER_KVEC | READ, &iov, 1, count - call->offset); + iov_iter_kvec(&iter, READ, &iov, 1, count - call->offset); ret = rxrpc_kernel_recv_data(net->socket, call->rxcall, &iter, want_more, &remote_abort, diff --git a/fs/ceph/file.c b/fs/ceph/file.c index 524ecc95b04d..5dd433aa9b23 100644 --- a/fs/ceph/file.c +++ b/fs/ceph/file.c @@ -821,7 +821,7 @@ static void ceph_aio_complete_req(struct ceph_osd_request *req) aio_req->total_len = rc + zlen; } - iov_iter_bvec(&i, ITER_BVEC, osd_data->bvec_pos.bvecs, + iov_iter_bvec(&i, READ, osd_data->bvec_pos.bvecs, osd_data->num_bvecs, osd_data->bvec_pos.iter.bi_size); iov_iter_advance(&i, rc); @@ -1044,8 +1044,7 @@ ceph_direct_read_write(struct kiocb *iocb, struct iov_iter *iter, int zlen = min_t(size_t, len - ret, size - pos - ret); - iov_iter_bvec(&i, ITER_BVEC, bvecs, num_pages, - len); + iov_iter_bvec(&i, READ, bvecs, num_pages, len); iov_iter_advance(&i, ret); iov_iter_zero(zlen, &i); ret += zlen; diff --git a/fs/cifs/connect.c b/fs/cifs/connect.c index 52d71b64c0c6..11bcd2fb90b1 100644 --- a/fs/cifs/connect.c +++ b/fs/cifs/connect.c @@ -588,7 +588,7 @@ cifs_read_from_socket(struct TCP_Server_Info *server, char *buf, { struct msghdr smb_msg; struct kvec iov = {.iov_base = buf, .iov_len = to_read}; - iov_iter_kvec(&smb_msg.msg_iter, READ | ITER_KVEC, &iov, 1, to_read); + iov_iter_kvec(&smb_msg.msg_iter, READ, &iov, 1, to_read); return cifs_readv_from_socket(server, &smb_msg); } @@ -600,7 +600,7 @@ cifs_read_page_from_socket(struct TCP_Server_Info *server, struct page *page, struct msghdr smb_msg; struct bio_vec bv = { .bv_page = page, .bv_len = to_read, .bv_offset = page_offset}; - iov_iter_bvec(&smb_msg.msg_iter, READ | ITER_BVEC, &bv, 1, to_read); + iov_iter_bvec(&smb_msg.msg_iter, READ, &bv, 1, to_read); return cifs_readv_from_socket(server, &smb_msg); } diff --git a/fs/cifs/misc.c b/fs/cifs/misc.c index 7b5b960a04b8..10cff44832d8 100644 --- a/fs/cifs/misc.c +++ b/fs/cifs/misc.c @@ -857,7 +857,7 @@ setup_aio_ctx_iter(struct cifs_aio_ctx *ctx, struct iov_iter *iter, int rw) 
ctx->bv = bv; ctx->len = saved_len - count; ctx->npages = npages; - iov_iter_bvec(&ctx->iter, ITER_BVEC | rw, ctx->bv, npages, ctx->len); + iov_iter_bvec(&ctx->iter, rw, ctx->bv, npages, ctx->len); return 0; } diff --git a/fs/cifs/smb2ops.c b/fs/cifs/smb2ops.c index 89985a0a6819..26f8a65b8722 100644 --- a/fs/cifs/smb2ops.c +++ b/fs/cifs/smb2ops.c @@ -2962,13 +2962,13 @@ handle_read_data(struct TCP_Server_Info *server, struct mid_q_entry *mid, return 0; } - iov_iter_bvec(&iter, WRITE | ITER_BVEC, bvec, npages, data_len); + iov_iter_bvec(&iter, WRITE, bvec, npages, data_len); } else if (buf_len >= data_offset + data_len) { /* read response payload is in buf */ WARN_ONCE(npages > 0, "read data can be either in buf or in pages"); iov.iov_base = buf + data_offset; iov.iov_len = data_len; - iov_iter_kvec(&iter, WRITE | ITER_KVEC, &iov, 1, data_len); + iov_iter_kvec(&iter, WRITE, &iov, 1, data_len); } else { /* read response payload cannot be in both buf and pages */ WARN_ONCE(1, "buf can not contain only a part of read data"); diff --git a/fs/cifs/transport.c b/fs/cifs/transport.c index b48f43963da6..146b618802a5 100644 --- a/fs/cifs/transport.c +++ b/fs/cifs/transport.c @@ -307,8 +307,7 @@ __smb_send_rqst(struct TCP_Server_Info *server, int num_rqst, .iov_base = &rfc1002_marker, .iov_len = 4 }; - iov_iter_kvec(&smb_msg.msg_iter, WRITE | ITER_KVEC, &hiov, - 1, 4); + iov_iter_kvec(&smb_msg.msg_iter, WRITE, &hiov, 1, 4); rc = smb_send_kvec(server, &smb_msg, &sent); if (rc < 0) goto uncork; @@ -329,8 +328,7 @@ __smb_send_rqst(struct TCP_Server_Info *server, int num_rqst, size += iov[i].iov_len; } - iov_iter_kvec(&smb_msg.msg_iter, WRITE | ITER_KVEC, - iov, n_vec, size); + iov_iter_kvec(&smb_msg.msg_iter, WRITE, iov, n_vec, size); rc = smb_send_kvec(server, &smb_msg, &sent); if (rc < 0) @@ -346,7 +344,7 @@ __smb_send_rqst(struct TCP_Server_Info *server, int num_rqst, rqst_page_get_length(&rqst[j], i, &bvec.bv_len, &bvec.bv_offset); - iov_iter_bvec(&smb_msg.msg_iter, WRITE | ITER_BVEC, + iov_iter_bvec(&smb_msg.msg_iter, WRITE, &bvec, 1, bvec.bv_len); rc = smb_send_kvec(server, &smb_msg, &sent); if (rc < 0) diff --git a/fs/dlm/lowcomms.c b/fs/dlm/lowcomms.c index a5e4a221435c..76976d6e50f9 100644 --- a/fs/dlm/lowcomms.c +++ b/fs/dlm/lowcomms.c @@ -674,7 +674,7 @@ static int receive_from_sock(struct connection *con) nvec = 2; } len = iov[0].iov_len + iov[1].iov_len; - iov_iter_kvec(&msg.msg_iter, READ | ITER_KVEC, iov, nvec, len); + iov_iter_kvec(&msg.msg_iter, READ, iov, nvec, len); r = ret = sock_recvmsg(con->sock, &msg, MSG_DONTWAIT | MSG_NOSIGNAL); if (ret <= 0) diff --git a/fs/nfsd/vfs.c b/fs/nfsd/vfs.c index b53e76391e52..8b90f5480904 100644 --- a/fs/nfsd/vfs.c +++ b/fs/nfsd/vfs.c @@ -923,7 +923,7 @@ __be32 nfsd_readv(struct svc_rqst *rqstp, struct svc_fh *fhp, int host_err; trace_nfsd_read_vector(rqstp, fhp, offset, *count); - iov_iter_kvec(&iter, READ | ITER_KVEC, vec, vlen, *count); + iov_iter_kvec(&iter, READ, vec, vlen, *count); host_err = vfs_iter_read(file, &iter, &offset, 0); return nfsd_finish_read(rqstp, fhp, file, offset, count, host_err); } @@ -999,7 +999,7 @@ nfsd_vfs_write(struct svc_rqst *rqstp, struct svc_fh *fhp, struct file *file, if (stable && !use_wgather) flags |= RWF_SYNC; - iov_iter_kvec(&iter, WRITE | ITER_KVEC, vec, vlen, *cnt); + iov_iter_kvec(&iter, WRITE, vec, vlen, *cnt); host_err = vfs_iter_write(file, &iter, &pos, flags); if (host_err < 0) goto out_nfserr; diff --git a/fs/ocfs2/cluster/tcp.c b/fs/ocfs2/cluster/tcp.c index 7d9eea7d4a87..e9f236af1927 100644 --- 
a/fs/ocfs2/cluster/tcp.c +++ b/fs/ocfs2/cluster/tcp.c @@ -916,7 +916,7 @@ static int o2net_recv_tcp_msg(struct socket *sock, void *data, size_t len) { struct kvec vec = { .iov_len = len, .iov_base = data, }; struct msghdr msg = { .msg_flags = MSG_DONTWAIT, }; - iov_iter_kvec(&msg.msg_iter, READ | ITER_KVEC, &vec, 1, len); + iov_iter_kvec(&msg.msg_iter, READ, &vec, 1, len); return sock_recvmsg(sock, &msg, MSG_DONTWAIT); } diff --git a/fs/orangefs/inode.c b/fs/orangefs/inode.c index 31932879b716..136a8bdc1d91 100644 --- a/fs/orangefs/inode.c +++ b/fs/orangefs/inode.c @@ -25,7 +25,7 @@ static int read_one_page(struct page *page) struct iov_iter to; struct bio_vec bv = {.bv_page = page, .bv_len = PAGE_SIZE}; - iov_iter_bvec(&to, ITER_BVEC | READ, &bv, 1, PAGE_SIZE); + iov_iter_bvec(&to, READ, &bv, 1, PAGE_SIZE); gossip_debug(GOSSIP_INODE_DEBUG, "orangefs_readpage called with page %p\n", diff --git a/fs/splice.c b/fs/splice.c index b3daa971f597..3553f1956508 100644 --- a/fs/splice.c +++ b/fs/splice.c @@ -301,7 +301,7 @@ ssize_t generic_file_splice_read(struct file *in, loff_t *ppos, struct kiocb kiocb; int idx, ret; - iov_iter_pipe(&to, ITER_PIPE | READ, pipe, len); + iov_iter_pipe(&to, READ, pipe, len); idx = to.idx; init_sync_kiocb(&kiocb, in); kiocb.ki_pos = *ppos; @@ -386,7 +386,7 @@ static ssize_t default_file_splice_read(struct file *in, loff_t *ppos, */ offset = *ppos & ~PAGE_MASK; - iov_iter_pipe(&to, ITER_PIPE | READ, pipe, len + offset); + iov_iter_pipe(&to, READ, pipe, len + offset); res = iov_iter_get_pages_alloc(&to, &pages, len + offset, &base); if (res <= 0) @@ -745,8 +745,7 @@ iter_file_splice_write(struct pipe_inode_info *pipe, struct file *out, left -= this_len; } - iov_iter_bvec(&from, ITER_BVEC | WRITE, array, n, - sd.total_len - left); + iov_iter_bvec(&from, WRITE, array, n, sd.total_len - left); ret = vfs_iter_write(out, &from, &sd.pos, 0); if (ret <= 0) break; diff --git a/include/linux/uio.h b/include/linux/uio.h index fcabc959c794..3d8f1acc142c 100644 --- a/include/linux/uio.h +++ b/include/linux/uio.h @@ -29,7 +29,7 @@ enum iter_type { }; struct iov_iter { - int type; + unsigned int type; size_t iov_offset; size_t count; union { @@ -212,13 +212,13 @@ size_t copy_to_iter_mcsafe(void *addr, size_t bytes, struct iov_iter *i) size_t iov_iter_zero(size_t bytes, struct iov_iter *); unsigned long iov_iter_alignment(const struct iov_iter *i); unsigned long iov_iter_gap_alignment(const struct iov_iter *i); -void iov_iter_init(struct iov_iter *i, int direction, const struct iovec *iov, +void iov_iter_init(struct iov_iter *i, unsigned int direction, const struct iovec *iov, unsigned long nr_segs, size_t count); -void iov_iter_kvec(struct iov_iter *i, int direction, const struct kvec *kvec, +void iov_iter_kvec(struct iov_iter *i, unsigned int direction, const struct kvec *kvec, unsigned long nr_segs, size_t count); -void iov_iter_bvec(struct iov_iter *i, int direction, const struct bio_vec *bvec, +void iov_iter_bvec(struct iov_iter *i, unsigned int direction, const struct bio_vec *bvec, unsigned long nr_segs, size_t count); -void iov_iter_pipe(struct iov_iter *i, int direction, struct pipe_inode_info *pipe, +void iov_iter_pipe(struct iov_iter *i, unsigned int direction, struct pipe_inode_info *pipe, size_t count); ssize_t iov_iter_get_pages(struct iov_iter *i, struct page **pages, size_t maxsize, unsigned maxpages, size_t *start); diff --git a/lib/iov_iter.c b/lib/iov_iter.c index 42d39116a556..c42b928b15ef 100644 --- a/lib/iov_iter.c +++ b/lib/iov_iter.c @@ -428,17 +428,19 @@ int 
iov_iter_fault_in_readable(struct iov_iter *i, size_t bytes) } EXPORT_SYMBOL(iov_iter_fault_in_readable); -void iov_iter_init(struct iov_iter *i, int direction, +void iov_iter_init(struct iov_iter *i, unsigned int direction, const struct iovec *iov, unsigned long nr_segs, size_t count) { + WARN_ON(direction & ~(READ | WRITE)); + direction &= READ | WRITE; + /* It will get better. Eventually... */ if (uaccess_kernel()) { - direction |= ITER_KVEC; - i->type = direction; + i->type = ITER_KVEC | direction; i->kvec = (struct kvec *)iov; } else { - i->type = direction; + i->type = ITER_IOVEC | direction; i->iov = iov; } i->nr_segs = nr_segs; @@ -1060,12 +1062,12 @@ size_t iov_iter_single_seg_count(const struct iov_iter *i) } EXPORT_SYMBOL(iov_iter_single_seg_count); -void iov_iter_kvec(struct iov_iter *i, int direction, +void iov_iter_kvec(struct iov_iter *i, unsigned int direction, const struct kvec *kvec, unsigned long nr_segs, size_t count) { - BUG_ON(!(direction & ITER_KVEC)); - i->type = direction; + WARN_ON(direction & ~(READ | WRITE)); + i->type = ITER_KVEC | (direction & (READ | WRITE)); i->kvec = kvec; i->nr_segs = nr_segs; i->iov_offset = 0; @@ -1073,12 +1075,12 @@ void iov_iter_kvec(struct iov_iter *i, int direction, } EXPORT_SYMBOL(iov_iter_kvec); -void iov_iter_bvec(struct iov_iter *i, int direction, +void iov_iter_bvec(struct iov_iter *i, unsigned int direction, const struct bio_vec *bvec, unsigned long nr_segs, size_t count) { - BUG_ON(!(direction & ITER_BVEC)); - i->type = direction; + WARN_ON(direction & ~(READ | WRITE)); + i->type = ITER_BVEC | (direction & (READ | WRITE)); i->bvec = bvec; i->nr_segs = nr_segs; i->iov_offset = 0; @@ -1086,13 +1088,13 @@ void iov_iter_bvec(struct iov_iter *i, int direction, } EXPORT_SYMBOL(iov_iter_bvec); -void iov_iter_pipe(struct iov_iter *i, int direction, +void iov_iter_pipe(struct iov_iter *i, unsigned int direction, struct pipe_inode_info *pipe, size_t count) { - BUG_ON(direction != ITER_PIPE); + BUG_ON(direction != READ); WARN_ON(pipe->nrbufs == pipe->buffers); - i->type = direction; + i->type = ITER_PIPE | READ; i->pipe = pipe; i->idx = (pipe->curbuf + pipe->nrbufs) & (pipe->buffers - 1); i->iov_offset = 0; diff --git a/mm/page_io.c b/mm/page_io.c index aafd19ec1db4..86de453a60cf 100644 --- a/mm/page_io.c +++ b/mm/page_io.c @@ -294,7 +294,7 @@ int __swap_writepage(struct page *page, struct writeback_control *wbc, }; struct iov_iter from; - iov_iter_bvec(&from, ITER_BVEC | WRITE, &bv, 1, PAGE_SIZE); + iov_iter_bvec(&from, WRITE, &bv, 1, PAGE_SIZE); init_sync_kiocb(&kiocb, swap_file); kiocb.ki_pos = page_file_offset(page); diff --git a/net/9p/client.c b/net/9p/client.c index deae53a7dffc..a9cd1401bd09 100644 --- a/net/9p/client.c +++ b/net/9p/client.c @@ -2070,7 +2070,7 @@ int p9_client_readdir(struct p9_fid *fid, char *data, u32 count, u64 offset) struct kvec kv = {.iov_base = data, .iov_len = count}; struct iov_iter to; - iov_iter_kvec(&to, READ | ITER_KVEC, &kv, 1, count); + iov_iter_kvec(&to, READ, &kv, 1, count); p9_debug(P9_DEBUG_9P, ">>> TREADDIR fid %d offset %llu count %d\n", fid->fid, (unsigned long long) offset, count); diff --git a/net/bluetooth/6lowpan.c b/net/bluetooth/6lowpan.c index 4e2576fc0c59..828e87fe8027 100644 --- a/net/bluetooth/6lowpan.c +++ b/net/bluetooth/6lowpan.c @@ -467,7 +467,7 @@ static int send_pkt(struct l2cap_chan *chan, struct sk_buff *skb, iv.iov_len = skb->len; memset(&msg, 0, sizeof(msg)); - iov_iter_kvec(&msg.msg_iter, WRITE | ITER_KVEC, &iv, 1, skb->len); + iov_iter_kvec(&msg.msg_iter, WRITE, &iv, 1, 
skb->len); err = l2cap_chan_send(chan, &msg, skb->len); if (err > 0) { diff --git a/net/bluetooth/a2mp.c b/net/bluetooth/a2mp.c index 51c2cf2d8923..58fc6333d412 100644 --- a/net/bluetooth/a2mp.c +++ b/net/bluetooth/a2mp.c @@ -63,7 +63,7 @@ static void a2mp_send(struct amp_mgr *mgr, u8 code, u8 ident, u16 len, void *dat memset(&msg, 0, sizeof(msg)); - iov_iter_kvec(&msg.msg_iter, WRITE | ITER_KVEC, &iv, 1, total_len); + iov_iter_kvec(&msg.msg_iter, WRITE, &iv, 1, total_len); l2cap_chan_send(chan, &msg, total_len); diff --git a/net/bluetooth/smp.c b/net/bluetooth/smp.c index a1c1b7e8a45c..c822e626761b 100644 --- a/net/bluetooth/smp.c +++ b/net/bluetooth/smp.c @@ -622,7 +622,7 @@ static void smp_send_cmd(struct l2cap_conn *conn, u8 code, u16 len, void *data) memset(&msg, 0, sizeof(msg)); - iov_iter_kvec(&msg.msg_iter, WRITE | ITER_KVEC, iv, 2, 1 + len); + iov_iter_kvec(&msg.msg_iter, WRITE, iv, 2, 1 + len); l2cap_chan_send(chan, &msg, 1 + len); diff --git a/net/ceph/messenger.c b/net/ceph/messenger.c index 0a187196aeed..e493ff77b378 100644 --- a/net/ceph/messenger.c +++ b/net/ceph/messenger.c @@ -526,7 +526,7 @@ static int ceph_tcp_recvmsg(struct socket *sock, void *buf, size_t len) if (!buf) msg.msg_flags |= MSG_TRUNC; - iov_iter_kvec(&msg.msg_iter, READ | ITER_KVEC, &iov, 1, len); + iov_iter_kvec(&msg.msg_iter, READ, &iov, 1, len); r = sock_recvmsg(sock, &msg, msg.msg_flags); if (r == -EAGAIN) r = 0; @@ -545,7 +545,7 @@ static int ceph_tcp_recvpage(struct socket *sock, struct page *page, int r; BUG_ON(page_offset + length > PAGE_SIZE); - iov_iter_bvec(&msg.msg_iter, READ | ITER_BVEC, &bvec, 1, length); + iov_iter_bvec(&msg.msg_iter, READ, &bvec, 1, length); r = sock_recvmsg(sock, &msg, msg.msg_flags); if (r == -EAGAIN) r = 0; @@ -607,7 +607,7 @@ static int ceph_tcp_sendpage(struct socket *sock, struct page *page, else msg.msg_flags |= MSG_EOR; /* superfluous, but what the hell */ - iov_iter_bvec(&msg.msg_iter, WRITE | ITER_BVEC, &bvec, 1, size); + iov_iter_bvec(&msg.msg_iter, WRITE, &bvec, 1, size); ret = sock_sendmsg(sock, &msg); if (ret == -EAGAIN) ret = 0; diff --git a/net/netfilter/ipvs/ip_vs_sync.c b/net/netfilter/ipvs/ip_vs_sync.c index d4020c5e831d..2526be6b3d90 100644 --- a/net/netfilter/ipvs/ip_vs_sync.c +++ b/net/netfilter/ipvs/ip_vs_sync.c @@ -1616,7 +1616,7 @@ ip_vs_receive(struct socket *sock, char *buffer, const size_t buflen) EnterFunction(7); /* Receive a packet */ - iov_iter_kvec(&msg.msg_iter, READ | ITER_KVEC, &iov, 1, buflen); + iov_iter_kvec(&msg.msg_iter, READ, &iov, 1, buflen); len = sock_recvmsg(sock, &msg, MSG_DONTWAIT); if (len < 0) return len; diff --git a/net/smc/smc_clc.c b/net/smc/smc_clc.c index 52241d679cc9..89c3a8c7859a 100644 --- a/net/smc/smc_clc.c +++ b/net/smc/smc_clc.c @@ -286,7 +286,7 @@ int smc_clc_wait_msg(struct smc_sock *smc, void *buf, int buflen, */ krflags = MSG_PEEK | MSG_WAITALL; smc->clcsock->sk->sk_rcvtimeo = CLC_WAIT_TIME; - iov_iter_kvec(&msg.msg_iter, READ | ITER_KVEC, &vec, 1, + iov_iter_kvec(&msg.msg_iter, READ, &vec, 1, sizeof(struct smc_clc_msg_hdr)); len = sock_recvmsg(smc->clcsock, &msg, krflags); if (signal_pending(current)) { @@ -325,7 +325,7 @@ int smc_clc_wait_msg(struct smc_sock *smc, void *buf, int buflen, /* receive the complete CLC message */ memset(&msg, 0, sizeof(struct msghdr)); - iov_iter_kvec(&msg.msg_iter, READ | ITER_KVEC, &vec, 1, datlen); + iov_iter_kvec(&msg.msg_iter, READ, &vec, 1, datlen); krflags = MSG_WAITALL; len = sock_recvmsg(smc->clcsock, &msg, krflags); if (len < datlen || !smc_clc_msg_hdr_valid(clcm)) { 
diff --git a/net/socket.c b/net/socket.c index b68801c7d0ab..fae408abea54 100644 --- a/net/socket.c +++ b/net/socket.c @@ -635,7 +635,7 @@ EXPORT_SYMBOL(sock_sendmsg); int kernel_sendmsg(struct socket *sock, struct msghdr *msg, struct kvec *vec, size_t num, size_t size) { - iov_iter_kvec(&msg->msg_iter, WRITE | ITER_KVEC, vec, num, size); + iov_iter_kvec(&msg->msg_iter, WRITE, vec, num, size); return sock_sendmsg(sock, msg); } EXPORT_SYMBOL(kernel_sendmsg); @@ -648,7 +648,7 @@ int kernel_sendmsg_locked(struct sock *sk, struct msghdr *msg, if (!sock->ops->sendmsg_locked) return sock_no_sendmsg_locked(sk, msg, size); - iov_iter_kvec(&msg->msg_iter, WRITE | ITER_KVEC, vec, num, size); + iov_iter_kvec(&msg->msg_iter, WRITE, vec, num, size); return sock->ops->sendmsg_locked(sk, msg, msg_data_left(msg)); } @@ -823,7 +823,7 @@ int kernel_recvmsg(struct socket *sock, struct msghdr *msg, mm_segment_t oldfs = get_fs(); int result; - iov_iter_kvec(&msg->msg_iter, READ | ITER_KVEC, vec, num, size); + iov_iter_kvec(&msg->msg_iter, READ, vec, num, size); set_fs(KERNEL_DS); result = sock_recvmsg(sock, msg, flags); set_fs(oldfs); diff --git a/net/sunrpc/svcsock.c b/net/sunrpc/svcsock.c index 5445145e639c..0b46ec0bf74e 100644 --- a/net/sunrpc/svcsock.c +++ b/net/sunrpc/svcsock.c @@ -338,7 +338,7 @@ static int svc_recvfrom(struct svc_rqst *rqstp, struct kvec *iov, int nr, rqstp->rq_xprt_hlen = 0; clear_bit(XPT_DATA, &svsk->sk_xprt.xpt_flags); - iov_iter_kvec(&msg.msg_iter, READ | ITER_KVEC, iov, nr, buflen); + iov_iter_kvec(&msg.msg_iter, READ, iov, nr, buflen); len = sock_recvmsg(svsk->sk_sock, &msg, msg.msg_flags); /* If we read a full record, then assume there may be more * data to read (stream based sockets only!) diff --git a/net/tipc/topsrv.c b/net/tipc/topsrv.c index d8956f7daac4..afa02eeec403 100644 --- a/net/tipc/topsrv.c +++ b/net/tipc/topsrv.c @@ -394,7 +394,7 @@ static int tipc_conn_rcv_from_sock(struct tipc_conn *con) iov.iov_base = &s; iov.iov_len = sizeof(s); msg.msg_name = NULL; - iov_iter_kvec(&msg.msg_iter, READ | ITER_KVEC, &iov, 1, iov.iov_len); + iov_iter_kvec(&msg.msg_iter, READ, &iov, 1, iov.iov_len); ret = sock_recvmsg(con->sock, &msg, MSG_DONTWAIT); if (ret == -EWOULDBLOCK) return -EWOULDBLOCK; diff --git a/net/tls/tls_device.c b/net/tls/tls_device.c index 276edbc04f38..d753e362d2d9 100644 --- a/net/tls/tls_device.c +++ b/net/tls/tls_device.c @@ -489,7 +489,7 @@ int tls_device_sendpage(struct sock *sk, struct page *page, iov.iov_base = kaddr + offset; iov.iov_len = size; - iov_iter_kvec(&msg_iter, WRITE | ITER_KVEC, &iov, 1, size); + iov_iter_kvec(&msg_iter, WRITE, &iov, 1, size); rc = tls_push_data(sk, &msg_iter, size, flags, TLS_RECORD_TYPE_DATA); kunmap(page); @@ -538,7 +538,7 @@ static int tls_device_push_pending_record(struct sock *sk, int flags) { struct iov_iter msg_iter; - iov_iter_kvec(&msg_iter, WRITE | ITER_KVEC, NULL, 0, 0); + iov_iter_kvec(&msg_iter, WRITE, NULL, 0, 0); return tls_push_data(sk, &msg_iter, 0, flags, TLS_RECORD_TYPE_DATA); } -- cgit v1.2.3-71-gd317 From 9ea9ce0427aab02a2fd88fc608267cf6952119f1 Mon Sep 17 00:00:00 2001 From: David Howells Date: Sat, 20 Oct 2018 00:57:56 +0100 Subject: iov_iter: Add I/O discard iterator Add a new iterator, ITER_DISCARD, that can only be used in READ mode and just discards any data copied to it. This is useful in a network filesystem for discarding any unwanted data sent by a server. 
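As a hedged illustration of the intended use (a hypothetical caller, not taken from this patch), a receive path could drain unwanted bytes from a socket buffer without allocating a sink buffer:

    static int example_drain_bytes(struct sk_buff *skb, int offset, int count)
    {
            struct iov_iter iter;

            /* Everything "copied" into the iterator is simply dropped. */
            iov_iter_discard(&iter, READ, count);
            return skb_copy_datagram_iter(skb, offset, &iter, count);
    }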
Signed-off-by: David Howells --- include/linux/uio.h | 7 +++++++ lib/iov_iter.c | 55 ++++++++++++++++++++++++++++++++++++++++++++++------- 2 files changed, 55 insertions(+), 7 deletions(-) (limited to 'include/linux') diff --git a/include/linux/uio.h b/include/linux/uio.h index 3d8f1acc142c..55ce99ddb912 100644 --- a/include/linux/uio.h +++ b/include/linux/uio.h @@ -26,6 +26,7 @@ enum iter_type { ITER_KVEC = 2, ITER_BVEC = 4, ITER_PIPE = 8, + ITER_DISCARD = 16, }; struct iov_iter { @@ -72,6 +73,11 @@ static inline bool iov_iter_is_pipe(const struct iov_iter *i) return iov_iter_type(i) == ITER_PIPE; } +static inline bool iov_iter_is_discard(const struct iov_iter *i) +{ + return iov_iter_type(i) == ITER_DISCARD; +} + static inline unsigned char iov_iter_rw(const struct iov_iter *i) { return i->type & (READ | WRITE); @@ -220,6 +226,7 @@ void iov_iter_bvec(struct iov_iter *i, unsigned int direction, const struct bio_ unsigned long nr_segs, size_t count); void iov_iter_pipe(struct iov_iter *i, unsigned int direction, struct pipe_inode_info *pipe, size_t count); +void iov_iter_discard(struct iov_iter *i, unsigned int direction, size_t count); ssize_t iov_iter_get_pages(struct iov_iter *i, struct page **pages, size_t maxsize, unsigned maxpages, size_t *start); ssize_t iov_iter_get_pages_alloc(struct iov_iter *i, struct page ***pages, diff --git a/lib/iov_iter.c b/lib/iov_iter.c index c42b928b15ef..7ebccb5c1637 100644 --- a/lib/iov_iter.c +++ b/lib/iov_iter.c @@ -83,6 +83,7 @@ const struct kvec *kvec; \ struct kvec v; \ iterate_kvec(i, n, v, kvec, skip, (K)) \ + } else if (unlikely(i->type & ITER_DISCARD)) { \ } else { \ const struct iovec *iov; \ struct iovec v; \ @@ -114,6 +115,8 @@ } \ i->nr_segs -= kvec - i->kvec; \ i->kvec = kvec; \ + } else if (unlikely(i->type & ITER_DISCARD)) { \ + skip += n; \ } else { \ const struct iovec *iov; \ struct iovec v; \ @@ -838,7 +841,9 @@ size_t copy_page_to_iter(struct page *page, size_t offset, size_t bytes, size_t wanted = copy_to_iter(kaddr + offset, bytes, i); kunmap_atomic(kaddr); return wanted; - } else if (likely(!iov_iter_is_pipe(i))) + } else if (unlikely(iov_iter_is_discard(i))) + return bytes; + else if (likely(!iov_iter_is_pipe(i))) return copy_page_to_iter_iovec(page, offset, bytes, i); else return copy_page_to_iter_pipe(page, offset, bytes, i); @@ -850,7 +855,7 @@ size_t copy_page_from_iter(struct page *page, size_t offset, size_t bytes, { if (unlikely(!page_copy_sane(page, offset, bytes))) return 0; - if (unlikely(iov_iter_is_pipe(i))) { + if (unlikely(iov_iter_is_pipe(i) || iov_iter_is_discard(i))) { WARN_ON(1); return 0; } @@ -910,7 +915,7 @@ size_t iov_iter_copy_from_user_atomic(struct page *page, kunmap_atomic(kaddr); return 0; } - if (unlikely(iov_iter_is_pipe(i))) { + if (unlikely(iov_iter_is_pipe(i) || iov_iter_is_discard(i))) { kunmap_atomic(kaddr); WARN_ON(1); return 0; @@ -978,6 +983,10 @@ void iov_iter_advance(struct iov_iter *i, size_t size) pipe_advance(i, size); return; } + if (unlikely(iov_iter_is_discard(i))) { + i->count -= size; + return; + } iterate_and_advance(i, size, v, 0, 0, 0) } EXPORT_SYMBOL(iov_iter_advance); @@ -1013,6 +1022,8 @@ void iov_iter_revert(struct iov_iter *i, size_t unroll) pipe_truncate(i); return; } + if (unlikely(iov_iter_is_discard(i))) + return; if (unroll <= i->iov_offset) { i->iov_offset -= unroll; return; @@ -1055,6 +1066,8 @@ size_t iov_iter_single_seg_count(const struct iov_iter *i) return i->count; // it is a silly place, anyway if (i->nr_segs == 1) return i->count; + if 
(unlikely(iov_iter_is_discard(i))) + return i->count; else if (iov_iter_is_bvec(i)) return min(i->count, i->bvec->bv_len - i->iov_offset); else @@ -1103,6 +1116,24 @@ void iov_iter_pipe(struct iov_iter *i, unsigned int direction, } EXPORT_SYMBOL(iov_iter_pipe); +/** + * iov_iter_discard - Initialise an I/O iterator that discards data + * @i: The iterator to initialise. + * @direction: The direction of the transfer. + * @count: The size of the I/O buffer in bytes. + * + * Set up an I/O iterator that just discards everything that's written to it. + * It's only available as a READ iterator. + */ +void iov_iter_discard(struct iov_iter *i, unsigned int direction, size_t count) +{ + BUG_ON(direction != READ); + i->type = ITER_DISCARD | READ; + i->count = count; + i->iov_offset = 0; +} +EXPORT_SYMBOL(iov_iter_discard); + unsigned long iov_iter_alignment(const struct iov_iter *i) { unsigned long res = 0; @@ -1127,7 +1158,7 @@ unsigned long iov_iter_gap_alignment(const struct iov_iter *i) unsigned long res = 0; size_t size = i->count; - if (unlikely(iov_iter_is_pipe(i))) { + if (unlikely(iov_iter_is_pipe(i) || iov_iter_is_discard(i))) { WARN_ON(1); return ~0U; } @@ -1197,6 +1228,9 @@ ssize_t iov_iter_get_pages(struct iov_iter *i, if (unlikely(iov_iter_is_pipe(i))) return pipe_get_pages(i, pages, maxsize, maxpages, start); + if (unlikely(iov_iter_is_discard(i))) + return -EFAULT; + iterate_all_kinds(i, maxsize, v, ({ unsigned long addr = (unsigned long)v.iov_base; size_t len = v.iov_len + (*start = addr & (PAGE_SIZE - 1)); @@ -1274,6 +1308,9 @@ ssize_t iov_iter_get_pages_alloc(struct iov_iter *i, if (unlikely(iov_iter_is_pipe(i))) return pipe_get_pages_alloc(i, pages, maxsize, start); + if (unlikely(iov_iter_is_discard(i))) + return -EFAULT; + iterate_all_kinds(i, maxsize, v, ({ unsigned long addr = (unsigned long)v.iov_base; size_t len = v.iov_len + (*start = addr & (PAGE_SIZE - 1)); @@ -1315,7 +1352,7 @@ size_t csum_and_copy_from_iter(void *addr, size_t bytes, __wsum *csum, __wsum sum, next; size_t off = 0; sum = *csum; - if (unlikely(iov_iter_is_pipe(i))) { + if (unlikely(iov_iter_is_pipe(i) || iov_iter_is_discard(i))) { WARN_ON(1); return 0; } @@ -1357,7 +1394,7 @@ bool csum_and_copy_from_iter_full(void *addr, size_t bytes, __wsum *csum, __wsum sum, next; size_t off = 0; sum = *csum; - if (unlikely(iov_iter_is_pipe(i))) { + if (unlikely(iov_iter_is_pipe(i) || iov_iter_is_discard(i))) { WARN_ON(1); return false; } @@ -1402,7 +1439,7 @@ size_t csum_and_copy_to_iter(const void *addr, size_t bytes, __wsum *csum, __wsum sum, next; size_t off = 0; sum = *csum; - if (unlikely(iov_iter_is_pipe(i))) { + if (unlikely(iov_iter_is_pipe(i) || iov_iter_is_discard(i))) { WARN_ON(1); /* for now */ return 0; } @@ -1444,6 +1481,8 @@ int iov_iter_npages(const struct iov_iter *i, int maxpages) if (!size) return 0; + if (unlikely(iov_iter_is_discard(i))) + return 0; if (unlikely(iov_iter_is_pipe(i))) { struct pipe_inode_info *pipe = i->pipe; @@ -1487,6 +1526,8 @@ const void *dup_iter(struct iov_iter *new, struct iov_iter *old, gfp_t flags) WARN_ON(1); return NULL; } + if (unlikely(iov_iter_is_discard(new))) + return NULL; if (iov_iter_is_bvec(new)) return new->bvec = kmemdup(new->bvec, new->nr_segs * sizeof(struct bio_vec), -- cgit v1.2.3-71-gd317 From 70025f84e5b79627a6739533c4fe7cef5b605886 Mon Sep 17 00:00:00 2001 From: David Howells Date: Tue, 9 Oct 2018 17:46:51 +0100 Subject: KEYS: Provide key type operations for asymmetric key ops [ver #2] Provide five new operations in the key_type struct that can be used to 
provide access to asymmetric key operations. These will be implemented for the asymmetric key type in a later patch and may refer to a key retained in RAM by the kernel or a key retained in crypto hardware. int (*asym_query)(const struct kernel_pkey_params *params, struct kernel_pkey_query *info); int (*asym_eds_op)(struct kernel_pkey_params *params, const void *in, void *out); int (*asym_verify_signature)(struct kernel_pkey_params *params, const void *in, const void *in2); Since encrypt, decrypt and sign are identical in their interfaces, they're rolled together in the asym_eds_op() operation and there's an operation ID in the params argument to distinguish them. Verify is different in that we supply the data and the signature instead and get an error value (or 0) as the only result on the expectation that this may well be how a hardware crypto device may work. Signed-off-by: David Howells Tested-by: Marcel Holtmann Reviewed-by: Marcel Holtmann Reviewed-by: Denis Kenzior Tested-by: Denis Kenzior Signed-off-by: James Morris --- Documentation/security/keys/core.rst | 106 +++++++++++++++++++++++++++++++++++ include/linux/key-type.h | 11 ++++ include/linux/keyctl.h | 46 +++++++++++++++ include/uapi/linux/keyctl.h | 5 ++ 4 files changed, 168 insertions(+) create mode 100644 include/linux/keyctl.h (limited to 'include/linux') diff --git a/Documentation/security/keys/core.rst b/Documentation/security/keys/core.rst index 9ce7256c6edb..c144978479d5 100644 --- a/Documentation/security/keys/core.rst +++ b/Documentation/security/keys/core.rst @@ -1483,6 +1483,112 @@ The structure has a number of fields, some of which are mandatory: attempted key link operation. If there is no match, -EINVAL is returned. + * ``int (*asym_eds_op)(struct kernel_pkey_params *params, + const void *in, void *out);`` + ``int (*asym_verify_signature)(struct kernel_pkey_params *params, + const void *in, const void *in2);`` + + These methods are optional. If provided the first allows a key to be + used to encrypt, decrypt or sign a blob of data, and the second allows a + key to verify a signature. + + In all cases, the following information is provided in the params block:: + + struct kernel_pkey_params { + struct key *key; + const char *encoding; + const char *hash_algo; + char *info; + __u32 in_len; + union { + __u32 out_len; + __u32 in2_len; + }; + enum kernel_pkey_operation op : 8; + }; + + This includes the key to be used; a string indicating the encoding to use + (for instance, "pkcs1" may be used with an RSA key to indicate + RSASSA-PKCS1-v1.5 or RSAES-PKCS1-v1.5 encoding or "raw" if no encoding); + the name of the hash algorithm used to generate the data for a signature + (if appropriate); the sizes of the input and output (or second input) + buffers; and the ID of the operation to be performed. + + For a given operation ID, the input and output buffers are used as + follows:: + + Operation ID in,in_len out,out_len in2,in2_len + ======================= =============== =============== =============== + kernel_pkey_encrypt Raw data Encrypted data - + kernel_pkey_decrypt Encrypted data Raw data - + kernel_pkey_sign Raw data Signature - + kernel_pkey_verify Raw data - Signature + + asym_eds_op() deals with encryption, decryption and signature creation as + specified by params->op. Note that params->op is also set for + asym_verify_signature(). + + Encrypting and signature creation both take raw data in the input buffer + and return the encrypted result in the output buffer. 
Padding may have + been added if an encoding was set. In the case of signature creation, + depending on the encoding, the padding created may need to indicate the + digest algorithm - the name of which should be supplied in hash_algo. + + Decryption takes encrypted data in the input buffer and returns the raw + data in the output buffer. Padding will get checked and stripped off if + an encoding was set. + + Verification takes raw data in the input buffer and the signature in the + second input buffer and checks that the one matches the other. Padding + will be validated. Depending on the encoding, the digest algorithm used + to generate the raw data may need to be indicated in hash_algo. + + If successful, asym_eds_op() should return the number of bytes written + into the output buffer. asym_verify_signature() should return 0. + + A variety of errors may be returned, including EOPNOTSUPP if the operation + is not supported; EKEYREJECTED if verification fails; ENOPKG if the + required crypto isn't available. + + + * ``int (*asym_query)(const struct kernel_pkey_params *params, + struct kernel_pkey_query *info);`` + + This method is optional. If provided it allows information about the + public or asymmetric key held in the key to be determined. + + The parameter block is as for asym_eds_op() and co. but in_len and out_len + are unused. The encoding and hash_algo fields should be used to reduce + the returned buffer/data sizes as appropriate. + + If successful, the following information is filled in:: + + struct kernel_pkey_query { + __u32 supported_ops; + __u32 key_size; + __u16 max_data_size; + __u16 max_sig_size; + __u16 max_enc_size; + __u16 max_dec_size; + }; + + The supported_ops field will contain a bitmask indicating what operations + are supported by the key, including encryption of a blob, decryption of a + blob, signing a blob and verifying the signature on a blob. The following + constants are defined for this:: + + KEYCTL_SUPPORTS_{ENCRYPT,DECRYPT,SIGN,VERIFY} + + The key_size field is the size of the key in bits. max_data_size and + max_sig_size are the maximum raw data and signature sizes for creation and + verification of a signature; max_enc_size and max_dec_size are the maximum + raw data and signature sizes for encryption and decryption. The + max_*_size fields are measured in bytes. + + If successful, 0 will be returned. If the key doesn't support this, + EOPNOTSUPP will be returned. + + Request-Key Callback Service ============================ diff --git a/include/linux/key-type.h b/include/linux/key-type.h index 05d8fb5a06c4..bc9af551fc83 100644 --- a/include/linux/key-type.h +++ b/include/linux/key-type.h @@ -17,6 +17,9 @@ #ifdef CONFIG_KEYS +struct kernel_pkey_query; +struct kernel_pkey_params; + /* * key under-construction record * - passed to the request_key actor if supplied @@ -155,6 +158,14 @@ struct key_type { */ struct key_restriction *(*lookup_restriction)(const char *params); + /* Asymmetric key accessor functions. 
*/ + int (*asym_query)(const struct kernel_pkey_params *params, + struct kernel_pkey_query *info); + int (*asym_eds_op)(struct kernel_pkey_params *params, + const void *in, void *out); + int (*asym_verify_signature)(struct kernel_pkey_params *params, + const void *in, const void *in2); + /* internal fields */ struct list_head link; /* link in types list */ struct lock_class_key lock_class; /* key->sem lock class */ diff --git a/include/linux/keyctl.h b/include/linux/keyctl.h new file mode 100644 index 000000000000..c7c48c79ce0e --- /dev/null +++ b/include/linux/keyctl.h @@ -0,0 +1,46 @@ +/* keyctl kernel bits + * + * Copyright (C) 2016 Red Hat, Inc. All Rights Reserved. + * Written by David Howells (dhowells@redhat.com) + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public Licence + * as published by the Free Software Foundation; either version + * 2 of the Licence, or (at your option) any later version. + */ + +#ifndef __LINUX_KEYCTL_H +#define __LINUX_KEYCTL_H + +#include + +struct kernel_pkey_query { + __u32 supported_ops; /* Which ops are supported */ + __u32 key_size; /* Size of the key in bits */ + __u16 max_data_size; /* Maximum size of raw data to sign in bytes */ + __u16 max_sig_size; /* Maximum size of signature in bytes */ + __u16 max_enc_size; /* Maximum size of encrypted blob in bytes */ + __u16 max_dec_size; /* Maximum size of decrypted blob in bytes */ +}; + +enum kernel_pkey_operation { + kernel_pkey_encrypt, + kernel_pkey_decrypt, + kernel_pkey_sign, + kernel_pkey_verify, +}; + +struct kernel_pkey_params { + struct key *key; + const char *encoding; /* Encoding (eg. "oaep" or "raw" for none) */ + const char *hash_algo; /* Digest algorithm used (eg. "sha1") or NULL if N/A */ + char *info; /* Modified info string to be released later */ + __u32 in_len; /* Input data size */ + union { + __u32 out_len; /* Output buffer size (enc/dec/sign) */ + __u32 in2_len; /* 2nd input data size (verify) */ + }; + enum kernel_pkey_operation op : 8; +}; + +#endif /* __LINUX_KEYCTL_H */ diff --git a/include/uapi/linux/keyctl.h b/include/uapi/linux/keyctl.h index 0f3cb13db8e9..1d1e9f2877af 100644 --- a/include/uapi/linux/keyctl.h +++ b/include/uapi/linux/keyctl.h @@ -82,4 +82,9 @@ struct keyctl_kdf_params { __u32 __spare[8]; }; +#define KEYCTL_SUPPORTS_ENCRYPT 0x01 +#define KEYCTL_SUPPORTS_DECRYPT 0x02 +#define KEYCTL_SUPPORTS_SIGN 0x04 +#define KEYCTL_SUPPORTS_VERIFY 0x08 + #endif /* _LINUX_KEYCTL_H */ -- cgit v1.2.3-71-gd317 From 1383a7ed67490fb00d793e36c7a4d599ff88a64d Mon Sep 17 00:00:00 2001 From: "Darrick J. Wong" Date: Tue, 30 Oct 2018 10:40:31 +1100 Subject: vfs: check file ranges before cloning files Move the file range checks from vfs_clone_file_prep into a separate generic_remap_checks function so that all the checks are collected in a central location. This forms the basis for adding more checks from generic_write_checks that will make cloning's input checking more consistent with write input checking. Signed-off-by: Darrick J. 
Wong Reviewed-by: Christoph Hellwig Reviewed-by: Amir Goldstein Signed-off-by: Dave Chinner --- fs/ocfs2/refcounttree.c | 2 +- fs/read_write.c | 55 ++++++++++----------------------------- fs/xfs/xfs_reflink.c | 2 +- include/linux/fs.h | 9 ++++--- mm/filemap.c | 69 +++++++++++++++++++++++++++++++++++++++++++++++++ 5 files changed, 90 insertions(+), 47 deletions(-) (limited to 'include/linux') diff --git a/fs/ocfs2/refcounttree.c b/fs/ocfs2/refcounttree.c index 7a5ee145c733..19e03936c5e1 100644 --- a/fs/ocfs2/refcounttree.c +++ b/fs/ocfs2/refcounttree.c @@ -4850,7 +4850,7 @@ int ocfs2_reflink_remap_range(struct file *file_in, (OCFS2_I(inode_out)->ip_flags & OCFS2_INODE_SYSTEM_FILE)) goto out_unlock; - ret = vfs_clone_file_prep_inodes(inode_in, pos_in, inode_out, pos_out, + ret = vfs_clone_file_prep(file_in, pos_in, file_out, pos_out, &len, is_dedupe); if (ret <= 0) goto out_unlock; diff --git a/fs/read_write.c b/fs/read_write.c index 260797b01851..d6e8e242a15f 100644 --- a/fs/read_write.c +++ b/fs/read_write.c @@ -1717,13 +1717,12 @@ static int clone_verify_area(struct file *file, loff_t pos, u64 len, bool write) * Returns: 0 for "nothing to clone", 1 for "something to clone", or * the usual negative error code. */ -int vfs_clone_file_prep_inodes(struct inode *inode_in, loff_t pos_in, - struct inode *inode_out, loff_t pos_out, - u64 *len, bool is_dedupe) +int vfs_clone_file_prep(struct file *file_in, loff_t pos_in, + struct file *file_out, loff_t pos_out, + u64 *len, bool is_dedupe) { - loff_t bs = inode_out->i_sb->s_blocksize; - loff_t blen; - loff_t isize; + struct inode *inode_in = file_inode(file_in); + struct inode *inode_out = file_inode(file_out); bool same_inode = (inode_in == inode_out); int ret; @@ -1740,10 +1739,10 @@ int vfs_clone_file_prep_inodes(struct inode *inode_in, loff_t pos_in, if (!S_ISREG(inode_in->i_mode) || !S_ISREG(inode_out->i_mode)) return -EINVAL; - isize = i_size_read(inode_in); - /* Zero length dedupe exits immediately; reflink goes to EOF. */ if (*len == 0) { + loff_t isize = i_size_read(inode_in); + if (is_dedupe || pos_in == isize) return 0; if (pos_in > isize) @@ -1751,36 +1750,11 @@ int vfs_clone_file_prep_inodes(struct inode *inode_in, loff_t pos_in, *len = isize - pos_in; } - /* Ensure offsets don't wrap and the input is inside i_size */ - if (pos_in + *len < pos_in || pos_out + *len < pos_out || - pos_in + *len > isize) - return -EINVAL; - - /* Don't allow dedupe past EOF in the dest file */ - if (is_dedupe) { - loff_t disize; - - disize = i_size_read(inode_out); - if (pos_out >= disize || pos_out + *len > disize) - return -EINVAL; - } - - /* If we're linking to EOF, continue to the block boundary. */ - if (pos_in + *len == isize) - blen = ALIGN(isize, bs) - pos_in; - else - blen = *len; - - /* Only reflink if we're aligned to block boundaries */ - if (!IS_ALIGNED(pos_in, bs) || !IS_ALIGNED(pos_in + blen, bs) || - !IS_ALIGNED(pos_out, bs) || !IS_ALIGNED(pos_out + blen, bs)) - return -EINVAL; - - /* Don't allow overlapped reflink within the same file */ - if (same_inode) { - if (pos_out + blen > pos_in && pos_out < pos_in + blen) - return -EINVAL; - } + /* Check that we don't violate system file offset limits. 
*/ + ret = generic_remap_checks(file_in, pos_in, file_out, pos_out, len, + is_dedupe); + if (ret) + return ret; /* Wait for the completion of any pending IOs on both files */ inode_dio_wait(inode_in); @@ -1813,7 +1787,7 @@ int vfs_clone_file_prep_inodes(struct inode *inode_in, loff_t pos_in, return 1; } -EXPORT_SYMBOL(vfs_clone_file_prep_inodes); +EXPORT_SYMBOL(vfs_clone_file_prep); int do_clone_file_range(struct file *file_in, loff_t pos_in, struct file *file_out, loff_t pos_out, u64 len) @@ -1851,9 +1825,6 @@ int do_clone_file_range(struct file *file_in, loff_t pos_in, if (ret) return ret; - if (pos_in + len > i_size_read(inode_in)) - return -EINVAL; - ret = file_in->f_op->clone_file_range(file_in, pos_in, file_out, pos_out, len); if (!ret) { diff --git a/fs/xfs/xfs_reflink.c b/fs/xfs/xfs_reflink.c index 42ea7bab9144..281d5f53f2ec 100644 --- a/fs/xfs/xfs_reflink.c +++ b/fs/xfs/xfs_reflink.c @@ -1326,7 +1326,7 @@ xfs_reflink_remap_prep( if (IS_DAX(inode_in) || IS_DAX(inode_out)) goto out_unlock; - ret = vfs_clone_file_prep_inodes(inode_in, pos_in, inode_out, pos_out, + ret = vfs_clone_file_prep(file_in, pos_in, file_out, pos_out, len, is_dedupe); if (ret <= 0) goto out_unlock; diff --git a/include/linux/fs.h b/include/linux/fs.h index 897eae8faee1..ba93a6e7dac4 100644 --- a/include/linux/fs.h +++ b/include/linux/fs.h @@ -1825,9 +1825,9 @@ extern ssize_t vfs_readv(struct file *, const struct iovec __user *, unsigned long, loff_t *, rwf_t); extern ssize_t vfs_copy_file_range(struct file *, loff_t , struct file *, loff_t, size_t, unsigned int); -extern int vfs_clone_file_prep_inodes(struct inode *inode_in, loff_t pos_in, - struct inode *inode_out, loff_t pos_out, - u64 *len, bool is_dedupe); +extern int vfs_clone_file_prep(struct file *file_in, loff_t pos_in, + struct file *file_out, loff_t pos_out, + u64 *count, bool is_dedupe); extern int do_clone_file_range(struct file *file_in, loff_t pos_in, struct file *file_out, loff_t pos_out, u64 len); extern int vfs_clone_file_range(struct file *file_in, loff_t pos_in, @@ -2967,6 +2967,9 @@ extern int sb_min_blocksize(struct super_block *, int); extern int generic_file_mmap(struct file *, struct vm_area_struct *); extern int generic_file_readonly_mmap(struct file *, struct vm_area_struct *); extern ssize_t generic_write_checks(struct kiocb *, struct iov_iter *); +extern int generic_remap_checks(struct file *file_in, loff_t pos_in, + struct file *file_out, loff_t pos_out, + uint64_t *count, bool is_dedupe); extern ssize_t generic_file_read_iter(struct kiocb *, struct iov_iter *); extern ssize_t __generic_file_write_iter(struct kiocb *, struct iov_iter *); extern ssize_t generic_file_write_iter(struct kiocb *, struct iov_iter *); diff --git a/mm/filemap.c b/mm/filemap.c index 52517f28e6f4..47e6bfd45a91 100644 --- a/mm/filemap.c +++ b/mm/filemap.c @@ -2974,6 +2974,75 @@ inline ssize_t generic_write_checks(struct kiocb *iocb, struct iov_iter *from) } EXPORT_SYMBOL(generic_write_checks); +/* + * Performs necessary checks before doing a clone. + * + * Can adjust amount of bytes to clone. + * Returns appropriate error code that caller should return or + * zero in case the clone should be allowed. 
+ */ +int generic_remap_checks(struct file *file_in, loff_t pos_in, + struct file *file_out, loff_t pos_out, + uint64_t *req_count, bool is_dedupe) +{ + struct inode *inode_in = file_in->f_mapping->host; + struct inode *inode_out = file_out->f_mapping->host; + uint64_t count = *req_count; + uint64_t bcount; + loff_t size_in, size_out; + loff_t bs = inode_out->i_sb->s_blocksize; + + /* The start of both ranges must be aligned to an fs block. */ + if (!IS_ALIGNED(pos_in, bs) || !IS_ALIGNED(pos_out, bs)) + return -EINVAL; + + /* Ensure offsets don't wrap. */ + if (pos_in + count < pos_in || pos_out + count < pos_out) + return -EINVAL; + + size_in = i_size_read(inode_in); + size_out = i_size_read(inode_out); + + /* Dedupe requires both ranges to be within EOF. */ + if (is_dedupe && + (pos_in >= size_in || pos_in + count > size_in || + pos_out >= size_out || pos_out + count > size_out)) + return -EINVAL; + + /* Ensure the infile range is within the infile. */ + if (pos_in >= size_in) + return -EINVAL; + count = min(count, size_in - (uint64_t)pos_in); + + /* + * If the user wanted us to link to the infile's EOF, round up to the + * next block boundary for this check. + * + * Otherwise, make sure the count is also block-aligned, having + * already confirmed the starting offsets' block alignment. + */ + if (pos_in + count == size_in) { + bcount = ALIGN(size_in, bs) - pos_in; + } else { + if (!IS_ALIGNED(count, bs)) + return -EINVAL; + + bcount = count; + } + + /* Don't allow overlapped cloning within the same file. */ + if (inode_in == inode_out && + pos_out + bcount > pos_in && + pos_out < pos_in + bcount) + return -EINVAL; + + /* For now we don't support changing the length. */ + if (*req_count != count) + return -EINVAL; + + return 0; +} + int pagecache_write_begin(struct file *file, struct address_space *mapping, loff_t pos, unsigned len, unsigned flags, struct page **pagep, void **fsdata) -- cgit v1.2.3-71-gd317 From a83ab01a62e61616ebb8b97f90f568c1214dc10d Mon Sep 17 00:00:00 2001 From: "Darrick J. Wong" Date: Tue, 30 Oct 2018 10:41:08 +1100 Subject: vfs: rename vfs_clone_file_prep to be more descriptive The vfs_clone_file_prep is a generic function to be called by filesystem implementations only. Rename the prefix to generic_ and make it more clear that it applies to remap operations, not just clones. Signed-off-by: Darrick J. Wong Reviewed-by: Amir Goldstein Signed-off-by: Dave Chinner --- fs/ocfs2/refcounttree.c | 2 +- fs/read_write.c | 8 ++++---- fs/xfs/xfs_reflink.c | 2 +- include/linux/fs.h | 6 +++--- 4 files changed, 9 insertions(+), 9 deletions(-) (limited to 'include/linux') diff --git a/fs/ocfs2/refcounttree.c b/fs/ocfs2/refcounttree.c index 19e03936c5e1..36c56dfbe485 100644 --- a/fs/ocfs2/refcounttree.c +++ b/fs/ocfs2/refcounttree.c @@ -4850,7 +4850,7 @@ int ocfs2_reflink_remap_range(struct file *file_in, (OCFS2_I(inode_out)->ip_flags & OCFS2_INODE_SYSTEM_FILE)) goto out_unlock; - ret = vfs_clone_file_prep(file_in, pos_in, file_out, pos_out, + ret = generic_remap_file_range_prep(file_in, pos_in, file_out, pos_out, &len, is_dedupe); if (ret <= 0) goto out_unlock; diff --git a/fs/read_write.c b/fs/read_write.c index f5395d8da741..aca75a97a695 100644 --- a/fs/read_write.c +++ b/fs/read_write.c @@ -1745,9 +1745,9 @@ static int generic_remap_check_len(struct inode *inode_in, * Returns: 0 for "nothing to clone", 1 for "something to clone", or * the usual negative error code. 
*/ -int vfs_clone_file_prep(struct file *file_in, loff_t pos_in, - struct file *file_out, loff_t pos_out, - u64 *len, bool is_dedupe) +int generic_remap_file_range_prep(struct file *file_in, loff_t pos_in, + struct file *file_out, loff_t pos_out, + u64 *len, bool is_dedupe) { struct inode *inode_in = file_inode(file_in); struct inode *inode_out = file_inode(file_out); @@ -1822,7 +1822,7 @@ int vfs_clone_file_prep(struct file *file_in, loff_t pos_in, return 1; } -EXPORT_SYMBOL(vfs_clone_file_prep); +EXPORT_SYMBOL(generic_remap_file_range_prep); int do_clone_file_range(struct file *file_in, loff_t pos_in, struct file *file_out, loff_t pos_out, u64 len) diff --git a/fs/xfs/xfs_reflink.c b/fs/xfs/xfs_reflink.c index 281d5f53f2ec..a7757a128a78 100644 --- a/fs/xfs/xfs_reflink.c +++ b/fs/xfs/xfs_reflink.c @@ -1326,7 +1326,7 @@ xfs_reflink_remap_prep( if (IS_DAX(inode_in) || IS_DAX(inode_out)) goto out_unlock; - ret = vfs_clone_file_prep(file_in, pos_in, file_out, pos_out, + ret = generic_remap_file_range_prep(file_in, pos_in, file_out, pos_out, len, is_dedupe); if (ret <= 0) goto out_unlock; diff --git a/include/linux/fs.h b/include/linux/fs.h index ba93a6e7dac4..55729e1c2e75 100644 --- a/include/linux/fs.h +++ b/include/linux/fs.h @@ -1825,9 +1825,9 @@ extern ssize_t vfs_readv(struct file *, const struct iovec __user *, unsigned long, loff_t *, rwf_t); extern ssize_t vfs_copy_file_range(struct file *, loff_t , struct file *, loff_t, size_t, unsigned int); -extern int vfs_clone_file_prep(struct file *file_in, loff_t pos_in, - struct file *file_out, loff_t pos_out, - u64 *count, bool is_dedupe); +extern int generic_remap_file_range_prep(struct file *file_in, loff_t pos_in, + struct file *file_out, loff_t pos_out, + u64 *count, bool is_dedupe); extern int do_clone_file_range(struct file *file_in, loff_t pos_in, struct file *file_out, loff_t pos_out, u64 len); extern int vfs_clone_file_range(struct file *file_in, loff_t pos_in, -- cgit v1.2.3-71-gd317 From 2e5dfc99f2e61c42083ba742395e7a7b353513d1 Mon Sep 17 00:00:00 2001 From: "Darrick J. Wong" Date: Tue, 30 Oct 2018 10:41:21 +1100 Subject: vfs: combine the clone and dedupe into a single remap_file_range Combine the clone_file_range and dedupe_file_range operations into a single remap_file_range file operation dispatch since they're fundamentally the same operation. The differences between the two can be made in the prep functions. Signed-off-by: Darrick J. Wong Reviewed-by: Amir Goldstein Reviewed-by: Christoph Hellwig Signed-off-by: Dave Chinner --- Documentation/filesystems/porting | 5 +++++ Documentation/filesystems/vfs.txt | 20 +++++++++++------ fs/btrfs/ctree.h | 8 +++---- fs/btrfs/file.c | 3 +-- fs/btrfs/ioctl.c | 45 ++++++++++++++++++++------------------- fs/cifs/cifsfs.c | 22 +++++++++++-------- fs/nfs/nfs4file.c | 10 ++++++--- fs/ocfs2/file.c | 24 +++++++-------------- fs/overlayfs/file.c | 30 +++++++++++++++----------- fs/read_write.c | 18 ++++++++-------- fs/xfs/xfs_file.c | 23 ++++++-------------- include/linux/fs.h | 25 ++++++++++++++++++---- 12 files changed, 127 insertions(+), 106 deletions(-) (limited to 'include/linux') diff --git a/Documentation/filesystems/porting b/Documentation/filesystems/porting index 7b7b845c490a..e6d4466268dd 100644 --- a/Documentation/filesystems/porting +++ b/Documentation/filesystems/porting @@ -622,3 +622,8 @@ in your dentry operations instead. alloc_file_clone(file, flags, ops) does not affect any caller's references. 
On success you get a new struct file sharing the mount/dentry with the original, on failure - ERR_PTR(). +-- +[mandatory] + ->clone_file_range() and ->dedupe_file_range have been replaced with + ->remap_file_range(). See Documentation/filesystems/vfs.txt for more + information. diff --git a/Documentation/filesystems/vfs.txt b/Documentation/filesystems/vfs.txt index a6c6a8af48a2..6f5babfee27b 100644 --- a/Documentation/filesystems/vfs.txt +++ b/Documentation/filesystems/vfs.txt @@ -883,8 +883,9 @@ struct file_operations { unsigned (*mmap_capabilities)(struct file *); #endif ssize_t (*copy_file_range)(struct file *, loff_t, struct file *, loff_t, size_t, unsigned int); - int (*clone_file_range)(struct file *, loff_t, struct file *, loff_t, u64); - int (*dedupe_file_range)(struct file *, loff_t, struct file *, loff_t, u64); + int (*remap_file_range)(struct file *file_in, loff_t pos_in, + struct file *file_out, loff_t pos_out, + u64 len, unsigned int remap_flags); int (*fadvise)(struct file *, loff_t, loff_t, int); }; @@ -960,11 +961,16 @@ otherwise noted. copy_file_range: called by the copy_file_range(2) system call. - clone_file_range: called by the ioctl(2) system call for FICLONERANGE and - FICLONE commands. - - dedupe_file_range: called by the ioctl(2) system call for FIDEDUPERANGE - command. + remap_file_range: called by the ioctl(2) system call for FICLONERANGE and + FICLONE and FIDEDUPERANGE commands to remap file ranges. An + implementation should remap len bytes at pos_in of the source file into + the dest file at pos_out. Implementations must handle callers passing + in len == 0; this means "remap to the end of the source file". The + return value should be zero if all bytes were remapped, or the usual + negative error code if the remapping did not succeed completely. + The remap_flags parameter accepts REMAP_FILE_* flags. If + REMAP_FILE_DEDUP is set then the implementation must only remap if the + requested file ranges have identical contents. fadvise: possibly called by the fadvise64() system call. 
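Before the per-filesystem conversions that follow, a minimal sketch of the new dispatch pattern may help; examplefs_remap_file_range() and examplefs_reflink_range() are hypothetical names, but the flag check and the REMAP_FILE_DEDUP branch mirror the ocfs2 and xfs conversions in this patch:

    static int examplefs_remap_file_range(struct file *file_in, loff_t pos_in,
                                          struct file *file_out, loff_t pos_out,
                                          u64 len, unsigned int remap_flags)
    {
            /* Reject any remap flags this filesystem does not understand. */
            if (remap_flags & ~(REMAP_FILE_DEDUP | REMAP_FILE_ADVISORY))
                    return -EINVAL;

            /* One method now serves both FICLONE{,RANGE} and FIDEDUPERANGE. */
            return examplefs_reflink_range(file_in, pos_in, file_out, pos_out,
                                           len, remap_flags & REMAP_FILE_DEDUP);
    }

The hook is then wired up once in the filesystem's file_operations as .remap_file_range = examplefs_remap_file_range, replacing the separate .clone_file_range and .dedupe_file_range entries.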
diff --git a/fs/btrfs/ctree.h b/fs/btrfs/ctree.h index 2cddfe7806a4..124a05662fc2 100644 --- a/fs/btrfs/ctree.h +++ b/fs/btrfs/ctree.h @@ -3218,9 +3218,6 @@ void btrfs_get_block_group_info(struct list_head *groups_list, struct btrfs_ioctl_space_info *space); void btrfs_update_ioctl_balance_args(struct btrfs_fs_info *fs_info, struct btrfs_ioctl_balance_args *bargs); -int btrfs_dedupe_file_range(struct file *src_file, loff_t src_loff, - struct file *dst_file, loff_t dst_loff, - u64 olen); /* file.c */ int __init btrfs_auto_defrag_init(void); @@ -3250,8 +3247,9 @@ int btrfs_dirty_pages(struct inode *inode, struct page **pages, size_t num_pages, loff_t pos, size_t write_bytes, struct extent_state **cached); int btrfs_fdatawrite_range(struct inode *inode, loff_t start, loff_t end); -int btrfs_clone_file_range(struct file *file_in, loff_t pos_in, - struct file *file_out, loff_t pos_out, u64 len); +int btrfs_remap_file_range(struct file *file_in, loff_t pos_in, + struct file *file_out, loff_t pos_out, u64 len, + unsigned int remap_flags); /* tree-defrag.c */ int btrfs_defrag_leaves(struct btrfs_trans_handle *trans, diff --git a/fs/btrfs/file.c b/fs/btrfs/file.c index 2be00e873e92..9a963f061393 100644 --- a/fs/btrfs/file.c +++ b/fs/btrfs/file.c @@ -3269,8 +3269,7 @@ const struct file_operations btrfs_file_operations = { #ifdef CONFIG_COMPAT .compat_ioctl = btrfs_compat_ioctl, #endif - .clone_file_range = btrfs_clone_file_range, - .dedupe_file_range = btrfs_dedupe_file_range, + .remap_file_range = btrfs_remap_file_range, }; void __cold btrfs_auto_defrag_exit(void) diff --git a/fs/btrfs/ioctl.c b/fs/btrfs/ioctl.c index d60b6caf09e8..bfd99c66723e 100644 --- a/fs/btrfs/ioctl.c +++ b/fs/btrfs/ioctl.c @@ -3627,26 +3627,6 @@ out_unlock: return ret; } -int btrfs_dedupe_file_range(struct file *src_file, loff_t src_loff, - struct file *dst_file, loff_t dst_loff, - u64 olen) -{ - struct inode *src = file_inode(src_file); - struct inode *dst = file_inode(dst_file); - u64 bs = BTRFS_I(src)->root->fs_info->sb->s_blocksize; - - if (WARN_ON_ONCE(bs < PAGE_SIZE)) { - /* - * Btrfs does not support blocksize < page_size. As a - * result, btrfs_cmp_data() won't correctly handle - * this situation without an update. - */ - return -EINVAL; - } - - return btrfs_extent_same(src, src_loff, olen, dst, dst_loff); -} - static int clone_finish_inode_update(struct btrfs_trans_handle *trans, struct inode *inode, u64 endoff, @@ -4348,9 +4328,30 @@ out_unlock: return ret; } -int btrfs_clone_file_range(struct file *src_file, loff_t off, - struct file *dst_file, loff_t destoff, u64 len) +int btrfs_remap_file_range(struct file *src_file, loff_t off, + struct file *dst_file, loff_t destoff, u64 len, + unsigned int remap_flags) { + if (remap_flags & ~(REMAP_FILE_DEDUP | REMAP_FILE_ADVISORY)) + return -EINVAL; + + if (remap_flags & REMAP_FILE_DEDUP) { + struct inode *src = file_inode(src_file); + struct inode *dst = file_inode(dst_file); + u64 bs = BTRFS_I(src)->root->fs_info->sb->s_blocksize; + + if (WARN_ON_ONCE(bs < PAGE_SIZE)) { + /* + * Btrfs does not support blocksize < page_size. As a + * result, btrfs_cmp_data() won't correctly handle + * this situation without an update. 
+ */ + return -EINVAL; + } + + return btrfs_extent_same(src, off, len, dst, destoff); + } + return btrfs_clone_files(dst_file, src_file, off, len, destoff); } diff --git a/fs/cifs/cifsfs.c b/fs/cifs/cifsfs.c index 7065426b3280..e8144d0dcde2 100644 --- a/fs/cifs/cifsfs.c +++ b/fs/cifs/cifsfs.c @@ -975,8 +975,9 @@ const struct inode_operations cifs_symlink_inode_ops = { .listxattr = cifs_listxattr, }; -static int cifs_clone_file_range(struct file *src_file, loff_t off, - struct file *dst_file, loff_t destoff, u64 len) +static int cifs_remap_file_range(struct file *src_file, loff_t off, + struct file *dst_file, loff_t destoff, u64 len, + unsigned int remap_flags) { struct inode *src_inode = file_inode(src_file); struct inode *target_inode = file_inode(dst_file); @@ -986,6 +987,9 @@ static int cifs_clone_file_range(struct file *src_file, loff_t off, unsigned int xid; int rc; + if (remap_flags & ~REMAP_FILE_ADVISORY) + return -EINVAL; + cifs_dbg(FYI, "clone range\n"); xid = get_xid(); @@ -1134,7 +1138,7 @@ const struct file_operations cifs_file_ops = { .llseek = cifs_llseek, .unlocked_ioctl = cifs_ioctl, .copy_file_range = cifs_copy_file_range, - .clone_file_range = cifs_clone_file_range, + .remap_file_range = cifs_remap_file_range, .setlease = cifs_setlease, .fallocate = cifs_fallocate, }; @@ -1153,7 +1157,7 @@ const struct file_operations cifs_file_strict_ops = { .llseek = cifs_llseek, .unlocked_ioctl = cifs_ioctl, .copy_file_range = cifs_copy_file_range, - .clone_file_range = cifs_clone_file_range, + .remap_file_range = cifs_remap_file_range, .setlease = cifs_setlease, .fallocate = cifs_fallocate, }; @@ -1172,7 +1176,7 @@ const struct file_operations cifs_file_direct_ops = { .splice_write = iter_file_splice_write, .unlocked_ioctl = cifs_ioctl, .copy_file_range = cifs_copy_file_range, - .clone_file_range = cifs_clone_file_range, + .remap_file_range = cifs_remap_file_range, .llseek = cifs_llseek, .setlease = cifs_setlease, .fallocate = cifs_fallocate, @@ -1191,7 +1195,7 @@ const struct file_operations cifs_file_nobrl_ops = { .llseek = cifs_llseek, .unlocked_ioctl = cifs_ioctl, .copy_file_range = cifs_copy_file_range, - .clone_file_range = cifs_clone_file_range, + .remap_file_range = cifs_remap_file_range, .setlease = cifs_setlease, .fallocate = cifs_fallocate, }; @@ -1209,7 +1213,7 @@ const struct file_operations cifs_file_strict_nobrl_ops = { .llseek = cifs_llseek, .unlocked_ioctl = cifs_ioctl, .copy_file_range = cifs_copy_file_range, - .clone_file_range = cifs_clone_file_range, + .remap_file_range = cifs_remap_file_range, .setlease = cifs_setlease, .fallocate = cifs_fallocate, }; @@ -1227,7 +1231,7 @@ const struct file_operations cifs_file_direct_nobrl_ops = { .splice_write = iter_file_splice_write, .unlocked_ioctl = cifs_ioctl, .copy_file_range = cifs_copy_file_range, - .clone_file_range = cifs_clone_file_range, + .remap_file_range = cifs_remap_file_range, .llseek = cifs_llseek, .setlease = cifs_setlease, .fallocate = cifs_fallocate, @@ -1239,7 +1243,7 @@ const struct file_operations cifs_dir_ops = { .read = generic_read_dir, .unlocked_ioctl = cifs_ioctl, .copy_file_range = cifs_copy_file_range, - .clone_file_range = cifs_clone_file_range, + .remap_file_range = cifs_remap_file_range, .llseek = generic_file_llseek, .fsync = cifs_dir_fsync, }; diff --git a/fs/nfs/nfs4file.c b/fs/nfs/nfs4file.c index 4288a6ecaf75..ae5780ce41dc 100644 --- a/fs/nfs/nfs4file.c +++ b/fs/nfs/nfs4file.c @@ -180,8 +180,9 @@ static long nfs42_fallocate(struct file *filep, int mode, loff_t offset, loff_t return 
nfs42_proc_allocate(filep, offset, len); } -static int nfs42_clone_file_range(struct file *src_file, loff_t src_off, - struct file *dst_file, loff_t dst_off, u64 count) +static int nfs42_remap_file_range(struct file *src_file, loff_t src_off, + struct file *dst_file, loff_t dst_off, u64 count, + unsigned int remap_flags) { struct inode *dst_inode = file_inode(dst_file); struct nfs_server *server = NFS_SERVER(dst_inode); @@ -190,6 +191,9 @@ static int nfs42_clone_file_range(struct file *src_file, loff_t src_off, bool same_inode = false; int ret; + if (remap_flags & ~REMAP_FILE_ADVISORY) + return -EINVAL; + /* check alignment w.r.t. clone_blksize */ ret = -EINVAL; if (bs) { @@ -262,7 +266,7 @@ const struct file_operations nfs4_file_operations = { .copy_file_range = nfs4_copy_file_range, .llseek = nfs4_file_llseek, .fallocate = nfs42_fallocate, - .clone_file_range = nfs42_clone_file_range, + .remap_file_range = nfs42_remap_file_range, #else .llseek = nfs_file_llseek, #endif diff --git a/fs/ocfs2/file.c b/fs/ocfs2/file.c index 9fa35cb6f6e0..0b757a24567c 100644 --- a/fs/ocfs2/file.c +++ b/fs/ocfs2/file.c @@ -2527,24 +2527,18 @@ out: return offset; } -static int ocfs2_file_clone_range(struct file *file_in, +static int ocfs2_remap_file_range(struct file *file_in, loff_t pos_in, struct file *file_out, loff_t pos_out, - u64 len) + u64 len, + unsigned int remap_flags) { - return ocfs2_reflink_remap_range(file_in, pos_in, file_out, pos_out, - len, false); -} + if (remap_flags & ~(REMAP_FILE_DEDUP | REMAP_FILE_ADVISORY)) + return -EINVAL; -static int ocfs2_file_dedupe_range(struct file *file_in, - loff_t pos_in, - struct file *file_out, - loff_t pos_out, - u64 len) -{ return ocfs2_reflink_remap_range(file_in, pos_in, file_out, pos_out, - len, true); + len, remap_flags & REMAP_FILE_DEDUP); } const struct inode_operations ocfs2_file_iops = { @@ -2586,8 +2580,7 @@ const struct file_operations ocfs2_fops = { .splice_read = generic_file_splice_read, .splice_write = iter_file_splice_write, .fallocate = ocfs2_fallocate, - .clone_file_range = ocfs2_file_clone_range, - .dedupe_file_range = ocfs2_file_dedupe_range, + .remap_file_range = ocfs2_remap_file_range, }; const struct file_operations ocfs2_dops = { @@ -2633,8 +2626,7 @@ const struct file_operations ocfs2_fops_no_plocks = { .splice_read = generic_file_splice_read, .splice_write = iter_file_splice_write, .fallocate = ocfs2_fallocate, - .clone_file_range = ocfs2_file_clone_range, - .dedupe_file_range = ocfs2_file_dedupe_range, + .remap_file_range = ocfs2_remap_file_range, }; const struct file_operations ocfs2_dops_no_plocks = { diff --git a/fs/overlayfs/file.c b/fs/overlayfs/file.c index 986313da0c88..fffb36fd5920 100644 --- a/fs/overlayfs/file.c +++ b/fs/overlayfs/file.c @@ -489,26 +489,31 @@ static ssize_t ovl_copy_file_range(struct file *file_in, loff_t pos_in, OVL_COPY); } -static int ovl_clone_file_range(struct file *file_in, loff_t pos_in, - struct file *file_out, loff_t pos_out, u64 len) +static int ovl_remap_file_range(struct file *file_in, loff_t pos_in, + struct file *file_out, loff_t pos_out, + u64 len, unsigned int remap_flags) { - return ovl_copyfile(file_in, pos_in, file_out, pos_out, len, 0, - OVL_CLONE); -} + enum ovl_copyop op; + + if (remap_flags & ~(REMAP_FILE_DEDUP | REMAP_FILE_ADVISORY)) + return -EINVAL; + + if (remap_flags & REMAP_FILE_DEDUP) + op = OVL_DEDUPE; + else + op = OVL_CLONE; -static int ovl_dedupe_file_range(struct file *file_in, loff_t pos_in, - struct file *file_out, loff_t pos_out, u64 len) -{ /* * Don't copy up because 
of a dedupe request, this wouldn't make sense * most of the time (data would be duplicated instead of deduplicated). */ - if (!ovl_inode_upper(file_inode(file_in)) || - !ovl_inode_upper(file_inode(file_out))) + if (op == OVL_DEDUPE && + (!ovl_inode_upper(file_inode(file_in)) || + !ovl_inode_upper(file_inode(file_out)))) return -EPERM; return ovl_copyfile(file_in, pos_in, file_out, pos_out, len, 0, - OVL_DEDUPE); + op); } const struct file_operations ovl_file_operations = { @@ -525,6 +530,5 @@ const struct file_operations ovl_file_operations = { .compat_ioctl = ovl_compat_ioctl, .copy_file_range = ovl_copy_file_range, - .clone_file_range = ovl_clone_file_range, - .dedupe_file_range = ovl_dedupe_file_range, + .remap_file_range = ovl_remap_file_range, }; diff --git a/fs/read_write.c b/fs/read_write.c index 734c5661fb69..766bdcb381f3 100644 --- a/fs/read_write.c +++ b/fs/read_write.c @@ -1588,9 +1588,9 @@ ssize_t vfs_copy_file_range(struct file *file_in, loff_t pos_in, * Try cloning first, this is supported by more file systems, and * more efficient if both clone and copy are supported (e.g. NFS). */ - if (file_in->f_op->clone_file_range) { - ret = file_in->f_op->clone_file_range(file_in, pos_in, - file_out, pos_out, len); + if (file_in->f_op->remap_file_range) { + ret = file_in->f_op->remap_file_range(file_in, pos_in, + file_out, pos_out, len, 0); if (ret == 0) { ret = len; goto done; @@ -1849,7 +1849,7 @@ int do_clone_file_range(struct file *file_in, loff_t pos_in, (file_out->f_flags & O_APPEND)) return -EBADF; - if (!file_in->f_op->clone_file_range) + if (!file_in->f_op->remap_file_range) return -EOPNOTSUPP; ret = remap_verify_area(file_in, pos_in, len, false); @@ -1860,8 +1860,8 @@ int do_clone_file_range(struct file *file_in, loff_t pos_in, if (ret) return ret; - ret = file_in->f_op->clone_file_range(file_in, pos_in, - file_out, pos_out, len); + ret = file_in->f_op->remap_file_range(file_in, pos_in, + file_out, pos_out, len, 0); if (!ret) { fsnotify_access(file_in); fsnotify_modify(file_out); @@ -2006,7 +2006,7 @@ int vfs_dedupe_file_range_one(struct file *src_file, loff_t src_pos, goto out_drop_write; ret = -EINVAL; - if (!dst_file->f_op->dedupe_file_range) + if (!dst_file->f_op->remap_file_range) goto out_drop_write; if (len == 0) { @@ -2014,8 +2014,8 @@ int vfs_dedupe_file_range_one(struct file *src_file, loff_t src_pos, goto out_drop_write; } - ret = dst_file->f_op->dedupe_file_range(src_file, src_pos, - dst_file, dst_pos, len); + ret = dst_file->f_op->remap_file_range(src_file, src_pos, dst_file, + dst_pos, len, REMAP_FILE_DEDUP); out_drop_write: mnt_drop_write_file(dst_file); diff --git a/fs/xfs/xfs_file.c b/fs/xfs/xfs_file.c index 61a5ad2600e8..2ad94d508f80 100644 --- a/fs/xfs/xfs_file.c +++ b/fs/xfs/xfs_file.c @@ -920,27 +920,19 @@ out_unlock: } STATIC int -xfs_file_clone_range( +xfs_file_remap_range( struct file *file_in, loff_t pos_in, struct file *file_out, loff_t pos_out, - u64 len) + u64 len, + unsigned int remap_flags) { - return xfs_reflink_remap_range(file_in, pos_in, file_out, pos_out, - len, false); -} + if (remap_flags & ~(REMAP_FILE_DEDUP | REMAP_FILE_ADVISORY)) + return -EINVAL; -STATIC int -xfs_file_dedupe_range( - struct file *file_in, - loff_t pos_in, - struct file *file_out, - loff_t pos_out, - u64 len) -{ return xfs_reflink_remap_range(file_in, pos_in, file_out, pos_out, - len, true); + len, remap_flags & REMAP_FILE_DEDUP); } STATIC int @@ -1175,8 +1167,7 @@ const struct file_operations xfs_file_operations = { .fsync = xfs_file_fsync, .get_unmapped_area = 
thp_get_unmapped_area, .fallocate = xfs_file_fallocate, - .clone_file_range = xfs_file_clone_range, - .dedupe_file_range = xfs_file_dedupe_range, + .remap_file_range = xfs_file_remap_range, }; const struct file_operations xfs_dir_file_operations = { diff --git a/include/linux/fs.h b/include/linux/fs.h index 55729e1c2e75..888cef35c7d7 100644 --- a/include/linux/fs.h +++ b/include/linux/fs.h @@ -1721,6 +1721,24 @@ struct block_device_operations; #define NOMMU_VMFLAGS \ (NOMMU_MAP_READ | NOMMU_MAP_WRITE | NOMMU_MAP_EXEC) +/* + * These flags control the behavior of the remap_file_range function pointer. + * If it is called with len == 0 that means "remap to end of source file". + * See Documentation/filesystems/vfs.txt for more details about this call. + * + * REMAP_FILE_DEDUP: only remap if contents identical (i.e. deduplicate) + */ +#define REMAP_FILE_DEDUP (1 << 0) + +/* + * These flags signal that the caller is ok with altering various aspects of + * the behavior of the remap operation. The changes must be made by the + * implementation; the vfs remap helper functions can take advantage of them. + * Flags in this category exist to preserve the quirky behavior of the hoisted + * btrfs clone/dedupe ioctls. + * There are no flags yet, but subsequent commits will add some. + */ +#define REMAP_FILE_ADVISORY (0) struct iov_iter; @@ -1759,10 +1777,9 @@ struct file_operations { #endif ssize_t (*copy_file_range)(struct file *, loff_t, struct file *, loff_t, size_t, unsigned int); - int (*clone_file_range)(struct file *, loff_t, struct file *, loff_t, - u64); - int (*dedupe_file_range)(struct file *, loff_t, struct file *, loff_t, - u64); + int (*remap_file_range)(struct file *file_in, loff_t pos_in, + struct file *file_out, loff_t pos_out, + u64 len, unsigned int remap_flags); int (*fadvise)(struct file *, loff_t, loff_t, int); } __randomize_layout; -- cgit v1.2.3-71-gd317 From a91ae49bbaf43910edb09e03fedf26b23875bd52 Mon Sep 17 00:00:00 2001 From: "Darrick J. Wong" Date: Tue, 30 Oct 2018 10:41:28 +1100 Subject: vfs: pass remap flags to generic_remap_file_range_prep Plumb the remap flags through the filesystem from the vfs function dispatcher all the way to the prep function to prepare for behavior changes in subsequent patches. Signed-off-by: Darrick J. 
Wong Reviewed-by: Amir Goldstein Reviewed-by: Christoph Hellwig Signed-off-by: Dave Chinner --- fs/ocfs2/file.c | 2 +- fs/ocfs2/refcounttree.c | 4 ++-- fs/ocfs2/refcounttree.h | 2 +- fs/read_write.c | 14 +++++++------- fs/xfs/xfs_file.c | 2 +- fs/xfs/xfs_reflink.c | 21 +++++++++++---------- fs/xfs/xfs_reflink.h | 3 ++- include/linux/fs.h | 2 +- 8 files changed, 26 insertions(+), 24 deletions(-) (limited to 'include/linux') diff --git a/fs/ocfs2/file.c b/fs/ocfs2/file.c index 0b757a24567c..9809b0e5746f 100644 --- a/fs/ocfs2/file.c +++ b/fs/ocfs2/file.c @@ -2538,7 +2538,7 @@ static int ocfs2_remap_file_range(struct file *file_in, return -EINVAL; return ocfs2_reflink_remap_range(file_in, pos_in, file_out, pos_out, - len, remap_flags & REMAP_FILE_DEDUP); + len, remap_flags); } const struct inode_operations ocfs2_file_iops = { diff --git a/fs/ocfs2/refcounttree.c b/fs/ocfs2/refcounttree.c index 36c56dfbe485..df9781567ec0 100644 --- a/fs/ocfs2/refcounttree.c +++ b/fs/ocfs2/refcounttree.c @@ -4825,7 +4825,7 @@ int ocfs2_reflink_remap_range(struct file *file_in, struct file *file_out, loff_t pos_out, u64 len, - bool is_dedupe) + unsigned int remap_flags) { struct inode *inode_in = file_inode(file_in); struct inode *inode_out = file_inode(file_out); @@ -4851,7 +4851,7 @@ int ocfs2_reflink_remap_range(struct file *file_in, goto out_unlock; ret = generic_remap_file_range_prep(file_in, pos_in, file_out, pos_out, - &len, is_dedupe); + &len, remap_flags); if (ret <= 0) goto out_unlock; diff --git a/fs/ocfs2/refcounttree.h b/fs/ocfs2/refcounttree.h index 4af55bf4b35b..d2c5f526edff 100644 --- a/fs/ocfs2/refcounttree.h +++ b/fs/ocfs2/refcounttree.h @@ -120,6 +120,6 @@ int ocfs2_reflink_remap_range(struct file *file_in, struct file *file_out, loff_t pos_out, u64 len, - bool is_dedupe); + unsigned int remap_flags); #endif /* OCFS2_REFCOUNTTREE_H */ diff --git a/fs/read_write.c b/fs/read_write.c index 766bdcb381f3..201381689284 100644 --- a/fs/read_write.c +++ b/fs/read_write.c @@ -1722,14 +1722,14 @@ static int generic_remap_check_len(struct inode *inode_in, struct inode *inode_out, loff_t pos_out, u64 *len, - bool is_dedupe) + unsigned int remap_flags) { u64 blkmask = i_blocksize(inode_in) - 1; if ((*len & blkmask) == 0) return 0; - if (is_dedupe) + if (remap_flags & REMAP_FILE_DEDUP) *len &= ~blkmask; else if (pos_out + *len < i_size_read(inode_out)) return -EINVAL; @@ -1747,7 +1747,7 @@ static int generic_remap_check_len(struct inode *inode_in, */ int generic_remap_file_range_prep(struct file *file_in, loff_t pos_in, struct file *file_out, loff_t pos_out, - u64 *len, bool is_dedupe) + u64 *len, unsigned int remap_flags) { struct inode *inode_in = file_inode(file_in); struct inode *inode_out = file_inode(file_out); @@ -1771,7 +1771,7 @@ int generic_remap_file_range_prep(struct file *file_in, loff_t pos_in, if (*len == 0) { loff_t isize = i_size_read(inode_in); - if (is_dedupe || pos_in == isize) + if ((remap_flags & REMAP_FILE_DEDUP) || pos_in == isize) return 0; if (pos_in > isize) return -EINVAL; @@ -1782,7 +1782,7 @@ int generic_remap_file_range_prep(struct file *file_in, loff_t pos_in, /* Check that we don't violate system file offset limits. */ ret = generic_remap_checks(file_in, pos_in, file_out, pos_out, len, - is_dedupe); + (remap_flags & REMAP_FILE_DEDUP)); if (ret) return ret; @@ -1804,7 +1804,7 @@ int generic_remap_file_range_prep(struct file *file_in, loff_t pos_in, /* * Check that the extents are the same. 
*/ - if (is_dedupe) { + if (remap_flags & REMAP_FILE_DEDUP) { bool is_same = false; ret = vfs_dedupe_file_range_compare(inode_in, pos_in, @@ -1816,7 +1816,7 @@ int generic_remap_file_range_prep(struct file *file_in, loff_t pos_in, } ret = generic_remap_check_len(inode_in, inode_out, pos_out, len, - is_dedupe); + remap_flags); if (ret) return ret; diff --git a/fs/xfs/xfs_file.c b/fs/xfs/xfs_file.c index 2ad94d508f80..20314eb4677a 100644 --- a/fs/xfs/xfs_file.c +++ b/fs/xfs/xfs_file.c @@ -932,7 +932,7 @@ xfs_file_remap_range( return -EINVAL; return xfs_reflink_remap_range(file_in, pos_in, file_out, pos_out, - len, remap_flags & REMAP_FILE_DEDUP); + len, remap_flags); } STATIC int diff --git a/fs/xfs/xfs_reflink.c b/fs/xfs/xfs_reflink.c index a7757a128a78..29aab196ce7e 100644 --- a/fs/xfs/xfs_reflink.c +++ b/fs/xfs/xfs_reflink.c @@ -921,13 +921,14 @@ xfs_reflink_update_dest( struct xfs_inode *dest, xfs_off_t newlen, xfs_extlen_t cowextsize, - bool is_dedupe) + unsigned int remap_flags) { struct xfs_mount *mp = dest->i_mount; struct xfs_trans *tp; int error; - if (is_dedupe && newlen <= i_size_read(VFS_I(dest)) && cowextsize == 0) + if ((remap_flags & REMAP_FILE_DEDUP) && + newlen <= i_size_read(VFS_I(dest)) && cowextsize == 0) return 0; error = xfs_trans_alloc(mp, &M_RES(mp)->tr_ichange, 0, 0, 0, &tp); @@ -948,7 +949,7 @@ xfs_reflink_update_dest( dest->i_d.di_flags2 |= XFS_DIFLAG2_COWEXTSIZE; } - if (!is_dedupe) { + if (!(remap_flags & REMAP_FILE_DEDUP)) { xfs_trans_ichgtime(tp, dest, XFS_ICHGTIME_MOD | XFS_ICHGTIME_CHG); } @@ -1296,7 +1297,7 @@ xfs_reflink_remap_prep( struct file *file_out, loff_t pos_out, u64 *len, - bool is_dedupe) + unsigned int remap_flags) { struct inode *inode_in = file_inode(file_in); struct xfs_inode *src = XFS_I(inode_in); @@ -1327,7 +1328,7 @@ xfs_reflink_remap_prep( goto out_unlock; ret = generic_remap_file_range_prep(file_in, pos_in, file_out, pos_out, - len, is_dedupe); + len, remap_flags); if (ret <= 0) goto out_unlock; @@ -1336,7 +1337,7 @@ xfs_reflink_remap_prep( * from the source file so we don't try to dedupe the partial * EOF block. */ - if (is_dedupe) { + if (remap_flags & REMAP_FILE_DEDUP) { *len &= ~blkmask; } else if (*len & blkmask) { /* @@ -1372,7 +1373,7 @@ xfs_reflink_remap_prep( PAGE_ALIGN(pos_out + *len) - 1); /* If we're altering the file contents... */ - if (!is_dedupe) { + if (!(remap_flags & REMAP_FILE_DEDUP)) { /* * ...update the timestamps (which will grab the ilock again * from xfs_fs_dirty_inode, so we have to call it before we @@ -1410,7 +1411,7 @@ xfs_reflink_remap_range( struct file *file_out, loff_t pos_out, u64 len, - bool is_dedupe) + unsigned int remap_flags) { struct inode *inode_in = file_inode(file_in); struct xfs_inode *src = XFS_I(inode_in); @@ -1430,7 +1431,7 @@ xfs_reflink_remap_range( /* Prepare and then clone file data. 
*/ ret = xfs_reflink_remap_prep(file_in, pos_in, file_out, pos_out, - &len, is_dedupe); + &len, remap_flags); if (ret <= 0) return ret; @@ -1457,7 +1458,7 @@ xfs_reflink_remap_range( cowextsize = src->i_d.di_cowextsize; ret = xfs_reflink_update_dest(dest, pos_out + len, cowextsize, - is_dedupe); + remap_flags); out_unlock: xfs_reflink_remap_unlock(file_in, file_out); diff --git a/fs/xfs/xfs_reflink.h b/fs/xfs/xfs_reflink.h index c585ad9552b2..6f82d628bf17 100644 --- a/fs/xfs/xfs_reflink.h +++ b/fs/xfs/xfs_reflink.h @@ -28,7 +28,8 @@ extern int xfs_reflink_end_cow(struct xfs_inode *ip, xfs_off_t offset, xfs_off_t count); extern int xfs_reflink_recover_cow(struct xfs_mount *mp); extern int xfs_reflink_remap_range(struct file *file_in, loff_t pos_in, - struct file *file_out, loff_t pos_out, u64 len, bool is_dedupe); + struct file *file_out, loff_t pos_out, u64 len, + unsigned int remap_flags); extern int xfs_reflink_inode_has_shared_extents(struct xfs_trans *tp, struct xfs_inode *ip, bool *has_shared); extern int xfs_reflink_clear_inode_flag(struct xfs_inode *ip, diff --git a/include/linux/fs.h b/include/linux/fs.h index 888cef35c7d7..631c28ce1436 100644 --- a/include/linux/fs.h +++ b/include/linux/fs.h @@ -1844,7 +1844,7 @@ extern ssize_t vfs_copy_file_range(struct file *, loff_t , struct file *, loff_t, size_t, unsigned int); extern int generic_remap_file_range_prep(struct file *file_in, loff_t pos_in, struct file *file_out, loff_t pos_out, - u64 *count, bool is_dedupe); + u64 *count, unsigned int remap_flags); extern int do_clone_file_range(struct file *file_in, loff_t pos_in, struct file *file_out, loff_t pos_out, u64 len); extern int vfs_clone_file_range(struct file *file_in, loff_t pos_in, -- cgit v1.2.3-71-gd317 From 3d28193e1df043764deb7abdaba5e3a6660bc393 Mon Sep 17 00:00:00 2001 From: "Darrick J. Wong" Date: Tue, 30 Oct 2018 10:41:34 +1100 Subject: vfs: pass remap flags to generic_remap_checks Pass the same remap flags to generic_remap_checks for consistency. Signed-off-by: Darrick J. Wong Reviewed-by: Amir Goldstein Reviewed-by: Christoph Hellwig Signed-off-by: Dave Chinner --- fs/read_write.c | 2 +- include/linux/fs.h | 2 +- mm/filemap.c | 4 ++-- 3 files changed, 4 insertions(+), 4 deletions(-) (limited to 'include/linux') diff --git a/fs/read_write.c b/fs/read_write.c index 201381689284..ebcbfc4f2907 100644 --- a/fs/read_write.c +++ b/fs/read_write.c @@ -1782,7 +1782,7 @@ int generic_remap_file_range_prep(struct file *file_in, loff_t pos_in, /* Check that we don't violate system file offset limits. 
*/ ret = generic_remap_checks(file_in, pos_in, file_out, pos_out, len, - (remap_flags & REMAP_FILE_DEDUP)); + remap_flags); if (ret) return ret; diff --git a/include/linux/fs.h b/include/linux/fs.h index 631c28ce1436..c5435ca81132 100644 --- a/include/linux/fs.h +++ b/include/linux/fs.h @@ -2986,7 +2986,7 @@ extern int generic_file_readonly_mmap(struct file *, struct vm_area_struct *); extern ssize_t generic_write_checks(struct kiocb *, struct iov_iter *); extern int generic_remap_checks(struct file *file_in, loff_t pos_in, struct file *file_out, loff_t pos_out, - uint64_t *count, bool is_dedupe); + uint64_t *count, unsigned int remap_flags); extern ssize_t generic_file_read_iter(struct kiocb *, struct iov_iter *); extern ssize_t __generic_file_write_iter(struct kiocb *, struct iov_iter *); extern ssize_t generic_file_write_iter(struct kiocb *, struct iov_iter *); diff --git a/mm/filemap.c b/mm/filemap.c index 84b7301e41a0..410dc58f7b16 100644 --- a/mm/filemap.c +++ b/mm/filemap.c @@ -2994,7 +2994,7 @@ EXPORT_SYMBOL(generic_write_checks); */ int generic_remap_checks(struct file *file_in, loff_t pos_in, struct file *file_out, loff_t pos_out, - uint64_t *req_count, bool is_dedupe) + uint64_t *req_count, unsigned int remap_flags) { struct inode *inode_in = file_in->f_mapping->host; struct inode *inode_out = file_out->f_mapping->host; @@ -3016,7 +3016,7 @@ int generic_remap_checks(struct file *file_in, loff_t pos_in, size_out = i_size_read(inode_out); /* Dedupe requires both ranges to be within EOF. */ - if (is_dedupe && + if ((remap_flags & REMAP_FILE_DEDUP) && (pos_in >= size_in || pos_in + count > size_in || pos_out >= size_out || pos_out + count > size_out)) return -EINVAL; -- cgit v1.2.3-71-gd317 From 42ec3d4c02187a18e27ff94b409ec27234bf2ffd Mon Sep 17 00:00:00 2001 From: "Darrick J. Wong" Date: Tue, 30 Oct 2018 10:41:49 +1100 Subject: vfs: make remap_file_range functions take and return bytes completed Change the remap_file_range functions to take a number of bytes to operate upon and return the number of bytes they operated on. This is a requirement for allowing fs implementations to return short clone/dedupe results to the user, which will enable us to obey resource limits in a graceful manner. A subsequent patch will enable copy_file_range to signal to the ->clone_file_range implementation that it can handle a short length, which will be returned in the function's return value. For now the short return is not implemented anywhere so the behavior won't change -- either copy_file_range manages to clone the entire range or it tries an alternative. Neither clone ioctl can take advantage of this, alas. Signed-off-by: Darrick J. 
Wong Reviewed-by: Amir Goldstein Signed-off-by: Dave Chinner --- Documentation/filesystems/vfs.txt | 10 ++++---- fs/btrfs/ctree.h | 6 ++--- fs/btrfs/ioctl.c | 13 +++++++---- fs/cifs/cifsfs.c | 6 ++--- fs/ioctl.c | 10 +++++++- fs/nfs/nfs4file.c | 6 ++--- fs/nfsd/vfs.c | 8 +++++-- fs/ocfs2/file.c | 16 ++++++------- fs/ocfs2/refcounttree.c | 2 +- fs/ocfs2/refcounttree.h | 2 +- fs/overlayfs/copy_up.c | 6 ++--- fs/overlayfs/file.c | 12 +++++----- fs/read_write.c | 49 +++++++++++++++++++++------------------ fs/xfs/xfs_file.c | 9 ++++--- fs/xfs/xfs_reflink.c | 4 ++-- fs/xfs/xfs_reflink.h | 2 +- include/linux/fs.h | 27 +++++++++++---------- mm/filemap.c | 2 +- 18 files changed, 108 insertions(+), 82 deletions(-) (limited to 'include/linux') diff --git a/Documentation/filesystems/vfs.txt b/Documentation/filesystems/vfs.txt index 6f5babfee27b..1bd2919deaca 100644 --- a/Documentation/filesystems/vfs.txt +++ b/Documentation/filesystems/vfs.txt @@ -883,9 +883,9 @@ struct file_operations { unsigned (*mmap_capabilities)(struct file *); #endif ssize_t (*copy_file_range)(struct file *, loff_t, struct file *, loff_t, size_t, unsigned int); - int (*remap_file_range)(struct file *file_in, loff_t pos_in, - struct file *file_out, loff_t pos_out, - u64 len, unsigned int remap_flags); + loff_t (*remap_file_range)(struct file *file_in, loff_t pos_in, + struct file *file_out, loff_t pos_out, + loff_t len, unsigned int remap_flags); int (*fadvise)(struct file *, loff_t, loff_t, int); }; @@ -966,8 +966,8 @@ otherwise noted. implementation should remap len bytes at pos_in of the source file into the dest file at pos_out. Implementations must handle callers passing in len == 0; this means "remap to the end of the source file". The - return value should be zero if all bytes were remapped, or the usual - negative error code if the remapping did not succeed completely. + return value should be the number of bytes remapped, or the usual + negative error code if errors occurred before any bytes were remapped. The remap_flags parameter accepts REMAP_FILE_* flags. If REMAP_FILE_DEDUP is set then the implementation must only remap if the requested file ranges have identical contents. 
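For orientation, the conversion pattern repeated in the per-filesystem hunks below can be sketched as a single hypothetical adapter (the myfs_* names are placeholders, not code from this series): a filesystem that cannot yet perform short remaps passes errors through and reports the full length on success.

static loff_t myfs_remap_file_range(struct file *file_in, loff_t pos_in,
				    struct file *file_out, loff_t pos_out,
				    loff_t len, unsigned int remap_flags)
{
	int ret;

	if (remap_flags & ~(REMAP_FILE_DEDUP | REMAP_FILE_ADVISORY))
		return -EINVAL;

	/* Old-style helper: returns 0 on success, negative errno on error. */
	ret = myfs_reflink_remap_range(file_in, pos_in, file_out, pos_out,
				       len, remap_flags);

	/* New contract: report bytes remapped; errors pass straight through. */
	return ret < 0 ? ret : len;
}

The same "ret < 0 ? ret : len" translation appears in the btrfs, cifs, nfs, ocfs2 and xfs changes that follow.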
diff --git a/fs/btrfs/ctree.h b/fs/btrfs/ctree.h index 124a05662fc2..771a961d77ad 100644 --- a/fs/btrfs/ctree.h +++ b/fs/btrfs/ctree.h @@ -3247,9 +3247,9 @@ int btrfs_dirty_pages(struct inode *inode, struct page **pages, size_t num_pages, loff_t pos, size_t write_bytes, struct extent_state **cached); int btrfs_fdatawrite_range(struct inode *inode, loff_t start, loff_t end); -int btrfs_remap_file_range(struct file *file_in, loff_t pos_in, - struct file *file_out, loff_t pos_out, u64 len, - unsigned int remap_flags); +loff_t btrfs_remap_file_range(struct file *file_in, loff_t pos_in, + struct file *file_out, loff_t pos_out, + loff_t len, unsigned int remap_flags); /* tree-defrag.c */ int btrfs_defrag_leaves(struct btrfs_trans_handle *trans, diff --git a/fs/btrfs/ioctl.c b/fs/btrfs/ioctl.c index bfd99c66723e..b0c513e10977 100644 --- a/fs/btrfs/ioctl.c +++ b/fs/btrfs/ioctl.c @@ -4328,10 +4328,12 @@ out_unlock: return ret; } -int btrfs_remap_file_range(struct file *src_file, loff_t off, - struct file *dst_file, loff_t destoff, u64 len, +loff_t btrfs_remap_file_range(struct file *src_file, loff_t off, + struct file *dst_file, loff_t destoff, loff_t len, unsigned int remap_flags) { + int ret; + if (remap_flags & ~(REMAP_FILE_DEDUP | REMAP_FILE_ADVISORY)) return -EINVAL; @@ -4349,10 +4351,11 @@ int btrfs_remap_file_range(struct file *src_file, loff_t off, return -EINVAL; } - return btrfs_extent_same(src, off, len, dst, destoff); + ret = btrfs_extent_same(src, off, len, dst, destoff); + } else { + ret = btrfs_clone_files(dst_file, src_file, off, len, destoff); } - - return btrfs_clone_files(dst_file, src_file, off, len, destoff); + return ret < 0 ? ret : len; } static long btrfs_ioctl_default_subvol(struct file *file, void __user *argp) diff --git a/fs/cifs/cifsfs.c b/fs/cifs/cifsfs.c index e8144d0dcde2..5ca71c6c8be2 100644 --- a/fs/cifs/cifsfs.c +++ b/fs/cifs/cifsfs.c @@ -975,8 +975,8 @@ const struct inode_operations cifs_symlink_inode_ops = { .listxattr = cifs_listxattr, }; -static int cifs_remap_file_range(struct file *src_file, loff_t off, - struct file *dst_file, loff_t destoff, u64 len, +static loff_t cifs_remap_file_range(struct file *src_file, loff_t off, + struct file *dst_file, loff_t destoff, loff_t len, unsigned int remap_flags) { struct inode *src_inode = file_inode(src_file); @@ -1029,7 +1029,7 @@ static int cifs_remap_file_range(struct file *src_file, loff_t off, unlock_two_nondirectories(src_inode, target_inode); out: free_xid(xid); - return rc; + return rc < 0 ? 
rc : len; } ssize_t cifs_file_copychunk_range(unsigned int xid, diff --git a/fs/ioctl.c b/fs/ioctl.c index 2005529af560..72537b68c272 100644 --- a/fs/ioctl.c +++ b/fs/ioctl.c @@ -223,6 +223,7 @@ static long ioctl_file_clone(struct file *dst_file, unsigned long srcfd, u64 off, u64 olen, u64 destoff) { struct fd src_file = fdget(srcfd); + loff_t cloned; int ret; if (!src_file.file) @@ -230,7 +231,14 @@ static long ioctl_file_clone(struct file *dst_file, unsigned long srcfd, ret = -EXDEV; if (src_file.file->f_path.mnt != dst_file->f_path.mnt) goto fdput; - ret = vfs_clone_file_range(src_file.file, off, dst_file, destoff, olen); + cloned = vfs_clone_file_range(src_file.file, off, dst_file, destoff, + olen); + if (cloned < 0) + ret = cloned; + else if (olen && cloned != olen) + ret = -EINVAL; + else + ret = 0; fdput: fdput(src_file); return ret; diff --git a/fs/nfs/nfs4file.c b/fs/nfs/nfs4file.c index ae5780ce41dc..46d691ba04bc 100644 --- a/fs/nfs/nfs4file.c +++ b/fs/nfs/nfs4file.c @@ -180,8 +180,8 @@ static long nfs42_fallocate(struct file *filep, int mode, loff_t offset, loff_t return nfs42_proc_allocate(filep, offset, len); } -static int nfs42_remap_file_range(struct file *src_file, loff_t src_off, - struct file *dst_file, loff_t dst_off, u64 count, +static loff_t nfs42_remap_file_range(struct file *src_file, loff_t src_off, + struct file *dst_file, loff_t dst_off, loff_t count, unsigned int remap_flags) { struct inode *dst_inode = file_inode(dst_file); @@ -244,7 +244,7 @@ out_unlock: inode_unlock(src_inode); } out: - return ret; + return ret < 0 ? ret : count; } #endif /* CONFIG_NFS_V4_2 */ diff --git a/fs/nfsd/vfs.c b/fs/nfsd/vfs.c index b53e76391e52..ac6cb6101cbe 100644 --- a/fs/nfsd/vfs.c +++ b/fs/nfsd/vfs.c @@ -541,8 +541,12 @@ __be32 nfsd4_set_nfs4_label(struct svc_rqst *rqstp, struct svc_fh *fhp, __be32 nfsd4_clone_file_range(struct file *src, u64 src_pos, struct file *dst, u64 dst_pos, u64 count) { - return nfserrno(vfs_clone_file_range(src, src_pos, dst, dst_pos, - count)); + loff_t cloned; + + cloned = vfs_clone_file_range(src, src_pos, dst, dst_pos, count); + if (count && cloned != count) + cloned = -EINVAL; + return nfserrno(cloned < 0 ? cloned : 0); } ssize_t nfsd_copy_file_range(struct file *src, u64 src_pos, struct file *dst, diff --git a/fs/ocfs2/file.c b/fs/ocfs2/file.c index 9809b0e5746f..fbaeafe44b5f 100644 --- a/fs/ocfs2/file.c +++ b/fs/ocfs2/file.c @@ -2527,18 +2527,18 @@ out: return offset; } -static int ocfs2_remap_file_range(struct file *file_in, - loff_t pos_in, - struct file *file_out, - loff_t pos_out, - u64 len, - unsigned int remap_flags) +static loff_t ocfs2_remap_file_range(struct file *file_in, loff_t pos_in, + struct file *file_out, loff_t pos_out, + loff_t len, unsigned int remap_flags) { + int ret; + if (remap_flags & ~(REMAP_FILE_DEDUP | REMAP_FILE_ADVISORY)) return -EINVAL; - return ocfs2_reflink_remap_range(file_in, pos_in, file_out, pos_out, - len, remap_flags); + ret = ocfs2_reflink_remap_range(file_in, pos_in, file_out, pos_out, + len, remap_flags); + return ret < 0 ? 
ret : len; } const struct inode_operations ocfs2_file_iops = { diff --git a/fs/ocfs2/refcounttree.c b/fs/ocfs2/refcounttree.c index df9781567ec0..6a42c04ac0ab 100644 --- a/fs/ocfs2/refcounttree.c +++ b/fs/ocfs2/refcounttree.c @@ -4824,7 +4824,7 @@ int ocfs2_reflink_remap_range(struct file *file_in, loff_t pos_in, struct file *file_out, loff_t pos_out, - u64 len, + loff_t len, unsigned int remap_flags) { struct inode *inode_in = file_inode(file_in); diff --git a/fs/ocfs2/refcounttree.h b/fs/ocfs2/refcounttree.h index d2c5f526edff..eb65c1d0843c 100644 --- a/fs/ocfs2/refcounttree.h +++ b/fs/ocfs2/refcounttree.h @@ -119,7 +119,7 @@ int ocfs2_reflink_remap_range(struct file *file_in, loff_t pos_in, struct file *file_out, loff_t pos_out, - u64 len, + loff_t len, unsigned int remap_flags); #endif /* OCFS2_REFCOUNTTREE_H */ diff --git a/fs/overlayfs/copy_up.c b/fs/overlayfs/copy_up.c index 1cc797a08a5b..8750b7235516 100644 --- a/fs/overlayfs/copy_up.c +++ b/fs/overlayfs/copy_up.c @@ -125,6 +125,7 @@ static int ovl_copy_up_data(struct path *old, struct path *new, loff_t len) struct file *new_file; loff_t old_pos = 0; loff_t new_pos = 0; + loff_t cloned; int error = 0; if (len == 0) @@ -141,11 +142,10 @@ static int ovl_copy_up_data(struct path *old, struct path *new, loff_t len) } /* Try to use clone_file_range to clone up within the same fs */ - error = do_clone_file_range(old_file, 0, new_file, 0, len); - if (!error) + cloned = do_clone_file_range(old_file, 0, new_file, 0, len); + if (cloned == len) goto out; /* Couldn't clone, so now we try to copy the data */ - error = 0; /* FIXME: copy up sparse files efficiently */ while (len) { diff --git a/fs/overlayfs/file.c b/fs/overlayfs/file.c index fffb36fd5920..6c3fec6168e9 100644 --- a/fs/overlayfs/file.c +++ b/fs/overlayfs/file.c @@ -434,14 +434,14 @@ enum ovl_copyop { OVL_DEDUPE, }; -static ssize_t ovl_copyfile(struct file *file_in, loff_t pos_in, +static loff_t ovl_copyfile(struct file *file_in, loff_t pos_in, struct file *file_out, loff_t pos_out, - u64 len, unsigned int flags, enum ovl_copyop op) + loff_t len, unsigned int flags, enum ovl_copyop op) { struct inode *inode_out = file_inode(file_out); struct fd real_in, real_out; const struct cred *old_cred; - ssize_t ret; + loff_t ret; ret = ovl_real_fdget(file_out, &real_out); if (ret) @@ -489,9 +489,9 @@ static ssize_t ovl_copy_file_range(struct file *file_in, loff_t pos_in, OVL_COPY); } -static int ovl_remap_file_range(struct file *file_in, loff_t pos_in, - struct file *file_out, loff_t pos_out, - u64 len, unsigned int remap_flags) +static loff_t ovl_remap_file_range(struct file *file_in, loff_t pos_in, + struct file *file_out, loff_t pos_out, + loff_t len, unsigned int remap_flags) { enum ovl_copyop op; diff --git a/fs/read_write.c b/fs/read_write.c index b61bd3fc7154..356641afa487 100644 --- a/fs/read_write.c +++ b/fs/read_write.c @@ -1589,10 +1589,13 @@ ssize_t vfs_copy_file_range(struct file *file_in, loff_t pos_in, * more efficient if both clone and copy are supported (e.g. NFS). 
*/ if (file_in->f_op->remap_file_range) { - ret = file_in->f_op->remap_file_range(file_in, pos_in, - file_out, pos_out, len, 0); - if (ret == 0) { - ret = len; + loff_t cloned; + + cloned = file_in->f_op->remap_file_range(file_in, pos_in, + file_out, pos_out, + min_t(loff_t, MAX_RW_COUNT, len), 0); + if (cloned > 0) { + ret = cloned; goto done; } } @@ -1686,11 +1689,12 @@ out2: return ret; } -static int remap_verify_area(struct file *file, loff_t pos, u64 len, bool write) +static int remap_verify_area(struct file *file, loff_t pos, loff_t len, + bool write) { struct inode *inode = file_inode(file); - if (unlikely(pos < 0)) + if (unlikely(pos < 0 || len < 0)) return -EINVAL; if (unlikely((loff_t) (pos + len) < 0)) @@ -1721,7 +1725,7 @@ static int remap_verify_area(struct file *file, loff_t pos, u64 len, bool write) static int generic_remap_check_len(struct inode *inode_in, struct inode *inode_out, loff_t pos_out, - u64 *len, + loff_t *len, unsigned int remap_flags) { u64 blkmask = i_blocksize(inode_in) - 1; @@ -1747,7 +1751,7 @@ static int generic_remap_check_len(struct inode *inode_in, */ int generic_remap_file_range_prep(struct file *file_in, loff_t pos_in, struct file *file_out, loff_t pos_out, - u64 *len, unsigned int remap_flags) + loff_t *len, unsigned int remap_flags) { struct inode *inode_in = file_inode(file_in); struct inode *inode_out = file_inode(file_out); @@ -1843,12 +1847,12 @@ int generic_remap_file_range_prep(struct file *file_in, loff_t pos_in, } EXPORT_SYMBOL(generic_remap_file_range_prep); -int do_clone_file_range(struct file *file_in, loff_t pos_in, - struct file *file_out, loff_t pos_out, u64 len) +loff_t do_clone_file_range(struct file *file_in, loff_t pos_in, + struct file *file_out, loff_t pos_out, loff_t len) { struct inode *inode_in = file_inode(file_in); struct inode *inode_out = file_inode(file_out); - int ret; + loff_t ret; if (S_ISDIR(inode_in->i_mode) || S_ISDIR(inode_out->i_mode)) return -EISDIR; @@ -1881,19 +1885,19 @@ int do_clone_file_range(struct file *file_in, loff_t pos_in, ret = file_in->f_op->remap_file_range(file_in, pos_in, file_out, pos_out, len, 0); - if (!ret) { - fsnotify_access(file_in); - fsnotify_modify(file_out); - } + if (ret < 0) + return ret; + fsnotify_access(file_in); + fsnotify_modify(file_out); return ret; } EXPORT_SYMBOL(do_clone_file_range); -int vfs_clone_file_range(struct file *file_in, loff_t pos_in, - struct file *file_out, loff_t pos_out, u64 len) +loff_t vfs_clone_file_range(struct file *file_in, loff_t pos_in, + struct file *file_out, loff_t pos_out, loff_t len) { - int ret; + loff_t ret; file_start_write(file_out); ret = do_clone_file_range(file_in, pos_in, file_out, pos_out, len); @@ -1999,10 +2003,11 @@ out_error: } EXPORT_SYMBOL(vfs_dedupe_file_range_compare); -int vfs_dedupe_file_range_one(struct file *src_file, loff_t src_pos, - struct file *dst_file, loff_t dst_pos, u64 len) +loff_t vfs_dedupe_file_range_one(struct file *src_file, loff_t src_pos, + struct file *dst_file, loff_t dst_pos, + loff_t len) { - s64 ret; + loff_t ret; ret = mnt_want_write_file(dst_file); if (ret) @@ -2051,7 +2056,7 @@ int vfs_dedupe_file_range(struct file *file, struct file_dedupe_range *same) int i; int ret; u16 count = same->dest_count; - int deduped; + loff_t deduped; if (!(file->f_mode & FMODE_READ)) return -EINVAL; diff --git a/fs/xfs/xfs_file.c b/fs/xfs/xfs_file.c index 20314eb4677a..38fde4e11714 100644 --- a/fs/xfs/xfs_file.c +++ b/fs/xfs/xfs_file.c @@ -919,20 +919,23 @@ out_unlock: return error; } -STATIC int +STATIC loff_t 
xfs_file_remap_range( struct file *file_in, loff_t pos_in, struct file *file_out, loff_t pos_out, - u64 len, + loff_t len, unsigned int remap_flags) { + int ret; + if (remap_flags & ~(REMAP_FILE_DEDUP | REMAP_FILE_ADVISORY)) return -EINVAL; - return xfs_reflink_remap_range(file_in, pos_in, file_out, pos_out, + ret = xfs_reflink_remap_range(file_in, pos_in, file_out, pos_out, len, remap_flags); + return ret < 0 ? ret : len; } STATIC int diff --git a/fs/xfs/xfs_reflink.c b/fs/xfs/xfs_reflink.c index 2d7dd8b28d7c..3dbe5fb7e9c0 100644 --- a/fs/xfs/xfs_reflink.c +++ b/fs/xfs/xfs_reflink.c @@ -1296,7 +1296,7 @@ xfs_reflink_remap_prep( loff_t pos_in, struct file *file_out, loff_t pos_out, - u64 *len, + loff_t *len, unsigned int remap_flags) { struct inode *inode_in = file_inode(file_in); @@ -1387,7 +1387,7 @@ xfs_reflink_remap_range( loff_t pos_in, struct file *file_out, loff_t pos_out, - u64 len, + loff_t len, unsigned int remap_flags) { struct inode *inode_in = file_inode(file_in); diff --git a/fs/xfs/xfs_reflink.h b/fs/xfs/xfs_reflink.h index 6f82d628bf17..c3c46c276fe1 100644 --- a/fs/xfs/xfs_reflink.h +++ b/fs/xfs/xfs_reflink.h @@ -28,7 +28,7 @@ extern int xfs_reflink_end_cow(struct xfs_inode *ip, xfs_off_t offset, xfs_off_t count); extern int xfs_reflink_recover_cow(struct xfs_mount *mp); extern int xfs_reflink_remap_range(struct file *file_in, loff_t pos_in, - struct file *file_out, loff_t pos_out, u64 len, + struct file *file_out, loff_t pos_out, loff_t len, unsigned int remap_flags); extern int xfs_reflink_inode_has_shared_extents(struct xfs_trans *tp, struct xfs_inode *ip, bool *has_shared); diff --git a/include/linux/fs.h b/include/linux/fs.h index c5435ca81132..c72d8c3c065a 100644 --- a/include/linux/fs.h +++ b/include/linux/fs.h @@ -1777,9 +1777,9 @@ struct file_operations { #endif ssize_t (*copy_file_range)(struct file *, loff_t, struct file *, loff_t, size_t, unsigned int); - int (*remap_file_range)(struct file *file_in, loff_t pos_in, - struct file *file_out, loff_t pos_out, - u64 len, unsigned int remap_flags); + loff_t (*remap_file_range)(struct file *file_in, loff_t pos_in, + struct file *file_out, loff_t pos_out, + loff_t len, unsigned int remap_flags); int (*fadvise)(struct file *, loff_t, loff_t, int); } __randomize_layout; @@ -1844,19 +1844,22 @@ extern ssize_t vfs_copy_file_range(struct file *, loff_t , struct file *, loff_t, size_t, unsigned int); extern int generic_remap_file_range_prep(struct file *file_in, loff_t pos_in, struct file *file_out, loff_t pos_out, - u64 *count, unsigned int remap_flags); -extern int do_clone_file_range(struct file *file_in, loff_t pos_in, - struct file *file_out, loff_t pos_out, u64 len); -extern int vfs_clone_file_range(struct file *file_in, loff_t pos_in, - struct file *file_out, loff_t pos_out, u64 len); + loff_t *count, + unsigned int remap_flags); +extern loff_t do_clone_file_range(struct file *file_in, loff_t pos_in, + struct file *file_out, loff_t pos_out, + loff_t len); +extern loff_t vfs_clone_file_range(struct file *file_in, loff_t pos_in, + struct file *file_out, loff_t pos_out, + loff_t len); extern int vfs_dedupe_file_range_compare(struct inode *src, loff_t srcoff, struct inode *dest, loff_t destoff, loff_t len, bool *is_same); extern int vfs_dedupe_file_range(struct file *file, struct file_dedupe_range *same); -extern int vfs_dedupe_file_range_one(struct file *src_file, loff_t src_pos, - struct file *dst_file, loff_t dst_pos, - u64 len); +extern loff_t vfs_dedupe_file_range_one(struct file *src_file, loff_t src_pos, + struct 
file *dst_file, loff_t dst_pos, + loff_t len); struct super_operations { @@ -2986,7 +2989,7 @@ extern int generic_file_readonly_mmap(struct file *, struct vm_area_struct *); extern ssize_t generic_write_checks(struct kiocb *, struct iov_iter *); extern int generic_remap_checks(struct file *file_in, loff_t pos_in, struct file *file_out, loff_t pos_out, - uint64_t *count, unsigned int remap_flags); + loff_t *count, unsigned int remap_flags); extern ssize_t generic_file_read_iter(struct kiocb *, struct iov_iter *); extern ssize_t __generic_file_write_iter(struct kiocb *, struct iov_iter *); extern ssize_t generic_file_write_iter(struct kiocb *, struct iov_iter *); diff --git a/mm/filemap.c b/mm/filemap.c index 410dc58f7b16..e9091d731f84 100644 --- a/mm/filemap.c +++ b/mm/filemap.c @@ -2994,7 +2994,7 @@ EXPORT_SYMBOL(generic_write_checks); */ int generic_remap_checks(struct file *file_in, loff_t pos_in, struct file *file_out, loff_t pos_out, - uint64_t *req_count, unsigned int remap_flags) + loff_t *req_count, unsigned int remap_flags) { struct inode *inode_in = file_in->f_mapping->host; struct inode *inode_out = file_out->f_mapping->host; -- cgit v1.2.3-71-gd317 From 452ce65951a2f0719e4e119ecca134c06cfe22ee Mon Sep 17 00:00:00 2001 From: "Darrick J. Wong" Date: Tue, 30 Oct 2018 10:41:56 +1100 Subject: vfs: plumb remap flags through the vfs clone functions Plumb a remap_flags argument through the {do,vfs}_clone_file_range functions so that clone can take advantage of it. Signed-off-by: Darrick J. Wong Reviewed-by: Amir Goldstein Signed-off-by: Dave Chinner --- fs/ioctl.c | 2 +- fs/nfsd/vfs.c | 2 +- fs/overlayfs/copy_up.c | 2 +- fs/overlayfs/file.c | 6 +++--- fs/read_write.c | 13 +++++++++---- include/linux/fs.h | 4 ++-- 6 files changed, 17 insertions(+), 12 deletions(-) (limited to 'include/linux') diff --git a/fs/ioctl.c b/fs/ioctl.c index 72537b68c272..505275ec5596 100644 --- a/fs/ioctl.c +++ b/fs/ioctl.c @@ -232,7 +232,7 @@ static long ioctl_file_clone(struct file *dst_file, unsigned long srcfd, if (src_file.file->f_path.mnt != dst_file->f_path.mnt) goto fdput; cloned = vfs_clone_file_range(src_file.file, off, dst_file, destoff, - olen); + olen, 0); if (cloned < 0) ret = cloned; else if (olen && cloned != olen) diff --git a/fs/nfsd/vfs.c b/fs/nfsd/vfs.c index ac6cb6101cbe..726fc5b2b27a 100644 --- a/fs/nfsd/vfs.c +++ b/fs/nfsd/vfs.c @@ -543,7 +543,7 @@ __be32 nfsd4_clone_file_range(struct file *src, u64 src_pos, struct file *dst, { loff_t cloned; - cloned = vfs_clone_file_range(src, src_pos, dst, dst_pos, count); + cloned = vfs_clone_file_range(src, src_pos, dst, dst_pos, count, 0); if (count && cloned != count) cloned = -EINVAL; return nfserrno(cloned < 0 ? 
cloned : 0); diff --git a/fs/overlayfs/copy_up.c b/fs/overlayfs/copy_up.c index 8750b7235516..5f82fece64a0 100644 --- a/fs/overlayfs/copy_up.c +++ b/fs/overlayfs/copy_up.c @@ -142,7 +142,7 @@ static int ovl_copy_up_data(struct path *old, struct path *new, loff_t len) } /* Try to use clone_file_range to clone up within the same fs */ - cloned = do_clone_file_range(old_file, 0, new_file, 0, len); + cloned = do_clone_file_range(old_file, 0, new_file, 0, len, 0); if (cloned == len) goto out; /* Couldn't clone, so now we try to copy the data */ diff --git a/fs/overlayfs/file.c b/fs/overlayfs/file.c index 6c3fec6168e9..0393815c8971 100644 --- a/fs/overlayfs/file.c +++ b/fs/overlayfs/file.c @@ -462,7 +462,7 @@ static loff_t ovl_copyfile(struct file *file_in, loff_t pos_in, case OVL_CLONE: ret = vfs_clone_file_range(real_in.file, pos_in, - real_out.file, pos_out, len); + real_out.file, pos_out, len, flags); break; case OVL_DEDUPE: @@ -512,8 +512,8 @@ static loff_t ovl_remap_file_range(struct file *file_in, loff_t pos_in, !ovl_inode_upper(file_inode(file_out)))) return -EPERM; - return ovl_copyfile(file_in, pos_in, file_out, pos_out, len, 0, - op); + return ovl_copyfile(file_in, pos_in, file_out, pos_out, len, + remap_flags, op); } const struct file_operations ovl_file_operations = { diff --git a/fs/read_write.c b/fs/read_write.c index 356641afa487..0d1ac1b9bc22 100644 --- a/fs/read_write.c +++ b/fs/read_write.c @@ -1848,12 +1848,15 @@ int generic_remap_file_range_prep(struct file *file_in, loff_t pos_in, EXPORT_SYMBOL(generic_remap_file_range_prep); loff_t do_clone_file_range(struct file *file_in, loff_t pos_in, - struct file *file_out, loff_t pos_out, loff_t len) + struct file *file_out, loff_t pos_out, + loff_t len, unsigned int remap_flags) { struct inode *inode_in = file_inode(file_in); struct inode *inode_out = file_inode(file_out); loff_t ret; + WARN_ON_ONCE(remap_flags); + if (S_ISDIR(inode_in->i_mode) || S_ISDIR(inode_out->i_mode)) return -EISDIR; if (!S_ISREG(inode_in->i_mode) || !S_ISREG(inode_out->i_mode)) @@ -1884,7 +1887,7 @@ loff_t do_clone_file_range(struct file *file_in, loff_t pos_in, return ret; ret = file_in->f_op->remap_file_range(file_in, pos_in, - file_out, pos_out, len, 0); + file_out, pos_out, len, remap_flags); if (ret < 0) return ret; @@ -1895,12 +1898,14 @@ loff_t do_clone_file_range(struct file *file_in, loff_t pos_in, EXPORT_SYMBOL(do_clone_file_range); loff_t vfs_clone_file_range(struct file *file_in, loff_t pos_in, - struct file *file_out, loff_t pos_out, loff_t len) + struct file *file_out, loff_t pos_out, + loff_t len, unsigned int remap_flags) { loff_t ret; file_start_write(file_out); - ret = do_clone_file_range(file_in, pos_in, file_out, pos_out, len); + ret = do_clone_file_range(file_in, pos_in, file_out, pos_out, len, + remap_flags); file_end_write(file_out); return ret; diff --git a/include/linux/fs.h b/include/linux/fs.h index c72d8c3c065a..1c5e55d2a67d 100644 --- a/include/linux/fs.h +++ b/include/linux/fs.h @@ -1848,10 +1848,10 @@ extern int generic_remap_file_range_prep(struct file *file_in, loff_t pos_in, unsigned int remap_flags); extern loff_t do_clone_file_range(struct file *file_in, loff_t pos_in, struct file *file_out, loff_t pos_out, - loff_t len); + loff_t len, unsigned int remap_flags); extern loff_t vfs_clone_file_range(struct file *file_in, loff_t pos_in, struct file *file_out, loff_t pos_out, - loff_t len); + loff_t len, unsigned int remap_flags); extern int vfs_dedupe_file_range_compare(struct inode *src, loff_t srcoff, struct inode *dest, 
loff_t destoff, loff_t len, bool *is_same); -- cgit v1.2.3-71-gd317 From df3658361951e17364f1e1c3fa92862a990ad8bd Mon Sep 17 00:00:00 2001 From: "Darrick J. Wong" Date: Tue, 30 Oct 2018 10:42:03 +1100 Subject: vfs: plumb remap flags through the vfs dedupe functions Plumb a remap_flags argument through the vfs_dedupe_file_range_one functions so that dedupe can take advantage of it. Signed-off-by: Darrick J. Wong Reviewed-by: Amir Goldstein Signed-off-by: Dave Chinner --- fs/overlayfs/file.c | 3 ++- fs/read_write.c | 9 ++++++--- include/linux/fs.h | 2 +- 3 files changed, 9 insertions(+), 5 deletions(-) (limited to 'include/linux') diff --git a/fs/overlayfs/file.c b/fs/overlayfs/file.c index 0393815c8971..84dd957efa24 100644 --- a/fs/overlayfs/file.c +++ b/fs/overlayfs/file.c @@ -467,7 +467,8 @@ static loff_t ovl_copyfile(struct file *file_in, loff_t pos_in, case OVL_DEDUPE: ret = vfs_dedupe_file_range_one(real_in.file, pos_in, - real_out.file, pos_out, len); + real_out.file, pos_out, len, + flags); break; } revert_creds(old_cred); diff --git a/fs/read_write.c b/fs/read_write.c index 0d1ac1b9bc22..ea30666013b0 100644 --- a/fs/read_write.c +++ b/fs/read_write.c @@ -2010,10 +2010,12 @@ EXPORT_SYMBOL(vfs_dedupe_file_range_compare); loff_t vfs_dedupe_file_range_one(struct file *src_file, loff_t src_pos, struct file *dst_file, loff_t dst_pos, - loff_t len) + loff_t len, unsigned int remap_flags) { loff_t ret; + WARN_ON_ONCE(remap_flags & ~(REMAP_FILE_DEDUP)); + ret = mnt_want_write_file(dst_file); if (ret) return ret; @@ -2044,7 +2046,7 @@ loff_t vfs_dedupe_file_range_one(struct file *src_file, loff_t src_pos, } ret = dst_file->f_op->remap_file_range(src_file, src_pos, dst_file, - dst_pos, len, REMAP_FILE_DEDUP); + dst_pos, len, remap_flags | REMAP_FILE_DEDUP); out_drop_write: mnt_drop_write_file(dst_file); @@ -2112,7 +2114,8 @@ int vfs_dedupe_file_range(struct file *file, struct file_dedupe_range *same) } deduped = vfs_dedupe_file_range_one(file, off, dst_file, - info->dest_offset, len); + info->dest_offset, len, + 0); if (deduped == -EBADE) info->status = FILE_DEDUPE_RANGE_DIFFERS; else if (deduped < 0) diff --git a/include/linux/fs.h b/include/linux/fs.h index 1c5e55d2a67d..544ab5083b48 100644 --- a/include/linux/fs.h +++ b/include/linux/fs.h @@ -1859,7 +1859,7 @@ extern int vfs_dedupe_file_range(struct file *file, struct file_dedupe_range *same); extern loff_t vfs_dedupe_file_range_one(struct file *src_file, loff_t src_pos, struct file *dst_file, loff_t dst_pos, - loff_t len); + loff_t len, unsigned int remap_flags); struct super_operations { -- cgit v1.2.3-71-gd317 From eca3654e3cc7d93e9734d0fa96cfb15c7f356244 Mon Sep 17 00:00:00 2001 From: "Darrick J. Wong" Date: Tue, 30 Oct 2018 10:42:10 +1100 Subject: vfs: enable remap callers that can handle short operations Plumb in a remap flag that enables the filesystem remap handler to shorten remapping requests for callers that can handle it. Now copy_file_range can report partial success (in case we run up against alignment problems, resource limits, etc.). We also enable CAN_SHORTEN for fideduperange to maintain existing userspace-visible behavior where xfs/btrfs shorten the dedupe range to avoid stale post-eof data exposure. Signed-off-by: Darrick J. 
Wong Reviewed-by: Amir Goldstein Signed-off-by: Dave Chinner --- Documentation/filesystems/vfs.txt | 4 +++- fs/read_write.c | 28 ++++++++++++++++++++-------- include/linux/fs.h | 5 +++-- mm/filemap.c | 11 +++++++---- 4 files changed, 33 insertions(+), 15 deletions(-) (limited to 'include/linux') diff --git a/Documentation/filesystems/vfs.txt b/Documentation/filesystems/vfs.txt index 1bd2919deaca..5f71a252e2e0 100644 --- a/Documentation/filesystems/vfs.txt +++ b/Documentation/filesystems/vfs.txt @@ -970,7 +970,9 @@ otherwise noted. negative error code if errors occurred before any bytes were remapped. The remap_flags parameter accepts REMAP_FILE_* flags. If REMAP_FILE_DEDUP is set then the implementation must only remap if the - requested file ranges have identical contents. + requested file ranges have identical contents. If REMAP_CAN_SHORTEN is + set, the caller is ok with the implementation shortening the request + length to satisfy alignment or EOF requirements (or any other reason). fadvise: possibly called by the fadvise64() system call. diff --git a/fs/read_write.c b/fs/read_write.c index ea30666013b0..c0bcc1a20650 100644 --- a/fs/read_write.c +++ b/fs/read_write.c @@ -1593,7 +1593,8 @@ ssize_t vfs_copy_file_range(struct file *file_in, loff_t pos_in, cloned = file_in->f_op->remap_file_range(file_in, pos_in, file_out, pos_out, - min_t(loff_t, MAX_RW_COUNT, len), 0); + min_t(loff_t, MAX_RW_COUNT, len), + REMAP_FILE_CAN_SHORTEN); if (cloned > 0) { ret = cloned; goto done; @@ -1721,6 +1722,8 @@ static int remap_verify_area(struct file *file, loff_t pos, loff_t len, * can't meaningfully compare post-EOF contents. * * For clone we only link a partial EOF block above the destination file's EOF. + * + * Shorten the request if possible. */ static int generic_remap_check_len(struct inode *inode_in, struct inode *inode_out, @@ -1729,16 +1732,24 @@ static int generic_remap_check_len(struct inode *inode_in, unsigned int remap_flags) { u64 blkmask = i_blocksize(inode_in) - 1; + loff_t new_len = *len; if ((*len & blkmask) == 0) return 0; - if (remap_flags & REMAP_FILE_DEDUP) - *len &= ~blkmask; - else if (pos_out + *len < i_size_read(inode_out)) - return -EINVAL; + if ((remap_flags & REMAP_FILE_DEDUP) || + pos_out + *len < i_size_read(inode_out)) + new_len &= ~blkmask; - return 0; + if (new_len == *len) + return 0; + + if (remap_flags & REMAP_FILE_CAN_SHORTEN) { + *len = new_len; + return 0; + } + + return (remap_flags & REMAP_FILE_DEDUP) ? -EBADE : -EINVAL; } /* @@ -2014,7 +2025,8 @@ loff_t vfs_dedupe_file_range_one(struct file *src_file, loff_t src_pos, { loff_t ret; - WARN_ON_ONCE(remap_flags & ~(REMAP_FILE_DEDUP)); + WARN_ON_ONCE(remap_flags & ~(REMAP_FILE_DEDUP | + REMAP_FILE_CAN_SHORTEN)); ret = mnt_want_write_file(dst_file); if (ret) @@ -2115,7 +2127,7 @@ int vfs_dedupe_file_range(struct file *file, struct file_dedupe_range *same) deduped = vfs_dedupe_file_range_one(file, off, dst_file, info->dest_offset, len, - 0); + REMAP_FILE_CAN_SHORTEN); if (deduped == -EBADE) info->status = FILE_DEDUPE_RANGE_DIFFERS; else if (deduped < 0) diff --git a/include/linux/fs.h b/include/linux/fs.h index 544ab5083b48..34c22d695011 100644 --- a/include/linux/fs.h +++ b/include/linux/fs.h @@ -1727,8 +1727,10 @@ struct block_device_operations; * See Documentation/filesystems/vfs.txt for more details about this call. * * REMAP_FILE_DEDUP: only remap if contents identical (i.e. 
deduplicate) + * REMAP_FILE_CAN_SHORTEN: caller can handle a shortened request */ #define REMAP_FILE_DEDUP (1 << 0) +#define REMAP_FILE_CAN_SHORTEN (1 << 1) /* * These flags signal that the caller is ok with altering various aspects of @@ -1736,9 +1738,8 @@ struct block_device_operations; * implementation; the vfs remap helper functions can take advantage of them. * Flags in this category exist to preserve the quirky behavior of the hoisted * btrfs clone/dedupe ioctls. - * There are no flags yet, but subsequent commits will add some. */ -#define REMAP_FILE_ADVISORY (0) +#define REMAP_FILE_ADVISORY (REMAP_FILE_CAN_SHORTEN) struct iov_iter; diff --git a/mm/filemap.c b/mm/filemap.c index e9091d731f84..1775d4ad3317 100644 --- a/mm/filemap.c +++ b/mm/filemap.c @@ -3045,8 +3045,7 @@ int generic_remap_checks(struct file *file_in, loff_t pos_in, bcount = ALIGN(size_in, bs) - pos_in; } else { if (!IS_ALIGNED(count, bs)) - return -EINVAL; - + count = ALIGN_DOWN(count, bs); bcount = count; } @@ -3056,10 +3055,14 @@ int generic_remap_checks(struct file *file_in, loff_t pos_in, pos_out < pos_in + bcount) return -EINVAL; - /* For now we don't support changing the length. */ - if (*req_count != count) + /* + * We shortened the request but the caller can't deal with that, so + * bounce the request back to userspace. + */ + if (*req_count != count && !(remap_flags & REMAP_FILE_CAN_SHORTEN)) return -EINVAL; + *req_count = count; return 0; } -- cgit v1.2.3-71-gd317 From c32e5f39953fa6bbff35c655bdcb7b3128f1e79f Mon Sep 17 00:00:00 2001 From: "Darrick J. Wong" Date: Tue, 30 Oct 2018 10:42:17 +1100 Subject: vfs: hide file range comparison function There are no callers of vfs_dedupe_file_range_compare, so we might as well make it a static helper and remove the export. Signed-off-by: Darrick J. Wong Reviewed-by: Amir Goldstein Reviewed-by: Christoph Hellwig Signed-off-by: Dave Chinner --- fs/read_write.c | 187 ++++++++++++++++++++++++++--------------------------- include/linux/fs.h | 3 - 2 files changed, 91 insertions(+), 99 deletions(-) (limited to 'include/linux') diff --git a/fs/read_write.c b/fs/read_write.c index c0bcc1a20650..e4d295d0d236 100644 --- a/fs/read_write.c +++ b/fs/read_write.c @@ -1752,6 +1752,97 @@ static int generic_remap_check_len(struct inode *inode_in, return (remap_flags & REMAP_FILE_DEDUP) ? -EBADE : -EINVAL; } +/* + * Read a page's worth of file data into the page cache. Return the page + * locked. + */ +static struct page *vfs_dedupe_get_page(struct inode *inode, loff_t offset) +{ + struct page *page; + + page = read_mapping_page(inode->i_mapping, offset >> PAGE_SHIFT, NULL); + if (IS_ERR(page)) + return page; + if (!PageUptodate(page)) { + put_page(page); + return ERR_PTR(-EIO); + } + lock_page(page); + return page; +} + +/* + * Compare extents of two files to see if they are the same. + * Caller must have locked both inodes to prevent write races. 
+ */ +static int vfs_dedupe_file_range_compare(struct inode *src, loff_t srcoff, + struct inode *dest, loff_t destoff, + loff_t len, bool *is_same) +{ + loff_t src_poff; + loff_t dest_poff; + void *src_addr; + void *dest_addr; + struct page *src_page; + struct page *dest_page; + loff_t cmp_len; + bool same; + int error; + + error = -EINVAL; + same = true; + while (len) { + src_poff = srcoff & (PAGE_SIZE - 1); + dest_poff = destoff & (PAGE_SIZE - 1); + cmp_len = min(PAGE_SIZE - src_poff, + PAGE_SIZE - dest_poff); + cmp_len = min(cmp_len, len); + if (cmp_len <= 0) + goto out_error; + + src_page = vfs_dedupe_get_page(src, srcoff); + if (IS_ERR(src_page)) { + error = PTR_ERR(src_page); + goto out_error; + } + dest_page = vfs_dedupe_get_page(dest, destoff); + if (IS_ERR(dest_page)) { + error = PTR_ERR(dest_page); + unlock_page(src_page); + put_page(src_page); + goto out_error; + } + src_addr = kmap_atomic(src_page); + dest_addr = kmap_atomic(dest_page); + + flush_dcache_page(src_page); + flush_dcache_page(dest_page); + + if (memcmp(src_addr + src_poff, dest_addr + dest_poff, cmp_len)) + same = false; + + kunmap_atomic(dest_addr); + kunmap_atomic(src_addr); + unlock_page(dest_page); + unlock_page(src_page); + put_page(dest_page); + put_page(src_page); + + if (!same) + break; + + srcoff += cmp_len; + destoff += cmp_len; + len -= cmp_len; + } + + *is_same = same; + return 0; + +out_error: + return error; +} + /* * Check that the two inodes are eligible for cloning, the ranges make * sense, and then flush all dirty data. Caller must ensure that the @@ -1923,102 +2014,6 @@ loff_t vfs_clone_file_range(struct file *file_in, loff_t pos_in, } EXPORT_SYMBOL(vfs_clone_file_range); -/* - * Read a page's worth of file data into the page cache. Return the page - * locked. - */ -static struct page *vfs_dedupe_get_page(struct inode *inode, loff_t offset) -{ - struct address_space *mapping; - struct page *page; - pgoff_t n; - - n = offset >> PAGE_SHIFT; - mapping = inode->i_mapping; - page = read_mapping_page(mapping, n, NULL); - if (IS_ERR(page)) - return page; - if (!PageUptodate(page)) { - put_page(page); - return ERR_PTR(-EIO); - } - lock_page(page); - return page; -} - -/* - * Compare extents of two files to see if they are the same. - * Caller must have locked both inodes to prevent write races. 
- */ -int vfs_dedupe_file_range_compare(struct inode *src, loff_t srcoff, - struct inode *dest, loff_t destoff, - loff_t len, bool *is_same) -{ - loff_t src_poff; - loff_t dest_poff; - void *src_addr; - void *dest_addr; - struct page *src_page; - struct page *dest_page; - loff_t cmp_len; - bool same; - int error; - - error = -EINVAL; - same = true; - while (len) { - src_poff = srcoff & (PAGE_SIZE - 1); - dest_poff = destoff & (PAGE_SIZE - 1); - cmp_len = min(PAGE_SIZE - src_poff, - PAGE_SIZE - dest_poff); - cmp_len = min(cmp_len, len); - if (cmp_len <= 0) - goto out_error; - - src_page = vfs_dedupe_get_page(src, srcoff); - if (IS_ERR(src_page)) { - error = PTR_ERR(src_page); - goto out_error; - } - dest_page = vfs_dedupe_get_page(dest, destoff); - if (IS_ERR(dest_page)) { - error = PTR_ERR(dest_page); - unlock_page(src_page); - put_page(src_page); - goto out_error; - } - src_addr = kmap_atomic(src_page); - dest_addr = kmap_atomic(dest_page); - - flush_dcache_page(src_page); - flush_dcache_page(dest_page); - - if (memcmp(src_addr + src_poff, dest_addr + dest_poff, cmp_len)) - same = false; - - kunmap_atomic(dest_addr); - kunmap_atomic(src_addr); - unlock_page(dest_page); - unlock_page(src_page); - put_page(dest_page); - put_page(src_page); - - if (!same) - break; - - srcoff += cmp_len; - destoff += cmp_len; - len -= cmp_len; - } - - *is_same = same; - return 0; - -out_error: - return error; -} -EXPORT_SYMBOL(vfs_dedupe_file_range_compare); - loff_t vfs_dedupe_file_range_one(struct file *src_file, loff_t src_pos, struct file *dst_file, loff_t dst_pos, loff_t len, unsigned int remap_flags) diff --git a/include/linux/fs.h b/include/linux/fs.h index 34c22d695011..346036a84f18 100644 --- a/include/linux/fs.h +++ b/include/linux/fs.h @@ -1853,9 +1853,6 @@ extern loff_t do_clone_file_range(struct file *file_in, loff_t pos_in, extern loff_t vfs_clone_file_range(struct file *file_in, loff_t pos_in, struct file *file_out, loff_t pos_out, loff_t len, unsigned int remap_flags); -extern int vfs_dedupe_file_range_compare(struct inode *src, loff_t srcoff, - struct inode *dest, loff_t destoff, - loff_t len, bool *is_same); extern int vfs_dedupe_file_range(struct file *file, struct file_dedupe_range *same); extern loff_t vfs_dedupe_file_range_one(struct file *src_file, loff_t src_pos, -- cgit v1.2.3-71-gd317 From 966c37f2d77eb44d47af8e919267b1ba675b2eca Mon Sep 17 00:00:00 2001 From: Hangbin Liu Date: Fri, 26 Oct 2018 11:30:35 +0800 Subject: ipv4/igmp: fix v1/v2 switchback timeout based on rfc3376, 8.12 Similiar with ipv6 mcast commit 89225d1ce6af3 ("net: ipv6: mld: fix v1/v2 switchback timeout to rfc3810, 9.12.") i) RFC3376 8.12. Older Version Querier Present Timeout says: The Older Version Querier Interval is the time-out for transitioning a host back to IGMPv3 mode once an older version query is heard. When an older version query is received, hosts set their Older Version Querier Present Timer to Older Version Querier Interval. This value MUST be ((the Robustness Variable) times (the Query Interval in the last Query received)) plus (one Query Response Interval). Currently we only use a hardcode value IGMP_V1/v2_ROUTER_PRESENT_TIMEOUT. Fix it by adding two new items mr_qi(Query Interval) and mr_qri(Query Response Interval) in struct in_device. Now we can calculate the switchback time via (mr_qrv * mr_qi) + mr_qri. We need update these values when receive IGMPv3 queries. Reported-by: Ying Xu Signed-off-by: Hangbin Liu Signed-off-by: David S. 
Miller --- include/linux/inetdevice.h | 4 +++- net/ipv4/igmp.c | 53 +++++++++++++++++++++++++++++++--------------- 2 files changed, 39 insertions(+), 18 deletions(-) (limited to 'include/linux') diff --git a/include/linux/inetdevice.h b/include/linux/inetdevice.h index c759d1cbcedd..a64f21a97369 100644 --- a/include/linux/inetdevice.h +++ b/include/linux/inetdevice.h @@ -37,7 +37,9 @@ struct in_device { unsigned long mr_v1_seen; unsigned long mr_v2_seen; unsigned long mr_maxdelay; - unsigned char mr_qrv; + unsigned long mr_qi; /* Query Interval */ + unsigned long mr_qri; /* Query Response Interval */ + unsigned char mr_qrv; /* Query Robustness Variable */ unsigned char mr_gq_running; unsigned char mr_ifc_count; struct timer_list mr_gq_timer; /* general query timer */ diff --git a/net/ipv4/igmp.c b/net/ipv4/igmp.c index 4da39446da2d..765b2b32c4a4 100644 --- a/net/ipv4/igmp.c +++ b/net/ipv4/igmp.c @@ -111,13 +111,10 @@ #ifdef CONFIG_IP_MULTICAST /* Parameter names and values are taken from igmp-v2-06 draft */ -#define IGMP_V1_ROUTER_PRESENT_TIMEOUT (400*HZ) -#define IGMP_V2_ROUTER_PRESENT_TIMEOUT (400*HZ) #define IGMP_V2_UNSOLICITED_REPORT_INTERVAL (10*HZ) #define IGMP_V3_UNSOLICITED_REPORT_INTERVAL (1*HZ) +#define IGMP_QUERY_INTERVAL (125*HZ) #define IGMP_QUERY_RESPONSE_INTERVAL (10*HZ) -#define IGMP_QUERY_ROBUSTNESS_VARIABLE 2 - #define IGMP_INITIAL_REPORT_DELAY (1) @@ -935,13 +932,15 @@ static bool igmp_heard_query(struct in_device *in_dev, struct sk_buff *skb, max_delay = IGMP_QUERY_RESPONSE_INTERVAL; in_dev->mr_v1_seen = jiffies + - IGMP_V1_ROUTER_PRESENT_TIMEOUT; + (in_dev->mr_qrv * in_dev->mr_qi) + + in_dev->mr_qri; group = 0; } else { /* v2 router present */ max_delay = ih->code*(HZ/IGMP_TIMER_SCALE); in_dev->mr_v2_seen = jiffies + - IGMP_V2_ROUTER_PRESENT_TIMEOUT; + (in_dev->mr_qrv * in_dev->mr_qi) + + in_dev->mr_qri; } /* cancel the interface change timer */ in_dev->mr_ifc_count = 0; @@ -981,8 +980,21 @@ static bool igmp_heard_query(struct in_device *in_dev, struct sk_buff *skb, if (!max_delay) max_delay = 1; /* can't mod w/ 0 */ in_dev->mr_maxdelay = max_delay; - if (ih3->qrv) - in_dev->mr_qrv = ih3->qrv; + + /* RFC3376, 4.1.6. QRV and 4.1.7. QQIC, when the most recently + * received value was zero, use the default or statically + * configured value. + */ + in_dev->mr_qrv = ih3->qrv ?: net->ipv4.sysctl_igmp_qrv; + in_dev->mr_qi = IGMPV3_QQIC(ih3->qqic)*HZ ?: IGMP_QUERY_INTERVAL; + + /* RFC3376, 8.3. Query Response Interval: + * The number of seconds represented by the [Query Response + * Interval] must be less than the [Query Interval]. 
+ */ + if (in_dev->mr_qri >= in_dev->mr_qi) + in_dev->mr_qri = (in_dev->mr_qi/HZ - 1)*HZ; + if (!group) { /* general query */ if (ih3->nsrcs) return true; /* no sources allowed */ @@ -1723,18 +1735,30 @@ void ip_mc_down(struct in_device *in_dev) ip_mc_dec_group(in_dev, IGMP_ALL_HOSTS); } -void ip_mc_init_dev(struct in_device *in_dev) -{ #ifdef CONFIG_IP_MULTICAST +static void ip_mc_reset(struct in_device *in_dev) +{ struct net *net = dev_net(in_dev->dev); + + in_dev->mr_qi = IGMP_QUERY_INTERVAL; + in_dev->mr_qri = IGMP_QUERY_RESPONSE_INTERVAL; + in_dev->mr_qrv = net->ipv4.sysctl_igmp_qrv; +} +#else +static void ip_mc_reset(struct in_device *in_dev) +{ +} #endif + +void ip_mc_init_dev(struct in_device *in_dev) +{ ASSERT_RTNL(); #ifdef CONFIG_IP_MULTICAST timer_setup(&in_dev->mr_gq_timer, igmp_gq_timer_expire, 0); timer_setup(&in_dev->mr_ifc_timer, igmp_ifc_timer_expire, 0); - in_dev->mr_qrv = net->ipv4.sysctl_igmp_qrv; #endif + ip_mc_reset(in_dev); spin_lock_init(&in_dev->mc_tomb_lock); } @@ -1744,15 +1768,10 @@ void ip_mc_init_dev(struct in_device *in_dev) void ip_mc_up(struct in_device *in_dev) { struct ip_mc_list *pmc; -#ifdef CONFIG_IP_MULTICAST - struct net *net = dev_net(in_dev->dev); -#endif ASSERT_RTNL(); -#ifdef CONFIG_IP_MULTICAST - in_dev->mr_qrv = net->ipv4.sysctl_igmp_qrv; -#endif + ip_mc_reset(in_dev); ip_mc_inc_group(in_dev, IGMP_ALL_HOSTS); for_each_pmc_rtnl(in_dev, pmc) { -- cgit v1.2.3-71-gd317 From bb58fd7eeffc9bd5d6e2a16cbf0e9e259f8d09f2 Mon Sep 17 00:00:00 2001 From: Mitch Williams Date: Fri, 19 Oct 2018 14:11:03 -0700 Subject: i40e: Update status codes Add a few new status code which will be used by the ice driver, and rename a few to make them more consistent. Error code are mapped to similar values as in i40e_status.h, so as to be compatible with older VF drivers not using this status enum. 
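To illustrate the backward-compatibility claim, a hypothetical fragment of an older VF driver (vf_map_status() is an invented name, not from this patch) keeps compiling and behaving identically, because the legacy identifier is now a #define for the renamed enumerator:

static int vf_map_status(enum virtchnl_status_code status)
{
	switch (status) {
	case VIRTCHNL_STATUS_SUCCESS:
		return 0;
	/* Legacy name; expands to VIRTCHNL_STATUS_ERR_PARAM (-5). */
	case VIRTCHNL_ERR_PARAM:
		return -EPERM;
	default:
		return -EINVAL;
	}
}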
Signed-off-by: Mitch Williams Tested-by: Andrew Bowers Signed-off-by: Jeff Kirsher --- drivers/net/ethernet/intel/i40e/i40e_virtchnl_pf.c | 2 +- include/linux/avf/virtchnl.h | 12 +++++++++--- 2 files changed, 10 insertions(+), 4 deletions(-) (limited to 'include/linux') diff --git a/drivers/net/ethernet/intel/i40e/i40e_virtchnl_pf.c b/drivers/net/ethernet/intel/i40e/i40e_virtchnl_pf.c index 81b0e1f8d14b..ac5698ed0b11 100644 --- a/drivers/net/ethernet/intel/i40e/i40e_virtchnl_pf.c +++ b/drivers/net/ethernet/intel/i40e/i40e_virtchnl_pf.c @@ -3674,7 +3674,7 @@ int i40e_vc_process_vf_msg(struct i40e_pf *pf, s16 vf_id, u32 v_opcode, dev_err(&pf->pdev->dev, "Invalid message from VF %d, opcode %d, len %d\n", local_vf_id, v_opcode, msglen); switch (ret) { - case VIRTCHNL_ERR_PARAM: + case VIRTCHNL_STATUS_ERR_PARAM: return -EPERM; default: return -EINVAL; diff --git a/include/linux/avf/virtchnl.h b/include/linux/avf/virtchnl.h index 2c9756bd9c4c..b2488055fd1d 100644 --- a/include/linux/avf/virtchnl.h +++ b/include/linux/avf/virtchnl.h @@ -62,13 +62,19 @@ /* Error Codes */ enum virtchnl_status_code { VIRTCHNL_STATUS_SUCCESS = 0, - VIRTCHNL_ERR_PARAM = -5, + VIRTCHNL_STATUS_ERR_PARAM = -5, + VIRTCHNL_STATUS_ERR_NO_MEMORY = -18, VIRTCHNL_STATUS_ERR_OPCODE_MISMATCH = -38, VIRTCHNL_STATUS_ERR_CQP_COMPL_ERROR = -39, VIRTCHNL_STATUS_ERR_INVALID_VF_ID = -40, - VIRTCHNL_STATUS_NOT_SUPPORTED = -64, + VIRTCHNL_STATUS_ERR_ADMIN_QUEUE_ERROR = -53, + VIRTCHNL_STATUS_ERR_NOT_SUPPORTED = -64, }; +/* Backward compatibility */ +#define VIRTCHNL_ERR_PARAM VIRTCHNL_STATUS_ERR_PARAM +#define VIRTCHNL_STATUS_NOT_SUPPORTED VIRTCHNL_STATUS_ERR_NOT_SUPPORTED + #define VIRTCHNL_LINK_SPEED_100MB_SHIFT 0x1 #define VIRTCHNL_LINK_SPEED_1000MB_SHIFT 0x2 #define VIRTCHNL_LINK_SPEED_10GB_SHIFT 0x3 @@ -831,7 +837,7 @@ virtchnl_vc_validate_vf_msg(struct virtchnl_version_info *ver, u32 v_opcode, case VIRTCHNL_OP_EVENT: case VIRTCHNL_OP_UNKNOWN: default: - return VIRTCHNL_ERR_PARAM; + return VIRTCHNL_STATUS_ERR_PARAM; } /* few more checks */ if (err_msg_format || valid_len != msglen) -- cgit v1.2.3-71-gd317 From a324e9396ca3d00e1101476ba067b412e0aba232 Mon Sep 17 00:00:00 2001 From: Borislav Petkov Date: Wed, 31 Oct 2018 19:20:20 +0100 Subject: EDAC, skx: Fix randconfig builds The driver depends on the ADXL component glue and selects it. However, ADXL itself implicitly depends on ACPI and in nonsensical randconfig builds like this: # CONFIG_ACPI is not set CONFIG_ACPI_ADXL=y where ACPI is not enabled, the build fails with: drivers/edac/skx_edac.o: In function `skx_mce_check_error': skx_edac.c:(.text+0xab): undefined reference to `adxl_decode' drivers/edac/skx_edac.o: In function `skx_init': skx_edac.c:(.init.text+0x8bf): undefined reference to `adxl_get_component_names' make: *** [vmlinux] Error 1 Add stubs for that case so that the build succeeds. CONFIG_ACPI=n doesn't make any sense for real configurations but this fix will at least silence randconfig builds. Signed-off-by: Borislav Petkov Acked-by: Tony Luck Cc: "Rafael J. 
Wysocki" --- drivers/edac/Kconfig | 2 +- include/linux/adxl.h | 5 +++++ 2 files changed, 6 insertions(+), 1 deletion(-) (limited to 'include/linux') diff --git a/drivers/edac/Kconfig b/drivers/edac/Kconfig index ffd349c12479..68e479b4d9c9 100644 --- a/drivers/edac/Kconfig +++ b/drivers/edac/Kconfig @@ -234,7 +234,7 @@ config EDAC_SKX depends on PCI && X86_64 && X86_MCE_INTEL && PCI_MMCONFIG depends on ACPI_NFIT || !ACPI_NFIT # if ACPI_NFIT=m, EDAC_SKX can't be y select DMI - select ACPI_ADXL + select ACPI_ADXL if ACPI help Support for error detection and correction the Intel Skylake server Integrated Memory Controllers. If your diff --git a/include/linux/adxl.h b/include/linux/adxl.h index 2a629acb4c3f..2d29f55923e3 100644 --- a/include/linux/adxl.h +++ b/include/linux/adxl.h @@ -7,7 +7,12 @@ #ifndef _LINUX_ADXL_H #define _LINUX_ADXL_H +#ifdef CONFIG_ACPI_ADXL const char * const *adxl_get_component_names(void); int adxl_decode(u64 addr, u64 component_values[]); +#else +static inline const char * const *adxl_get_component_names(void) { return NULL; } +static inline int adxl_decode(u64 addr, u64 component_values[]) { return -EOPNOTSUPP; } +#endif #endif /* _LINUX_ADXL_H */ -- cgit v1.2.3-71-gd317 From 439cd39ea136d2c026805264d58a91f36b6b64ca Mon Sep 17 00:00:00 2001 From: Stefano Brivio Date: Sat, 14 Jul 2018 21:59:43 +0200 Subject: netfilter: ipset: list:set: Decrease refcount synchronously on deletion and replace Commit 45040978c899 ("netfilter: ipset: Fix set:list type crash when flush/dump set in parallel") postponed decreasing set reference counters to the RCU callback. An 'ipset del' command can terminate before the RCU grace period is elapsed, and if sets are listed before then, the reference counter shown in userspace will be wrong: # ipset create h hash:ip; ipset create l list:set; ipset add l # ipset del l h; ipset list h Name: h Type: hash:ip Revision: 4 Header: family inet hashsize 1024 maxelem 65536 Size in memory: 88 References: 1 Number of entries: 0 Members: # sleep 1; ipset list h Name: h Type: hash:ip Revision: 4 Header: family inet hashsize 1024 maxelem 65536 Size in memory: 88 References: 0 Number of entries: 0 Members: Fix this by making the reference count update synchronous again. As a result, when sets are listed, ip_set_name_byindex() might now fetch a set whose reference count is already zero. Instead of relying on the reference count to protect against concurrent set renaming, grab ip_set_ref_lock as reader and copy the name, while holding the same lock in ip_set_rename() as writer instead. 
Reported-by: Li Shuang Fixes: 45040978c899 ("netfilter: ipset: Fix set:list type crash when flush/dump set in parallel") Signed-off-by: Stefano Brivio Signed-off-by: Jozsef Kadlecsik Signed-off-by: Pablo Neira Ayuso --- include/linux/netfilter/ipset/ip_set.h | 2 +- net/netfilter/ipset/ip_set_core.c | 23 +++++++++++------------ net/netfilter/ipset/ip_set_list_set.c | 17 +++++++++++------ 3 files changed, 23 insertions(+), 19 deletions(-) (limited to 'include/linux') diff --git a/include/linux/netfilter/ipset/ip_set.h b/include/linux/netfilter/ipset/ip_set.h index 34fc80f3eb90..1d100efe74ec 100644 --- a/include/linux/netfilter/ipset/ip_set.h +++ b/include/linux/netfilter/ipset/ip_set.h @@ -314,7 +314,7 @@ enum { extern ip_set_id_t ip_set_get_byname(struct net *net, const char *name, struct ip_set **set); extern void ip_set_put_byindex(struct net *net, ip_set_id_t index); -extern const char *ip_set_name_byindex(struct net *net, ip_set_id_t index); +extern void ip_set_name_byindex(struct net *net, ip_set_id_t index, char *name); extern ip_set_id_t ip_set_nfnl_get_byindex(struct net *net, ip_set_id_t index); extern void ip_set_nfnl_put(struct net *net, ip_set_id_t index); diff --git a/net/netfilter/ipset/ip_set_core.c b/net/netfilter/ipset/ip_set_core.c index bc4bd247bb7d..fa15a831aeee 100644 --- a/net/netfilter/ipset/ip_set_core.c +++ b/net/netfilter/ipset/ip_set_core.c @@ -693,21 +693,20 @@ ip_set_put_byindex(struct net *net, ip_set_id_t index) EXPORT_SYMBOL_GPL(ip_set_put_byindex); /* Get the name of a set behind a set index. - * We assume the set is referenced, so it does exist and - * can't be destroyed. The set cannot be renamed due to - * the referencing either. - * + * Set itself is protected by RCU, but its name isn't: to protect against + * renaming, grab ip_set_ref_lock as reader (see ip_set_rename()) and copy the + * name. 
*/ -const char * -ip_set_name_byindex(struct net *net, ip_set_id_t index) +void +ip_set_name_byindex(struct net *net, ip_set_id_t index, char *name) { - const struct ip_set *set = ip_set_rcu_get(net, index); + struct ip_set *set = ip_set_rcu_get(net, index); BUG_ON(!set); - BUG_ON(set->ref == 0); - /* Referenced, so it's safe */ - return set->name; + read_lock_bh(&ip_set_ref_lock); + strncpy(name, set->name, IPSET_MAXNAMELEN); + read_unlock_bh(&ip_set_ref_lock); } EXPORT_SYMBOL_GPL(ip_set_name_byindex); @@ -1153,7 +1152,7 @@ static int ip_set_rename(struct net *net, struct sock *ctnl, if (!set) return -ENOENT; - read_lock_bh(&ip_set_ref_lock); + write_lock_bh(&ip_set_ref_lock); if (set->ref != 0) { ret = -IPSET_ERR_REFERENCED; goto out; @@ -1170,7 +1169,7 @@ static int ip_set_rename(struct net *net, struct sock *ctnl, strncpy(set->name, name2, IPSET_MAXNAMELEN); out: - read_unlock_bh(&ip_set_ref_lock); + write_unlock_bh(&ip_set_ref_lock); return ret; } diff --git a/net/netfilter/ipset/ip_set_list_set.c b/net/netfilter/ipset/ip_set_list_set.c index 072a658fde04..4eef55da0878 100644 --- a/net/netfilter/ipset/ip_set_list_set.c +++ b/net/netfilter/ipset/ip_set_list_set.c @@ -148,9 +148,7 @@ __list_set_del_rcu(struct rcu_head * rcu) { struct set_elem *e = container_of(rcu, struct set_elem, rcu); struct ip_set *set = e->set; - struct list_set *map = set->data; - ip_set_put_byindex(map->net, e->id); ip_set_ext_destroy(set, e); kfree(e); } @@ -158,15 +156,21 @@ __list_set_del_rcu(struct rcu_head * rcu) static inline void list_set_del(struct ip_set *set, struct set_elem *e) { + struct list_set *map = set->data; + set->elements--; list_del_rcu(&e->list); + ip_set_put_byindex(map->net, e->id); call_rcu(&e->rcu, __list_set_del_rcu); } static inline void -list_set_replace(struct set_elem *e, struct set_elem *old) +list_set_replace(struct ip_set *set, struct set_elem *e, struct set_elem *old) { + struct list_set *map = set->data; + list_replace_rcu(&old->list, &e->list); + ip_set_put_byindex(map->net, old->id); call_rcu(&old->rcu, __list_set_del_rcu); } @@ -298,7 +302,7 @@ list_set_uadd(struct ip_set *set, void *value, const struct ip_set_ext *ext, INIT_LIST_HEAD(&e->list); list_set_init_extensions(set, ext, e); if (n) - list_set_replace(e, n); + list_set_replace(set, e, n); else if (next) list_add_tail_rcu(&e->list, &next->list); else if (prev) @@ -486,6 +490,7 @@ list_set_list(const struct ip_set *set, const struct list_set *map = set->data; struct nlattr *atd, *nested; u32 i = 0, first = cb->args[IPSET_CB_ARG0]; + char name[IPSET_MAXNAMELEN]; struct set_elem *e; int ret = 0; @@ -504,8 +509,8 @@ list_set_list(const struct ip_set *set, nested = ipset_nest_start(skb, IPSET_ATTR_DATA); if (!nested) goto nla_put_failure; - if (nla_put_string(skb, IPSET_ATTR_NAME, - ip_set_name_byindex(map->net, e->id))) + ip_set_name_byindex(map->net, e->id, name); + if (nla_put_string(skb, IPSET_ATTR_NAME, name)) goto nla_put_failure; if (ip_set_put_extensions(skb, set, e, true)) goto nla_put_failure; -- cgit v1.2.3-71-gd317 From 0962590e553331db2cc0aef2dc35c57f6300dbbe Mon Sep 17 00:00:00 2001 From: Daniel Borkmann Date: Thu, 1 Nov 2018 00:05:52 +0100 Subject: bpf: fix partial copy of map_ptr when dst is scalar ALU operations on pointers such as scalar_reg += map_value_ptr are handled in adjust_ptr_min_max_vals(). 
From 0962590e553331db2cc0aef2dc35c57f6300dbbe Mon Sep 17 00:00:00 2001 From: Daniel Borkmann Date: Thu, 1 Nov 2018 00:05:52 +0100 Subject: bpf: fix partial copy of map_ptr when dst is scalar ALU operations on pointers such as scalar_reg += map_value_ptr are handled in adjust_ptr_min_max_vals(). The problem, however, is that map_ptr and range in the register state share a union, so transferring state through dst_reg->range = ptr_reg->range is buggy: any new map_ptr in the dst_reg is then truncated (or null) for subsequent checks. Fix this by adding a raw member and using it to copy state over to dst_reg. Fixes: f1174f77b50c ("bpf/verifier: rework value tracking") Signed-off-by: Daniel Borkmann Cc: Edward Cree Acked-by: Alexei Starovoitov Signed-off-by: Alexei Starovoitov --- include/linux/bpf_verifier.h | 3 +++ kernel/bpf/verifier.c | 10 ++++++---- 2 files changed, 9 insertions(+), 4 deletions(-) (limited to 'include/linux') diff --git a/include/linux/bpf_verifier.h b/include/linux/bpf_verifier.h index 9e8056ec20fa..d93e89761a8b 100644 --- a/include/linux/bpf_verifier.h +++ b/include/linux/bpf_verifier.h @@ -51,6 +51,9 @@ struct bpf_reg_state { * PTR_TO_MAP_VALUE_OR_NULL */ struct bpf_map *map_ptr; + + /* Max size from any of the above. */ + unsigned long raw; }; /* Fixed part of pointer offset, pointer types only */ s32 off; diff --git a/kernel/bpf/verifier.c b/kernel/bpf/verifier.c index 171a2c88e77d..774fa40a32ae 100644 --- a/kernel/bpf/verifier.c +++ b/kernel/bpf/verifier.c @@ -3046,7 +3046,7 @@ static int adjust_ptr_min_max_vals(struct bpf_verifier_env *env, dst_reg->umax_value = umax_ptr; dst_reg->var_off = ptr_reg->var_off; dst_reg->off = ptr_reg->off + smin_val; - dst_reg->range = ptr_reg->range; + dst_reg->raw = ptr_reg->raw; break; } /* A new variable offset is created. Note that off_reg->off @@ -3076,10 +3076,11 @@ static int adjust_ptr_min_max_vals(struct bpf_verifier_env *env, } dst_reg->var_off = tnum_add(ptr_reg->var_off, off_reg->var_off); dst_reg->off = ptr_reg->off; + dst_reg->raw = ptr_reg->raw; if (reg_is_pkt_pointer(ptr_reg)) { dst_reg->id = ++env->id_gen; /* something was added to pkt_ptr, set range to zero */ - dst_reg->range = 0; + dst_reg->raw = 0; } break; case BPF_SUB: @@ -3108,7 +3109,7 @@ static int adjust_ptr_min_max_vals(struct bpf_verifier_env *env, dst_reg->var_off = ptr_reg->var_off; dst_reg->id = ptr_reg->id; dst_reg->off = ptr_reg->off - smin_val; - dst_reg->range = ptr_reg->range; + dst_reg->raw = ptr_reg->raw; break; } /* A new variable offset is created. If the subtrahend is known @@ -3134,11 +3135,12 @@ static int adjust_ptr_min_max_vals(struct bpf_verifier_env *env, } dst_reg->var_off = tnum_sub(ptr_reg->var_off, off_reg->var_off); dst_reg->off = ptr_reg->off; + dst_reg->raw = ptr_reg->raw; if (reg_is_pkt_pointer(ptr_reg)) { dst_reg->id = ++env->id_gen; /* something was added to pkt_ptr, set range to zero */ if (smin_val < 0) - dst_reg->range = 0; + dst_reg->raw = 0; } break; case BPF_AND: -- cgit v1.2.3-71-gd317
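To see why the partial copy was dangerous, consider a toy stand-in for struct bpf_reg_state. This is an editor's sketch, not verifier code, and it assumes a little-endian LP64 target: writing the 32-bit range member clobbers only part of the union, so a pointer read back through map_ptr comes out truncated, while copying through the widest member (raw) preserves whichever field was live.

#include <stdint.h>
#include <stdio.h>

struct reg_state {
	union {
		uint32_t range;		/* live for packet pointers */
		void *map_ptr;		/* live for map-value pointers */
		unsigned long raw;	/* as wide as any member on LP64 */
	};
};

int main(void)
{
	int dummy;
	struct reg_state src = { .map_ptr = &dummy };
	struct reg_state dst = { .raw = 0 };

	dst.range = src.range;		/* buggy: copies 32 of 64 bits */
	printf("partial copy: %p\n", dst.map_ptr);	/* truncated pointer */

	dst.raw = src.raw;		/* fix: copy the whole union */
	printf("full copy:    %p\n", dst.map_ptr);	/* matches &dummy */
	return 0;
}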
From a846446b1914d1e3d996d657754f43fde89bab51 Mon Sep 17 00:00:00 2001 From: Dmitry Safonov Date: Fri, 12 Oct 2018 14:42:52 +0100 Subject: x86/compat: Adjust in_compat_syscall() to generic code under !COMPAT The result of in_compat_syscall() can be pictured as:

x86 platform:
---------------------------------------------------
| Arch\syscall | 64-bit | ia32 | x32  |
|-------------------------------------------------|
| x86_64       | false  | true | true |
|-------------------------------------------------|
| i686         |        | true |      |
---------------------------------------------------

Other platforms:
-------------------------------------------
| Arch\syscall | 64-bit | compat |
|-----------------------------------------|
| 64-bit       | false  | true   |
|-----------------------------------------|
| 32-bit(?)    |        | false  |
-------------------------------------------

As seen, the result of in_compat_syscall() on a generic 32-bit platform differs from i686. There is no reason for in_compat_syscall() == true on native i686. It is also easy to misread code if the result on a native 32-bit platform differs between arches. Because of that, non-arch-specific code has many places with: if (IS_ENABLED(CONFIG_COMPAT) && in_compat_syscall()) in different variations. It looks like the only non-x86 code which uses in_compat_syscall() not under a CONFIG_COMPAT guard is in amd/amdkfd. But according to commit a18069c132cb ("amdkfd: Disable support for 32-bit user processes"), it actually should be disabled on native i686. Rename in_compat_syscall() to in_32bit_syscall() for x86-specific code and make in_compat_syscall() false under !CONFIG_COMPAT. A follow-on patch will clean up generic users which were forced to check IS_ENABLED(CONFIG_COMPAT) with in_compat_syscall(). Signed-off-by: Dmitry Safonov Signed-off-by: Thomas Gleixner Reviewed-by: Andy Lutomirski Cc: Dmitry Safonov <0x7f454c46@gmail.com> Cc: Ard Biesheuvel Cc: "David S. Miller" Cc: Herbert Xu Cc: "H. Peter Anvin" Cc: John Stultz Cc: "Kirill A. Shutemov" Cc: Oleg Nesterov Cc: Steffen Klassert Cc: Stephen Boyd Cc: Steven Rostedt Cc: linux-efi@vger.kernel.org Cc: netdev@vger.kernel.org Link: https://lkml.kernel.org/r/20181012134253.23266-2-dima@arista.com --- arch/x86/include/asm/compat.h | 9 ++++++++- arch/x86/include/asm/ftrace.h | 4 +--- arch/x86/kernel/process_64.c | 4 ++-- arch/x86/kernel/sys_x86_64.c | 11 ++++++----- arch/x86/mm/hugetlbpage.c | 4 ++-- arch/x86/mm/mmap.c | 2 +- include/linux/compat.h | 4 ++-- 7 files changed, 22 insertions(+), 16 deletions(-) (limited to 'include/linux') diff --git a/arch/x86/include/asm/compat.h b/arch/x86/include/asm/compat.h index fab4df16a3c4..22c4dfe65992 100644 --- a/arch/x86/include/asm/compat.h +++ b/arch/x86/include/asm/compat.h @@ -217,11 +217,18 @@ static inline bool in_x32_syscall(void) return false; } -static inline bool in_compat_syscall(void) +static inline bool in_32bit_syscall(void) { return in_ia32_syscall() || in_x32_syscall(); } + +#ifdef CONFIG_COMPAT +static inline bool in_compat_syscall(void) +{ + return in_32bit_syscall(); +} #define in_compat_syscall in_compat_syscall /* override the generic impl */ +#endif struct compat_siginfo; int __copy_siginfo_to_user32(struct compat_siginfo __user *to, diff --git a/arch/x86/include/asm/ftrace.h b/arch/x86/include/asm/ftrace.h index c18ed65287d5..cf350639e76d 100644 --- a/arch/x86/include/asm/ftrace.h +++ b/arch/x86/include/asm/ftrace.h @@ -76,9 +76,7 @@ static inline bool arch_syscall_match_sym_name(const char *sym, const char *name #define ARCH_TRACE_IGNORE_COMPAT_SYSCALLS 1 static inline bool arch_trace_is_compat_syscall(struct pt_regs *regs) { - if (in_compat_syscall()) - return true; - return false; + return in_32bit_syscall(); } #endif /* CONFIG_FTRACE_SYSCALLS && CONFIG_IA32_EMULATION */ #endif /* !COMPILE_OFFSETS */ diff --git a/arch/x86/kernel/process_64.c b/arch/x86/kernel/process_64.c index 31b4755369f0..0e0b4288a4b2 100644 --- a/arch/x86/kernel/process_64.c +++ b/arch/x86/kernel/process_64.c @@ -701,10 +701,10 @@ static void __set_personality_x32(void) current->mm->context.ia32_compat = TIF_X32; current->personality &= ~READ_IMPLIES_EXEC; /* - * in_compat_syscall() uses the presence of the x32 syscall bit + * in_32bit_syscall() uses the presence of the x32 syscall bit * flag to determine compat status.
The x86 mmap() code relies on * the syscall bitness so set x32 syscall bit right here to make - * in_compat_syscall() work during exec(). + * in_32bit_syscall() work during exec(). * * Pretend to come from a x32 execve. */ diff --git a/arch/x86/kernel/sys_x86_64.c b/arch/x86/kernel/sys_x86_64.c index 6a78d4b36a79..f7476ce23b6e 100644 --- a/arch/x86/kernel/sys_x86_64.c +++ b/arch/x86/kernel/sys_x86_64.c @@ -105,7 +105,7 @@ out: static void find_start_end(unsigned long addr, unsigned long flags, unsigned long *begin, unsigned long *end) { - if (!in_compat_syscall() && (flags & MAP_32BIT)) { + if (!in_32bit_syscall() && (flags & MAP_32BIT)) { /* This is usually used needed to map code in small model, so it needs to be in the first 31bit. Limit it to that. This means we need to move the @@ -122,7 +122,7 @@ static void find_start_end(unsigned long addr, unsigned long flags, } *begin = get_mmap_base(1); - if (in_compat_syscall()) + if (in_32bit_syscall()) *end = task_size_32bit(); else *end = task_size_64bit(addr > DEFAULT_MAP_WINDOW); @@ -193,7 +193,7 @@ arch_get_unmapped_area_topdown(struct file *filp, const unsigned long addr0, return addr; /* for MAP_32BIT mappings we force the legacy mmap base */ - if (!in_compat_syscall() && (flags & MAP_32BIT)) + if (!in_32bit_syscall() && (flags & MAP_32BIT)) goto bottomup; /* requesting a specific address */ @@ -217,9 +217,10 @@ get_unmapped_area: * If hint address is above DEFAULT_MAP_WINDOW, look for unmapped area * in the full address space. * - * !in_compat_syscall() check to avoid high addresses for x32. + * !in_32bit_syscall() check to avoid high addresses for x32 + * (and make it no op on native i386). */ - if (addr > DEFAULT_MAP_WINDOW && !in_compat_syscall()) + if (addr > DEFAULT_MAP_WINDOW && !in_32bit_syscall()) info.high_limit += TASK_SIZE_MAX - DEFAULT_MAP_WINDOW; info.align_mask = 0; diff --git a/arch/x86/mm/hugetlbpage.c b/arch/x86/mm/hugetlbpage.c index 00b296617ca4..92e4c4b85bba 100644 --- a/arch/x86/mm/hugetlbpage.c +++ b/arch/x86/mm/hugetlbpage.c @@ -92,7 +92,7 @@ static unsigned long hugetlb_get_unmapped_area_bottomup(struct file *file, * If hint address is above DEFAULT_MAP_WINDOW, look for unmapped area * in the full address space. */ - info.high_limit = in_compat_syscall() ? + info.high_limit = in_32bit_syscall() ? task_size_32bit() : task_size_64bit(addr > DEFAULT_MAP_WINDOW); info.align_mask = PAGE_MASK & ~huge_page_mask(h); @@ -116,7 +116,7 @@ static unsigned long hugetlb_get_unmapped_area_topdown(struct file *file, * If hint address is above DEFAULT_MAP_WINDOW, look for unmapped area * in the full address space. */ - if (addr > DEFAULT_MAP_WINDOW && !in_compat_syscall()) + if (addr > DEFAULT_MAP_WINDOW && !in_32bit_syscall()) info.high_limit += TASK_SIZE_MAX - DEFAULT_MAP_WINDOW; info.align_mask = PAGE_MASK & ~huge_page_mask(h); diff --git a/arch/x86/mm/mmap.c b/arch/x86/mm/mmap.c index 1e95d57760cf..db3165714521 100644 --- a/arch/x86/mm/mmap.c +++ b/arch/x86/mm/mmap.c @@ -166,7 +166,7 @@ unsigned long get_mmap_base(int is_legacy) struct mm_struct *mm = current->mm; #ifdef CONFIG_HAVE_ARCH_COMPAT_MMAP_BASES - if (in_compat_syscall()) { + if (in_32bit_syscall()) { return is_legacy ? 
mm->mmap_compat_legacy_base : mm->mmap_compat_base; } diff --git a/include/linux/compat.h b/include/linux/compat.h index d30e4dbd4be2..f1ef24c68fcc 100644 --- a/include/linux/compat.h +++ b/include/linux/compat.h @@ -1029,9 +1029,9 @@ int kcompat_sys_fstatfs64(unsigned int fd, compat_size_t sz, #else /* !CONFIG_COMPAT */ #define is_compat_task() (0) -#ifndef in_compat_syscall +/* Ensure no one redefines in_compat_syscall() under !CONFIG_COMPAT */ +#define in_compat_syscall in_compat_syscall static inline bool in_compat_syscall(void) { return false; } -#endif #endif /* CONFIG_COMPAT */ -- cgit v1.2.3-71-gd317 From c3be6577d82a9f0163eb1e2c37a477414d12a209 Mon Sep 17 00:00:00 2001 From: Paul Burton Date: Thu, 1 Nov 2018 17:51:34 +0000 Subject: SUNRPC: Use atomic(64)_t for seq_send(64) The seq_send & seq_send64 fields in struct krb5_ctx are used as atomically incrementing counters. This is implemented using cmpxchg() & cmpxchg64() to implement what amount to custom versions of atomic_fetch_inc() & atomic64_fetch_inc(). Besides the duplication, using cmpxchg64() has another major drawback in that some 32 bit architectures don't provide it. As such commit 571ed1fd2390 ("SUNRPC: Replace krb5_seq_lock with a lockless scheme") resulted in build failures for some architectures. Change seq_send to be an atomic_t and seq_send64 to be an atomic64_t, then use atomic(64)_* functions to manipulate the values. The atomic64_t type & associated functions are provided even on architectures which lack real 64 bit atomic memory access via CONFIG_GENERIC_ATOMIC64 which uses spinlocks to serialize access. This fixes the build failures for architectures lacking cmpxchg64(). A potential alternative that was raised would be to provide cmpxchg64() on the 32 bit architectures that currently lack it, using spinlocks. However this would provide a version of cmpxchg64() with semantics a little different to the implementations on architectures with real 64 bit atomics - the spinlock-based implementation would only work if all access to the memory used with cmpxchg64() is *always* performed using cmpxchg64(). That is not currently a requirement for users of cmpxchg64(), and making it one seems questionable. As such avoiding cmpxchg64() outside of architecture-specific code seems best, particularly in cases where atomic64_t seems like a better fit anyway. The CONFIG_GENERIC_ATOMIC64 implementation of atomic64_* functions will use spinlocks & so faces the same issue, but with the key difference that the memory backing an atomic64_t ought to always be accessed via the atomic64_* functions anyway making the issue moot. Signed-off-by: Paul Burton Fixes: 571ed1fd2390 ("SUNRPC: Replace krb5_seq_lock with a lockless scheme") Cc: Trond Myklebust Cc: Anna Schumaker Cc: J. Bruce Fields Cc: Jeff Layton Cc: David S. 
Miller Cc: linux-nfs@vger.kernel.org Cc: netdev@vger.kernel.org Signed-off-by: Trond Myklebust --- include/linux/sunrpc/gss_krb5.h | 7 ++----- net/sunrpc/auth_gss/gss_krb5_mech.c | 16 ++++++++++------ net/sunrpc/auth_gss/gss_krb5_seal.c | 28 ++-------------------------- net/sunrpc/auth_gss/gss_krb5_wrap.c | 4 ++-- 4 files changed, 16 insertions(+), 39 deletions(-) (limited to 'include/linux') diff --git a/include/linux/sunrpc/gss_krb5.h b/include/linux/sunrpc/gss_krb5.h index 69f749afa617..4162de72e95c 100644 --- a/include/linux/sunrpc/gss_krb5.h +++ b/include/linux/sunrpc/gss_krb5.h @@ -107,8 +107,8 @@ struct krb5_ctx { u8 Ksess[GSS_KRB5_MAX_KEYLEN]; /* session key */ u8 cksum[GSS_KRB5_MAX_KEYLEN]; s32 endtime; - u32 seq_send; - u64 seq_send64; + atomic_t seq_send; + atomic64_t seq_send64; struct xdr_netobj mech_used; u8 initiator_sign[GSS_KRB5_MAX_KEYLEN]; u8 acceptor_sign[GSS_KRB5_MAX_KEYLEN]; @@ -118,9 +118,6 @@ struct krb5_ctx { u8 acceptor_integ[GSS_KRB5_MAX_KEYLEN]; }; -extern u32 gss_seq_send_fetch_and_inc(struct krb5_ctx *ctx); -extern u64 gss_seq_send64_fetch_and_inc(struct krb5_ctx *ctx); - /* The length of the Kerberos GSS token header */ #define GSS_KRB5_TOK_HDR_LEN (16) diff --git a/net/sunrpc/auth_gss/gss_krb5_mech.c b/net/sunrpc/auth_gss/gss_krb5_mech.c index 7bb2514aadd9..71cb29dc86c2 100644 --- a/net/sunrpc/auth_gss/gss_krb5_mech.c +++ b/net/sunrpc/auth_gss/gss_krb5_mech.c @@ -275,6 +275,7 @@ out_err: static int gss_import_v1_context(const void *p, const void *end, struct krb5_ctx *ctx) { + u32 seq_send; int tmp; p = simple_get_bytes(p, end, &ctx->initiate, sizeof(ctx->initiate)); @@ -316,9 +317,10 @@ gss_import_v1_context(const void *p, const void *end, struct krb5_ctx *ctx) p = simple_get_bytes(p, end, &ctx->endtime, sizeof(ctx->endtime)); if (IS_ERR(p)) goto out_err; - p = simple_get_bytes(p, end, &ctx->seq_send, sizeof(ctx->seq_send)); + p = simple_get_bytes(p, end, &seq_send, sizeof(seq_send)); if (IS_ERR(p)) goto out_err; + atomic_set(&ctx->seq_send, seq_send); p = simple_get_netobj(p, end, &ctx->mech_used); if (IS_ERR(p)) goto out_err; @@ -610,6 +612,7 @@ static int gss_import_v2_context(const void *p, const void *end, struct krb5_ctx *ctx, gfp_t gfp_mask) { + u64 seq_send64; int keylen; p = simple_get_bytes(p, end, &ctx->flags, sizeof(ctx->flags)); @@ -620,14 +623,15 @@ gss_import_v2_context(const void *p, const void *end, struct krb5_ctx *ctx, p = simple_get_bytes(p, end, &ctx->endtime, sizeof(ctx->endtime)); if (IS_ERR(p)) goto out_err; - p = simple_get_bytes(p, end, &ctx->seq_send64, sizeof(ctx->seq_send64)); + p = simple_get_bytes(p, end, &seq_send64, sizeof(seq_send64)); if (IS_ERR(p)) goto out_err; + atomic64_set(&ctx->seq_send64, seq_send64); /* set seq_send for use by "older" enctypes */ - ctx->seq_send = ctx->seq_send64; - if (ctx->seq_send64 != ctx->seq_send) { - dprintk("%s: seq_send64 %lx, seq_send %x overflow?\n", __func__, - (unsigned long)ctx->seq_send64, ctx->seq_send); + atomic_set(&ctx->seq_send, seq_send64); + if (seq_send64 != atomic_read(&ctx->seq_send)) { + dprintk("%s: seq_send64 %llx, seq_send %x overflow?\n", __func__, + seq_send64, atomic_read(&ctx->seq_send)); p = ERR_PTR(-EINVAL); goto out_err; } diff --git a/net/sunrpc/auth_gss/gss_krb5_seal.c b/net/sunrpc/auth_gss/gss_krb5_seal.c index b4adeb06660b..48fe4a591b54 100644 --- a/net/sunrpc/auth_gss/gss_krb5_seal.c +++ b/net/sunrpc/auth_gss/gss_krb5_seal.c @@ -123,30 +123,6 @@ setup_token_v2(struct krb5_ctx *ctx, struct xdr_netobj *token) return krb5_hdr; } -u32 
-gss_seq_send_fetch_and_inc(struct krb5_ctx *ctx) -{ - u32 old, seq_send = READ_ONCE(ctx->seq_send); - - do { - old = seq_send; - seq_send = cmpxchg(&ctx->seq_send, old, old + 1); - } while (old != seq_send); - return seq_send; -} - -u64 -gss_seq_send64_fetch_and_inc(struct krb5_ctx *ctx) -{ - u64 old, seq_send = READ_ONCE(ctx->seq_send); - - do { - old = seq_send; - seq_send = cmpxchg64(&ctx->seq_send64, old, old + 1); - } while (old != seq_send); - return seq_send; -} - static u32 gss_get_mic_v1(struct krb5_ctx *ctx, struct xdr_buf *text, struct xdr_netobj *token) @@ -177,7 +153,7 @@ gss_get_mic_v1(struct krb5_ctx *ctx, struct xdr_buf *text, memcpy(ptr + GSS_KRB5_TOK_HDR_LEN, md5cksum.data, md5cksum.len); - seq_send = gss_seq_send_fetch_and_inc(ctx); + seq_send = atomic_fetch_inc(&ctx->seq_send); if (krb5_make_seq_num(ctx, ctx->seq, ctx->initiate ? 0 : 0xff, seq_send, ptr + GSS_KRB5_TOK_HDR_LEN, ptr + 8)) @@ -205,7 +181,7 @@ gss_get_mic_v2(struct krb5_ctx *ctx, struct xdr_buf *text, /* Set up the sequence number. Now 64-bits in clear * text and w/o direction indicator */ - seq_send_be64 = cpu_to_be64(gss_seq_send64_fetch_and_inc(ctx)); + seq_send_be64 = cpu_to_be64(atomic64_fetch_inc(&ctx->seq_send64)); memcpy(krb5_hdr + 8, (char *) &seq_send_be64, 8); if (ctx->initiate) { diff --git a/net/sunrpc/auth_gss/gss_krb5_wrap.c b/net/sunrpc/auth_gss/gss_krb5_wrap.c index 41cb294cd071..6af6f119d9c1 100644 --- a/net/sunrpc/auth_gss/gss_krb5_wrap.c +++ b/net/sunrpc/auth_gss/gss_krb5_wrap.c @@ -228,7 +228,7 @@ gss_wrap_kerberos_v1(struct krb5_ctx *kctx, int offset, memcpy(ptr + GSS_KRB5_TOK_HDR_LEN, md5cksum.data, md5cksum.len); - seq_send = gss_seq_send_fetch_and_inc(kctx); + seq_send = atomic_fetch_inc(&kctx->seq_send); /* XXX would probably be more efficient to compute checksum * and encrypt at the same time: */ @@ -475,7 +475,7 @@ gss_wrap_kerberos_v2(struct krb5_ctx *kctx, u32 offset, *be16ptr++ = 0; be64ptr = (__be64 *)be16ptr; - *be64ptr = cpu_to_be64(gss_seq_send64_fetch_and_inc(kctx)); + *be64ptr = cpu_to_be64(atomic64_fetch_inc(&kctx->seq_send64)); err = (*kctx->gk5e->encrypt_v2)(kctx, offset, buf, pages); if (err) -- cgit v1.2.3-71-gd317
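The conversion above maps directly onto portable C11 atomics. A self-contained sketch of the before/after (illustrative, not the kernel API): both functions return the pre-increment value, but the fetch-add form does not require the platform to provide a native 64-bit compare-exchange, which is exactly the portability problem the commit describes.

#include <stdatomic.h>
#include <stdint.h>

static _Atomic uint64_t seq_send64;

/* before: open-coded compare-exchange loop, needs 64-bit cmpxchg */
uint64_t seq_fetch_inc_cmpxchg(void)
{
	uint64_t old = atomic_load(&seq_send64);

	/* on failure, 'old' is reloaded with the current value */
	while (!atomic_compare_exchange_weak(&seq_send64, &old, old + 1))
		;
	return old;
}

/* after: one fetch-and-add; may be lock-based on small targets */
uint64_t seq_fetch_inc(void)
{
	return atomic_fetch_add(&seq_send64, 1);
}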
From b5f2954d30c77649bce9c27e7a0a94299d9cfdf8 Mon Sep 17 00:00:00 2001 From: Dennis Zhou Date: Thu, 1 Nov 2018 17:24:10 -0400 Subject: blkcg: revert blkcg cleanups series This reverts a series committed earlier due to a null pointer exception bug report in [1]. It seems there are edge case interactions that I did not consider and will need some time to understand what causes the adverse interactions. The original series can be found in [2] with a follow-up series in [3]. [1] https://www.spinics.net/lists/cgroups/msg20719.html [2] https://lore.kernel.org/lkml/20180911184137.35897-1-dennisszhou@gmail.com/ [3] https://lore.kernel.org/lkml/20181020185612.51587-1-dennis@kernel.org/ This reverts the following commits: d459d853c2ed, b2c3fa546705, 101246ec02b5, b3b9f24f5fcc, e2b0989954ae, f0fcb3ec89f3, c839e7a03f92, bdc2491708c4, 74b7c02a9bc1, 5bf9a1f3b4ef, a7b39b4e961c, 07b05bcc3213, 49f4c2dc2b50, 27e6fa996c53 Signed-off-by: Dennis Zhou Signed-off-by: Jens Axboe --- Documentation/admin-guide/cgroup-v2.rst | 8 +- block/bfq-cgroup.c | 4 +- block/bfq-iosched.c | 2 +- block/bio.c | 174 +++++++++----------------------- block/blk-cgroup.c | 123 +++++++--------------- block/blk-core.c | 1 - block/blk-iolatency.c | 26 ++++- block/blk-throttle.c | 13 ++- block/bounce.c | 4 +- block/cfq-iosched.c | 4 +- drivers/block/loop.c | 5 +- drivers/md/raid0.c | 2 +- fs/buffer.c | 10 +- fs/ext4/page-io.c | 2 +- include/linux/bio.h | 26 ++--- include/linux/blk-cgroup.h | 145 +++++++++----------------- include/linux/blk_types.h | 1 + include/linux/cgroup.h | 2 - include/linux/writeback.h | 5 +- kernel/cgroup/cgroup.c | 48 ++------- kernel/trace/blktrace.c | 4 +- mm/page_io.c | 2 +- 22 files changed, 208 insertions(+), 403 deletions(-) (limited to 'include/linux') diff --git a/Documentation/admin-guide/cgroup-v2.rst b/Documentation/admin-guide/cgroup-v2.rst index caf36105a1c7..184193bcb262 100644 --- a/Documentation/admin-guide/cgroup-v2.rst +++ b/Documentation/admin-guide/cgroup-v2.rst @@ -1857,10 +1857,8 @@ following two functions. wbc_init_bio(@wbc, @bio) Should be called for each bio carrying writeback data and - associates the bio with the inode's owner cgroup and the - corresponding request queue. This must be called after - a queue (device) has been associated with the bio and - before submission. + associates the bio with the inode's owner cgroup. Can be + called anytime between bio allocation and submission. wbc_account_io(@wbc, @page, @bytes) Should be called for each data segment being written out. @@ -1879,7 +1877,7 @@ the configuration, the bio may be executed at a lower priority and if the writeback session is holding shared resources, e.g. a journal entry, may lead to priority inversion. There is no one easy solution for the problem. Filesystems can try to work around specific problem -cases by skipping wbc_init_bio() or using bio_associate_create_blkg() +cases by skipping wbc_init_bio() or using bio_associate_blkcg() directly. diff --git a/block/bfq-cgroup.c b/block/bfq-cgroup.c index d9a7916ff0ab..9fe5952d117d 100644 --- a/block/bfq-cgroup.c +++ b/block/bfq-cgroup.c @@ -642,7 +642,7 @@ void bfq_bic_update_cgroup(struct bfq_io_cq *bic, struct bio *bio) uint64_t serial_nr; rcu_read_lock(); - serial_nr = __bio_blkcg(bio)->css.serial_nr; + serial_nr = bio_blkcg(bio)->css.serial_nr; /* * Check whether blkcg has changed. The condition may trigger @@ -651,7 +651,7 @@ void bfq_bic_update_cgroup(struct bfq_io_cq *bic, struct bio *bio) if (unlikely(!bfqd) || likely(bic->blkcg_serial_nr == serial_nr)) goto out; - bfqg = __bfq_bic_change_cgroup(bfqd, bic, __bio_blkcg(bio)); + bfqg = __bfq_bic_change_cgroup(bfqd, bic, bio_blkcg(bio)); /* * Update blkg_path for bfq_log_* functions.
We cache this * path, and update it here, for the following diff --git a/block/bfq-iosched.c b/block/bfq-iosched.c index 6075100f03a5..3a27d31fcda6 100644 --- a/block/bfq-iosched.c +++ b/block/bfq-iosched.c @@ -4384,7 +4384,7 @@ static struct bfq_queue *bfq_get_queue(struct bfq_data *bfqd, rcu_read_lock(); - bfqg = bfq_find_set_group(bfqd, __bio_blkcg(bio)); + bfqg = bfq_find_set_group(bfqd, bio_blkcg(bio)); if (!bfqg) { bfqq = &bfqd->oom_bfqq; goto out; diff --git a/block/bio.c b/block/bio.c index bbfeb4ee2892..4a5a036268fb 100644 --- a/block/bio.c +++ b/block/bio.c @@ -609,9 +609,7 @@ void __bio_clone_fast(struct bio *bio, struct bio *bio_src) bio->bi_iter = bio_src->bi_iter; bio->bi_io_vec = bio_src->bi_io_vec; - bio_clone_blkg_association(bio, bio_src); - - blkcg_bio_issue_init(bio); + bio_clone_blkcg_association(bio, bio_src); } EXPORT_SYMBOL(__bio_clone_fast); @@ -1956,151 +1954,69 @@ EXPORT_SYMBOL(bioset_init_from_src); #ifdef CONFIG_BLK_CGROUP -/** - * bio_associate_blkg - associate a bio with the a blkg - * @bio: target bio - * @blkg: the blkg to associate - * - * This tries to associate @bio with the specified blkg. Association failure - * is handled by walking up the blkg tree. Therefore, the blkg associated can - * be anything between @blkg and the root_blkg. This situation only happens - * when a cgroup is dying and then the remaining bios will spill to the closest - * alive blkg. - * - * A reference will be taken on the @blkg and will be released when @bio is - * freed. - */ -int bio_associate_blkg(struct bio *bio, struct blkcg_gq *blkg) -{ - if (unlikely(bio->bi_blkg)) - return -EBUSY; - bio->bi_blkg = blkg_tryget_closest(blkg); - return 0; -} - -/** - * __bio_associate_blkg_from_css - internal blkg association function - * - * This in the core association function that all association paths rely on. - * A blkg reference is taken which is released upon freeing of the bio. - */ -static int __bio_associate_blkg_from_css(struct bio *bio, - struct cgroup_subsys_state *css) -{ - struct request_queue *q = bio->bi_disk->queue; - struct blkcg_gq *blkg; - int ret; - - rcu_read_lock(); - - if (!css || !css->parent) - blkg = q->root_blkg; - else - blkg = blkg_lookup_create(css_to_blkcg(css), q); - - ret = bio_associate_blkg(bio, blkg); - - rcu_read_unlock(); - return ret; -} - -/** - * bio_associate_blkg_from_css - associate a bio with a specified css - * @bio: target bio - * @css: target css - * - * Associate @bio with the blkg found by combining the css's blkg and the - * request_queue of the @bio. This falls back to the queue's root_blkg if - * the association fails with the css. - */ -int bio_associate_blkg_from_css(struct bio *bio, - struct cgroup_subsys_state *css) -{ - if (unlikely(bio->bi_blkg)) - return -EBUSY; - return __bio_associate_blkg_from_css(bio, css); -} -EXPORT_SYMBOL_GPL(bio_associate_blkg_from_css); - #ifdef CONFIG_MEMCG /** - * bio_associate_blkg_from_page - associate a bio with the page's blkg + * bio_associate_blkcg_from_page - associate a bio with the page's blkcg * @bio: target bio * @page: the page to lookup the blkcg from * - * Associate @bio with the blkg from @page's owning memcg and the respective - * request_queue. If cgroup_e_css returns NULL, fall back to the queue's - * root_blkg. - * - * Note: this must be called after bio has an associated device. + * Associate @bio with the blkcg from @page's owning memcg. This works like + * every other associate function wrt references. 
*/ -int bio_associate_blkg_from_page(struct bio *bio, struct page *page) +int bio_associate_blkcg_from_page(struct bio *bio, struct page *page) { - struct cgroup_subsys_state *css; - int ret; + struct cgroup_subsys_state *blkcg_css; - if (unlikely(bio->bi_blkg)) + if (unlikely(bio->bi_css)) return -EBUSY; if (!page->mem_cgroup) return 0; - - rcu_read_lock(); - - css = cgroup_e_css(page->mem_cgroup->css.cgroup, &io_cgrp_subsys); - - ret = __bio_associate_blkg_from_css(bio, css); - - rcu_read_unlock(); - return ret; + blkcg_css = cgroup_get_e_css(page->mem_cgroup->css.cgroup, + &io_cgrp_subsys); + bio->bi_css = blkcg_css; + return 0; } #endif /* CONFIG_MEMCG */ /** - * bio_associate_create_blkg - associate a bio with a blkg from q - * @q: request_queue where bio is going + * bio_associate_blkcg - associate a bio with the specified blkcg * @bio: target bio + * @blkcg_css: css of the blkcg to associate + * + * Associate @bio with the blkcg specified by @blkcg_css. Block layer will + * treat @bio as if it were issued by a task which belongs to the blkcg. * - * Associate @bio with the blkg found from the bio's css and the request_queue. - * If one is not found, bio_lookup_blkg creates the blkg. This falls back to - * the queue's root_blkg if association fails. + * This function takes an extra reference of @blkcg_css which will be put + * when @bio is released. The caller must own @bio and is responsible for + * synchronizing calls to this function. */ -int bio_associate_create_blkg(struct request_queue *q, struct bio *bio) +int bio_associate_blkcg(struct bio *bio, struct cgroup_subsys_state *blkcg_css) { - struct cgroup_subsys_state *css; - int ret = 0; - - /* someone has already associated this bio with a blkg */ - if (bio->bi_blkg) - return ret; - - rcu_read_lock(); - - css = blkcg_css(); - - ret = __bio_associate_blkg_from_css(bio, css); - - rcu_read_unlock(); - return ret; + if (unlikely(bio->bi_css)) + return -EBUSY; + css_get(blkcg_css); + bio->bi_css = blkcg_css; + return 0; } +EXPORT_SYMBOL_GPL(bio_associate_blkcg); /** - * bio_reassociate_blkg - reassociate a bio with a blkg from q - * @q: request_queue where bio is going + * bio_associate_blkg - associate a bio with the specified blkg * @bio: target bio + * @blkg: the blkg to associate * - * When submitting a bio, multiple recursive calls to make_request() may occur. - * This causes the initial associate done in blkcg_bio_issue_check() to be - * incorrect and reference the prior request_queue. This performs reassociation - * when this situation happens. + * Associate @bio with the blkg specified by @blkg. This is the queue specific + * blkcg information associated with the @bio, a reference will be taken on the + * @blkg and will be freed when the bio is freed. 
*/ -int bio_reassociate_blkg(struct request_queue *q, struct bio *bio) +int bio_associate_blkg(struct bio *bio, struct blkcg_gq *blkg) { - if (bio->bi_blkg) { - blkg_put(bio->bi_blkg); - bio->bi_blkg = NULL; - } - - return bio_associate_create_blkg(q, bio); + if (unlikely(bio->bi_blkg)) + return -EBUSY; + if (!blkg_try_get(blkg)) + return -ENODEV; + bio->bi_blkg = blkg; + return 0; } /** @@ -2113,6 +2029,10 @@ void bio_disassociate_task(struct bio *bio) put_io_context(bio->bi_ioc); bio->bi_ioc = NULL; } + if (bio->bi_css) { + css_put(bio->bi_css); + bio->bi_css = NULL; + } if (bio->bi_blkg) { blkg_put(bio->bi_blkg); bio->bi_blkg = NULL; @@ -2120,16 +2040,16 @@ void bio_disassociate_task(struct bio *bio) } /** - * bio_clone_blkg_association - clone blkg association from src to dst bio + * bio_clone_blkcg_association - clone blkcg association from src to dst bio * @dst: destination bio * @src: source bio */ -void bio_clone_blkg_association(struct bio *dst, struct bio *src) +void bio_clone_blkcg_association(struct bio *dst, struct bio *src) { - if (src->bi_blkg) - bio_associate_blkg(dst, src->bi_blkg); + if (src->bi_css) + WARN_ON(bio_associate_blkcg(dst, src->bi_css)); } -EXPORT_SYMBOL_GPL(bio_clone_blkg_association); +EXPORT_SYMBOL_GPL(bio_clone_blkcg_association); #endif /* CONFIG_BLK_CGROUP */ static void __init biovec_init_slabs(void) diff --git a/block/blk-cgroup.c b/block/blk-cgroup.c index 992da5592c6e..c630e02836a8 100644 --- a/block/blk-cgroup.c +++ b/block/blk-cgroup.c @@ -84,37 +84,6 @@ static void blkg_free(struct blkcg_gq *blkg) kfree(blkg); } -static void __blkg_release(struct rcu_head *rcu) -{ - struct blkcg_gq *blkg = container_of(rcu, struct blkcg_gq, rcu_head); - - percpu_ref_exit(&blkg->refcnt); - - /* release the blkcg and parent blkg refs this blkg has been holding */ - css_put(&blkg->blkcg->css); - if (blkg->parent) - blkg_put(blkg->parent); - - wb_congested_put(blkg->wb_congested); - - blkg_free(blkg); -} - -/* - * A group is RCU protected, but having an rcu lock does not mean that one - * can access all the fields of blkg and assume these are valid. For - * example, don't try to follow throtl_data and request queue links. - * - * Having a reference to blkg under an rcu allows accesses to only values - * local to groups like group stats and group rate limits. 
- */ -static void blkg_release(struct percpu_ref *ref) -{ - struct blkcg_gq *blkg = container_of(ref, struct blkcg_gq, refcnt); - - call_rcu(&blkg->rcu_head, __blkg_release); -} - /** * blkg_alloc - allocate a blkg * @blkcg: block cgroup the new blkg is associated with @@ -141,6 +110,7 @@ static struct blkcg_gq *blkg_alloc(struct blkcg *blkcg, struct request_queue *q, blkg->q = q; INIT_LIST_HEAD(&blkg->q_node); blkg->blkcg = blkcg; + atomic_set(&blkg->refcnt, 1); /* root blkg uses @q->root_rl, init rl only for !root blkgs */ if (blkcg != &blkcg_root) { @@ -247,11 +217,6 @@ static struct blkcg_gq *blkg_create(struct blkcg *blkcg, blkg_get(blkg->parent); } - ret = percpu_ref_init(&blkg->refcnt, blkg_release, 0, - GFP_NOWAIT | __GFP_NOWARN); - if (ret) - goto err_cancel_ref; - /* invoke per-policy init */ for (i = 0; i < BLKCG_MAX_POLS; i++) { struct blkcg_policy *pol = blkcg_policy[i]; @@ -284,8 +249,6 @@ static struct blkcg_gq *blkg_create(struct blkcg *blkcg, blkg_put(blkg); return ERR_PTR(ret); -err_cancel_ref: - percpu_ref_exit(&blkg->refcnt); err_put_congested: wb_congested_put(wb_congested); err_put_css: @@ -296,7 +259,7 @@ err_free_blkg: } /** - * __blkg_lookup_create - lookup blkg, try to create one if not there + * blkg_lookup_create - lookup blkg, try to create one if not there * @blkcg: blkcg of interest * @q: request_queue of interest * @@ -305,11 +268,12 @@ err_free_blkg: * that all non-root blkg's have access to the parent blkg. This function * should be called under RCU read lock and @q->queue_lock. * - * Returns the blkg or the closest blkg if blkg_create fails as it walks - * down from root. + * Returns pointer to the looked up or created blkg on success, ERR_PTR() + * value on error. If @q is dead, returns ERR_PTR(-EINVAL). If @q is not + * dead and bypassing, returns ERR_PTR(-EBUSY). */ -struct blkcg_gq *__blkg_lookup_create(struct blkcg *blkcg, - struct request_queue *q) +struct blkcg_gq *blkg_lookup_create(struct blkcg *blkcg, + struct request_queue *q) { struct blkcg_gq *blkg; @@ -321,7 +285,7 @@ struct blkcg_gq *__blkg_lookup_create(struct blkcg *blkcg, * we shouldn't allow anything to go through for a bypassing queue. */ if (unlikely(blk_queue_bypass(q))) - return q->root_blkg; + return ERR_PTR(blk_queue_dying(q) ? -ENODEV : -EBUSY); blkg = __blkg_lookup(blkcg, q, true); if (blkg) @@ -329,58 +293,23 @@ struct blkcg_gq *__blkg_lookup_create(struct blkcg *blkcg, /* * Create blkgs walking down from blkcg_root to @blkcg, so that all - * non-root blkgs have access to their parents. Returns the closest - * blkg to the intended blkg should blkg_create() fail. + * non-root blkgs have access to their parents. */ while (true) { struct blkcg *pos = blkcg; struct blkcg *parent = blkcg_parent(blkcg); - struct blkcg_gq *ret_blkg = q->root_blkg; - - while (parent) { - blkg = __blkg_lookup(parent, q, false); - if (blkg) { - /* remember closest blkg */ - ret_blkg = blkg; - break; - } + + while (parent && !__blkg_lookup(parent, q, false)) { pos = parent; parent = blkcg_parent(parent); } blkg = blkg_create(pos, q, NULL); - if (IS_ERR(blkg)) - return ret_blkg; - if (pos == blkcg) + if (pos == blkcg || IS_ERR(blkg)) return blkg; } } -/** - * blkg_lookup_create - find or create a blkg - * @blkcg: target block cgroup - * @q: target request_queue - * - * This looks up or creates the blkg representing the unique pair - * of the blkcg and the request_queue. 
- */ -struct blkcg_gq *blkg_lookup_create(struct blkcg *blkcg, - struct request_queue *q) -{ - struct blkcg_gq *blkg = blkg_lookup(blkcg, q); - unsigned long flags; - - if (unlikely(!blkg)) { - spin_lock_irqsave(q->queue_lock, flags); - - blkg = __blkg_lookup_create(blkcg, q); - - spin_unlock_irqrestore(q->queue_lock, flags); - } - - return blkg; -} - static void blkg_destroy(struct blkcg_gq *blkg) { struct blkcg *blkcg = blkg->blkcg; @@ -424,7 +353,7 @@ static void blkg_destroy(struct blkcg_gq *blkg) * Put the reference taken at the time of creation so that when all * queues are gone, group can be destroyed. */ - percpu_ref_kill(&blkg->refcnt); + blkg_put(blkg); } /** @@ -451,6 +380,29 @@ static void blkg_destroy_all(struct request_queue *q) q->root_rl.blkg = NULL; } +/* + * A group is RCU protected, but having an rcu lock does not mean that one + * can access all the fields of blkg and assume these are valid. For + * example, don't try to follow throtl_data and request queue links. + * + * Having a reference to blkg under an rcu allows accesses to only values + * local to groups like group stats and group rate limits. + */ +void __blkg_release_rcu(struct rcu_head *rcu_head) +{ + struct blkcg_gq *blkg = container_of(rcu_head, struct blkcg_gq, rcu_head); + + /* release the blkcg and parent blkg refs this blkg has been holding */ + css_put(&blkg->blkcg->css); + if (blkg->parent) + blkg_put(blkg->parent); + + wb_congested_put(blkg->wb_congested); + + blkg_free(blkg); +} +EXPORT_SYMBOL_GPL(__blkg_release_rcu); + /* * The next function used by blk_queue_for_each_rl(). It's a bit tricky * because the root blkg uses @q->root_rl instead of its own rl. @@ -1796,7 +1748,8 @@ void blkcg_maybe_throttle_current(void) blkg = blkg_lookup(blkcg, q); if (!blkg) goto out; - if (!blkg_tryget(blkg)) + blkg = blkg_try_get(blkg); + if (!blkg) goto out; rcu_read_unlock(); diff --git a/block/blk-core.c b/block/blk-core.c index 26a5dac80ed9..ce12515f9b9b 100644 --- a/block/blk-core.c +++ b/block/blk-core.c @@ -2435,7 +2435,6 @@ blk_qc_t generic_make_request(struct bio *bio) if (q) blk_queue_exit(q); q = bio->bi_disk->queue; - bio_reassociate_blkg(q, bio); flags = 0; if (bio->bi_opf & REQ_NOWAIT) flags = BLK_MQ_REQ_NOWAIT; diff --git a/block/blk-iolatency.c b/block/blk-iolatency.c index 35c48d7b8f78..bb240a0c1309 100644 --- a/block/blk-iolatency.c +++ b/block/blk-iolatency.c @@ -480,12 +480,34 @@ static void blkcg_iolatency_throttle(struct rq_qos *rqos, struct bio *bio, spinlock_t *lock) { struct blk_iolatency *blkiolat = BLKIOLATENCY(rqos); - struct blkcg_gq *blkg = bio->bi_blkg; + struct blkcg *blkcg; + struct blkcg_gq *blkg; + struct request_queue *q = rqos->q; bool issue_as_root = bio_issue_as_root_blkg(bio); if (!blk_iolatency_enabled(blkiolat)) return; + rcu_read_lock(); + blkcg = bio_blkcg(bio); + bio_associate_blkcg(bio, &blkcg->css); + blkg = blkg_lookup(blkcg, q); + if (unlikely(!blkg)) { + if (!lock) + spin_lock_irq(q->queue_lock); + blkg = blkg_lookup_create(blkcg, q); + if (IS_ERR(blkg)) + blkg = NULL; + if (!lock) + spin_unlock_irq(q->queue_lock); + } + if (!blkg) + goto out; + + bio_issue_init(&bio->bi_issue, bio_sectors(bio)); + bio_associate_blkg(bio, blkg); +out: + rcu_read_unlock(); while (blkg && blkg->parent) { struct iolatency_grp *iolat = blkg_to_lat(blkg); if (!iolat) { @@ -706,7 +728,7 @@ static void blkiolatency_timer_fn(struct timer_list *t) * We could be exiting, don't access the pd unless we have a * ref on the blkg. 
*/ - if (!blkg_tryget(blkg)) + if (!blkg_try_get(blkg)) continue; iolat = blkg_to_lat(blkg); diff --git a/block/blk-throttle.c b/block/blk-throttle.c index 4bda70e8db48..db1a3a2ae006 100644 --- a/block/blk-throttle.c +++ b/block/blk-throttle.c @@ -2115,11 +2115,21 @@ static inline void throtl_update_latency_buckets(struct throtl_data *td) } #endif +static void blk_throtl_assoc_bio(struct throtl_grp *tg, struct bio *bio) +{ +#ifdef CONFIG_BLK_DEV_THROTTLING_LOW + /* fallback to root_blkg if we fail to get a blkg ref */ + if (bio->bi_css && (bio_associate_blkg(bio, tg_to_blkg(tg)) == -ENODEV)) + bio_associate_blkg(bio, bio->bi_disk->queue->root_blkg); + bio_issue_init(&bio->bi_issue, bio_sectors(bio)); +#endif +} + bool blk_throtl_bio(struct request_queue *q, struct blkcg_gq *blkg, struct bio *bio) { struct throtl_qnode *qn = NULL; - struct throtl_grp *tg = blkg_to_tg(blkg); + struct throtl_grp *tg = blkg_to_tg(blkg ?: q->root_blkg); struct throtl_service_queue *sq; bool rw = bio_data_dir(bio); bool throttled = false; @@ -2138,6 +2148,7 @@ bool blk_throtl_bio(struct request_queue *q, struct blkcg_gq *blkg, if (unlikely(blk_queue_bypass(q))) goto out_unlock; + blk_throtl_assoc_bio(tg, bio); blk_throtl_update_idletime(tg); sq = &tg->service_queue; diff --git a/block/bounce.c b/block/bounce.c index ec0d99995f5f..418677dcec60 100644 --- a/block/bounce.c +++ b/block/bounce.c @@ -276,9 +276,7 @@ static struct bio *bounce_clone_bio(struct bio *bio_src, gfp_t gfp_mask, } } - bio_clone_blkg_association(bio, bio_src); - - blkcg_bio_issue_init(bio); + bio_clone_blkcg_association(bio, bio_src); return bio; } diff --git a/block/cfq-iosched.c b/block/cfq-iosched.c index 6a3d87dd3c1a..ed41aa978c4a 100644 --- a/block/cfq-iosched.c +++ b/block/cfq-iosched.c @@ -3759,7 +3759,7 @@ static void check_blkcg_changed(struct cfq_io_cq *cic, struct bio *bio) uint64_t serial_nr; rcu_read_lock(); - serial_nr = __bio_blkcg(bio)->css.serial_nr; + serial_nr = bio_blkcg(bio)->css.serial_nr; rcu_read_unlock(); /* @@ -3824,7 +3824,7 @@ cfq_get_queue(struct cfq_data *cfqd, bool is_sync, struct cfq_io_cq *cic, struct cfq_group *cfqg; rcu_read_lock(); - cfqg = cfq_lookup_cfqg(cfqd, __bio_blkcg(bio)); + cfqg = cfq_lookup_cfqg(cfqd, bio_blkcg(bio)); if (!cfqg) { cfqq = &cfqd->oom_cfqq; goto out; diff --git a/drivers/block/loop.c b/drivers/block/loop.c index abad6d15f956..ea9debf59b22 100644 --- a/drivers/block/loop.c +++ b/drivers/block/loop.c @@ -77,7 +77,6 @@ #include #include #include -#include #include "loop.h" @@ -1761,8 +1760,8 @@ static blk_status_t loop_queue_rq(struct blk_mq_hw_ctx *hctx, /* always use the first bio's css */ #ifdef CONFIG_BLK_CGROUP - if (cmd->use_aio && rq->bio && rq->bio->bi_blkg) { - cmd->css = &bio_blkcg(rq->bio)->css; + if (cmd->use_aio && rq->bio && rq->bio->bi_css) { + cmd->css = rq->bio->bi_css; css_get(cmd->css); } else #endif diff --git a/drivers/md/raid0.c b/drivers/md/raid0.c index f3fb5bb8c82a..ac1cffd2a09b 100644 --- a/drivers/md/raid0.c +++ b/drivers/md/raid0.c @@ -542,7 +542,7 @@ static void raid0_handle_discard(struct mddev *mddev, struct bio *bio) !discard_bio) continue; bio_chain(discard_bio, bio); - bio_clone_blkg_association(discard_bio, bio); + bio_clone_blkcg_association(discard_bio, bio); if (mddev->gendisk) trace_block_bio_remap(bdev_get_queue(rdev->bdev), discard_bio, disk_devt(mddev->gendisk), diff --git a/fs/buffer.c b/fs/buffer.c index 109f55196866..6f1ae3ac9789 100644 --- a/fs/buffer.c +++ b/fs/buffer.c @@ -3060,6 +3060,11 @@ static int submit_bh_wbc(int op, int op_flags, 
struct buffer_head *bh, */ bio = bio_alloc(GFP_NOIO, 1); + if (wbc) { + wbc_init_bio(wbc, bio); + wbc_account_io(wbc, bh->b_page, bh->b_size); + } + bio->bi_iter.bi_sector = bh->b_blocknr * (bh->b_size >> 9); bio_set_dev(bio, bh->b_bdev); bio->bi_write_hint = write_hint; @@ -3079,11 +3084,6 @@ static int submit_bh_wbc(int op, int op_flags, struct buffer_head *bh, op_flags |= REQ_PRIO; bio_set_op_attrs(bio, op, op_flags); - if (wbc) { - wbc_init_bio(wbc, bio); - wbc_account_io(wbc, bh->b_page, bh->b_size); - } - submit_bio(bio); return 0; } diff --git a/fs/ext4/page-io.c b/fs/ext4/page-io.c index 2aa62d58d8dd..db7590178dfc 100644 --- a/fs/ext4/page-io.c +++ b/fs/ext4/page-io.c @@ -374,13 +374,13 @@ static int io_submit_init_bio(struct ext4_io_submit *io, bio = bio_alloc(GFP_NOIO, BIO_MAX_PAGES); if (!bio) return -ENOMEM; + wbc_init_bio(io->io_wbc, bio); bio->bi_iter.bi_sector = bh->b_blocknr * (bh->b_size >> 9); bio_set_dev(bio, bh->b_bdev); bio->bi_end_io = ext4_end_bio; bio->bi_private = ext4_get_io_end(io->io_end); io->io_bio = bio; io->io_next_block = bh->b_blocknr; - wbc_init_bio(io->io_wbc, bio); return 0; } diff --git a/include/linux/bio.h b/include/linux/bio.h index b47c7f716731..056fb627edb3 100644 --- a/include/linux/bio.h +++ b/include/linux/bio.h @@ -503,31 +503,23 @@ do { \ disk_devt((bio)->bi_disk) #if defined(CONFIG_MEMCG) && defined(CONFIG_BLK_CGROUP) -int bio_associate_blkg_from_page(struct bio *bio, struct page *page); +int bio_associate_blkcg_from_page(struct bio *bio, struct page *page); #else -static inline int bio_associate_blkg_from_page(struct bio *bio, - struct page *page) { return 0; } +static inline int bio_associate_blkcg_from_page(struct bio *bio, + struct page *page) { return 0; } #endif #ifdef CONFIG_BLK_CGROUP +int bio_associate_blkcg(struct bio *bio, struct cgroup_subsys_state *blkcg_css); int bio_associate_blkg(struct bio *bio, struct blkcg_gq *blkg); -int bio_associate_blkg_from_css(struct bio *bio, - struct cgroup_subsys_state *css); -int bio_associate_create_blkg(struct request_queue *q, struct bio *bio); -int bio_reassociate_blkg(struct request_queue *q, struct bio *bio); void bio_disassociate_task(struct bio *bio); -void bio_clone_blkg_association(struct bio *dst, struct bio *src); +void bio_clone_blkcg_association(struct bio *dst, struct bio *src); #else /* CONFIG_BLK_CGROUP */ -static inline int bio_associate_blkg_from_css(struct bio *bio, - struct cgroup_subsys_state *css) -{ return 0; } -static inline int bio_associate_create_blkg(struct request_queue *q, - struct bio *bio) { return 0; } -static inline int bio_reassociate_blkg(struct request_queue *q, struct bio *bio) -{ return 0; } +static inline int bio_associate_blkcg(struct bio *bio, + struct cgroup_subsys_state *blkcg_css) { return 0; } static inline void bio_disassociate_task(struct bio *bio) { } -static inline void bio_clone_blkg_association(struct bio *dst, - struct bio *src) { } +static inline void bio_clone_blkcg_association(struct bio *dst, + struct bio *src) { } #endif /* CONFIG_BLK_CGROUP */ #ifdef CONFIG_HIGHMEM diff --git a/include/linux/blk-cgroup.h b/include/linux/blk-cgroup.h index 1e76ceebeb5d..6d766a19f2bb 100644 --- a/include/linux/blk-cgroup.h +++ b/include/linux/blk-cgroup.h @@ -126,7 +126,7 @@ struct blkcg_gq { struct request_list rl; /* reference count */ - struct percpu_ref refcnt; + atomic_t refcnt; /* is this blkg online? 
protected by both blkcg and q locks */ bool online; @@ -184,8 +184,6 @@ extern struct cgroup_subsys_state * const blkcg_root_css; struct blkcg_gq *blkg_lookup_slowpath(struct blkcg *blkcg, struct request_queue *q, bool update_hint); -struct blkcg_gq *__blkg_lookup_create(struct blkcg *blkcg, - struct request_queue *q); struct blkcg_gq *blkg_lookup_create(struct blkcg *blkcg, struct request_queue *q); int blkcg_init_queue(struct request_queue *q); @@ -232,59 +230,22 @@ int blkg_conf_prep(struct blkcg *blkcg, const struct blkcg_policy *pol, char *input, struct blkg_conf_ctx *ctx); void blkg_conf_finish(struct blkg_conf_ctx *ctx); -/** - * blkcg_css - find the current css - * - * Find the css associated with either the kthread or the current task. - * This may return a dying css, so it is up to the caller to use tryget logic - * to confirm it is alive and well. - */ -static inline struct cgroup_subsys_state *blkcg_css(void) -{ - struct cgroup_subsys_state *css; - - css = kthread_blkcg(); - if (css) - return css; - return task_css(current, io_cgrp_id); -} static inline struct blkcg *css_to_blkcg(struct cgroup_subsys_state *css) { return css ? container_of(css, struct blkcg, css) : NULL; } -/** - * __bio_blkcg - internal version of bio_blkcg for bfq and cfq - * - * DO NOT USE. - * There is a flaw using this version of the function. In particular, this was - * used in a broken paradigm where association was called on the given css. It - * is possible though that the returned css from task_css() is in the process - * of dying due to migration of the current task. So it is improper to assume - * *_get() is going to succeed. Both BFQ and CFQ rely on this logic and will - * take additional work to handle more gracefully. - */ -static inline struct blkcg *__bio_blkcg(struct bio *bio) -{ - if (bio && bio->bi_blkg) - return bio->bi_blkg->blkcg; - return css_to_blkcg(blkcg_css()); -} - -/** - * bio_blkcg - grab the blkcg associated with a bio - * @bio: target bio - * - * This returns the blkcg associated with a bio, NULL if not associated. - * Callers are expected to either handle NULL or know association has been - * done prior to calling this. - */ static inline struct blkcg *bio_blkcg(struct bio *bio) { - if (bio && bio->bi_blkg) - return bio->bi_blkg->blkcg; - return NULL; + struct cgroup_subsys_state *css; + + if (bio && bio->bi_css) + return css_to_blkcg(bio->bi_css); + css = kthread_blkcg(); + if (css) + return css_to_blkcg(css); + return css_to_blkcg(task_css(current, io_cgrp_id)); } static inline bool blk_cgroup_congested(void) @@ -490,35 +451,26 @@ static inline int blkg_path(struct blkcg_gq *blkg, char *buf, int buflen) */ static inline void blkg_get(struct blkcg_gq *blkg) { - percpu_ref_get(&blkg->refcnt); + WARN_ON_ONCE(atomic_read(&blkg->refcnt) <= 0); + atomic_inc(&blkg->refcnt); } /** - * blkg_tryget - try and get a blkg reference + * blkg_try_get - try and get a blkg reference * @blkg: blkg to get * * This is for use when doing an RCU lookup of the blkg. We may be in the midst * of freeing this blkg, so we can only use it if the refcnt is not zero. 
*/ -static inline bool blkg_tryget(struct blkcg_gq *blkg) +static inline struct blkcg_gq *blkg_try_get(struct blkcg_gq *blkg) { - return percpu_ref_tryget(&blkg->refcnt); + if (atomic_inc_not_zero(&blkg->refcnt)) + return blkg; + return NULL; } -/** - * blkg_tryget_closest - try and get a blkg ref on the closet blkg - * @blkg: blkg to get - * - * This walks up the blkg tree to find the closest non-dying blkg and returns - * the blkg that it did association with as it may not be the passed in blkg. - */ -static inline struct blkcg_gq *blkg_tryget_closest(struct blkcg_gq *blkg) -{ - while (!percpu_ref_tryget(&blkg->refcnt)) - blkg = blkg->parent; - return blkg; -} +void __blkg_release_rcu(struct rcu_head *rcu); /** * blkg_put - put a blkg reference @@ -526,7 +478,9 @@ static inline struct blkcg_gq *blkg_tryget_closest(struct blkcg_gq *blkg) */ static inline void blkg_put(struct blkcg_gq *blkg) { - percpu_ref_put(&blkg->refcnt); + WARN_ON_ONCE(atomic_read(&blkg->refcnt) <= 0); + if (atomic_dec_and_test(&blkg->refcnt)) + call_rcu(&blkg->rcu_head, __blkg_release_rcu); } /** @@ -579,36 +533,25 @@ static inline struct request_list *blk_get_rl(struct request_queue *q, rcu_read_lock(); - if (bio && bio->bi_blkg) { - blkcg = bio->bi_blkg->blkcg; - if (blkcg == &blkcg_root) - goto rl_use_root; - - blkg_get(bio->bi_blkg); - rcu_read_unlock(); - return &bio->bi_blkg->rl; - } + blkcg = bio_blkcg(bio); - blkcg = css_to_blkcg(blkcg_css()); + /* bypass blkg lookup and use @q->root_rl directly for root */ if (blkcg == &blkcg_root) - goto rl_use_root; + goto root_rl; + /* + * Try to use blkg->rl. blkg lookup may fail under memory pressure + * or if either the blkcg or queue is going away. Fall back to + * root_rl in such cases. + */ blkg = blkg_lookup(blkcg, q); if (unlikely(!blkg)) - blkg = __blkg_lookup_create(blkcg, q); - - if (blkg->blkcg == &blkcg_root || !blkg_tryget(blkg)) - goto rl_use_root; + goto root_rl; + blkg_get(blkg); rcu_read_unlock(); return &blkg->rl; - - /* - * Each blkg has its own request_list, however, the root blkcg - * uses the request_queue's root_rl. This is to avoid most - * overhead for the root blkcg. 
- */ -rl_use_root: +root_rl: rcu_read_unlock(); return &q->root_rl; } @@ -854,26 +797,32 @@ static inline bool blk_throtl_bio(struct request_queue *q, struct blkcg_gq *blkg struct bio *bio) { return false; } #endif - -static inline void blkcg_bio_issue_init(struct bio *bio) -{ - bio_issue_init(&bio->bi_issue, bio_sectors(bio)); -} - static inline bool blkcg_bio_issue_check(struct request_queue *q, struct bio *bio) { + struct blkcg *blkcg; struct blkcg_gq *blkg; bool throtl = false; rcu_read_lock(); + blkcg = bio_blkcg(bio); + + /* associate blkcg if bio hasn't attached one */ + bio_associate_blkcg(bio, &blkcg->css); - bio_associate_create_blkg(q, bio); - blkg = bio->bi_blkg; + blkg = blkg_lookup(blkcg, q); + if (unlikely(!blkg)) { + spin_lock_irq(q->queue_lock); + blkg = blkg_lookup_create(blkcg, q); + if (IS_ERR(blkg)) + blkg = NULL; + spin_unlock_irq(q->queue_lock); + } throtl = blk_throtl_bio(q, blkg, bio); if (!throtl) { + blkg = blkg ?: q->root_blkg; /* * If the bio is flagged with BIO_QUEUE_ENTERED it means this * is a split bio and we would have already accounted for the @@ -885,8 +834,6 @@ static inline bool blkcg_bio_issue_check(struct request_queue *q, blkg_rwstat_add(&blkg->stat_ios, bio->bi_opf, 1); } - blkcg_bio_issue_init(bio); - rcu_read_unlock(); return !throtl; } @@ -983,7 +930,6 @@ static inline int blkcg_activate_policy(struct request_queue *q, static inline void blkcg_deactivate_policy(struct request_queue *q, const struct blkcg_policy *pol) { } -static inline struct blkcg *__bio_blkcg(struct bio *bio) { return NULL; } static inline struct blkcg *bio_blkcg(struct bio *bio) { return NULL; } static inline struct blkg_policy_data *blkg_to_pd(struct blkcg_gq *blkg, @@ -999,7 +945,6 @@ static inline void blk_put_rl(struct request_list *rl) { } static inline void blk_rq_set_rl(struct request *rq, struct request_list *rl) { } static inline struct request_list *blk_rq_rl(struct request *rq) { return &rq->q->root_rl; } -static inline void blkcg_bio_issue_init(struct bio *bio) { } static inline bool blkcg_bio_issue_check(struct request_queue *q, struct bio *bio) { return true; } diff --git a/include/linux/blk_types.h b/include/linux/blk_types.h index 093a818c5b68..1dcf652ba0aa 100644 --- a/include/linux/blk_types.h +++ b/include/linux/blk_types.h @@ -178,6 +178,7 @@ struct bio { * release. Read comment on top of bio_associate_current(). */ struct io_context *bi_ioc; + struct cgroup_subsys_state *bi_css; struct blkcg_gq *bi_blkg; struct bio_issue bi_issue; #endif diff --git a/include/linux/cgroup.h b/include/linux/cgroup.h index b8bcbdeb2eac..32c553556bbd 100644 --- a/include/linux/cgroup.h +++ b/include/linux/cgroup.h @@ -93,8 +93,6 @@ extern struct css_set init_css_set; bool css_has_online_children(struct cgroup_subsys_state *css); struct cgroup_subsys_state *css_from_id(int id, struct cgroup_subsys *ss); -struct cgroup_subsys_state *cgroup_e_css(struct cgroup *cgroup, - struct cgroup_subsys *ss); struct cgroup_subsys_state *cgroup_get_e_css(struct cgroup *cgroup, struct cgroup_subsys *ss); struct cgroup_subsys_state *css_tryget_online_from_dir(struct dentry *dentry, diff --git a/include/linux/writeback.h b/include/linux/writeback.h index 738a0c24874f..fdfd04e348f6 100644 --- a/include/linux/writeback.h +++ b/include/linux/writeback.h @@ -246,8 +246,7 @@ static inline void wbc_attach_fdatawrite_inode(struct writeback_control *wbc, * * @bio is a part of the writeback in progress controlled by @wbc. Perform * writeback specific initialization. 
This is used to apply the cgroup - * writeback context. Must be called after the bio has been associated with - * a device. + * writeback context. */ static inline void wbc_init_bio(struct writeback_control *wbc, struct bio *bio) { @@ -258,7 +257,7 @@ static inline void wbc_init_bio(struct writeback_control *wbc, struct bio *bio) * regular writeback instead of writing things out itself. */ if (wbc->wb) - bio_associate_blkg_from_css(bio, wbc->wb->blkcg_css); + bio_associate_blkcg(bio, wbc->wb->blkcg_css); } #else /* CONFIG_CGROUP_WRITEBACK */ diff --git a/kernel/cgroup/cgroup.c b/kernel/cgroup/cgroup.c index 4c1cf0969a80..4a3dae2a8283 100644 --- a/kernel/cgroup/cgroup.c +++ b/kernel/cgroup/cgroup.c @@ -492,7 +492,7 @@ static struct cgroup_subsys_state *cgroup_tryget_css(struct cgroup *cgrp, } /** - * cgroup_e_css_by_mask - obtain a cgroup's effective css for the specified ss + * cgroup_e_css - obtain a cgroup's effective css for the specified subsystem * @cgrp: the cgroup of interest * @ss: the subsystem of interest (%NULL returns @cgrp->self) * @@ -501,8 +501,8 @@ static struct cgroup_subsys_state *cgroup_tryget_css(struct cgroup *cgrp, * enabled. If @ss is associated with the hierarchy @cgrp is on, this * function is guaranteed to return non-NULL css. */ -static struct cgroup_subsys_state *cgroup_e_css_by_mask(struct cgroup *cgrp, - struct cgroup_subsys *ss) +static struct cgroup_subsys_state *cgroup_e_css(struct cgroup *cgrp, + struct cgroup_subsys *ss) { lockdep_assert_held(&cgroup_mutex); @@ -522,35 +522,6 @@ static struct cgroup_subsys_state *cgroup_e_css_by_mask(struct cgroup *cgrp, return cgroup_css(cgrp, ss); } -/** - * cgroup_e_css - obtain a cgroup's effective css for the specified subsystem - * @cgrp: the cgroup of interest - * @ss: the subsystem of interest - * - * Find and get the effective css of @cgrp for @ss. The effective css is - * defined as the matching css of the nearest ancestor including self which - * has @ss enabled. If @ss is not mounted on the hierarchy @cgrp is on, - * the root css is returned, so this function always returns a valid css. - * - * The returned css is not guaranteed to be online, and therefore it is the - * callers responsiblity to tryget a reference for it. - */ -struct cgroup_subsys_state *cgroup_e_css(struct cgroup *cgrp, - struct cgroup_subsys *ss) -{ - struct cgroup_subsys_state *css; - - do { - css = cgroup_css(cgrp, ss); - - if (css) - return css; - cgrp = cgroup_parent(cgrp); - } while (cgrp); - - return init_css_set.subsys[ss->id]; -} - /** * cgroup_get_e_css - get a cgroup's effective css for the specified subsystem * @cgrp: the cgroup of interest @@ -633,11 +604,10 @@ EXPORT_SYMBOL_GPL(of_css); * * Should be called under cgroup_[tree_]mutex. */ -#define for_each_e_css(css, ssid, cgrp) \ - for ((ssid) = 0; (ssid) < CGROUP_SUBSYS_COUNT; (ssid)++) \ - if (!((css) = cgroup_e_css_by_mask(cgrp, \ - cgroup_subsys[(ssid)]))) \ - ; \ +#define for_each_e_css(css, ssid, cgrp) \ + for ((ssid) = 0; (ssid) < CGROUP_SUBSYS_COUNT; (ssid)++) \ + if (!((css) = cgroup_e_css(cgrp, cgroup_subsys[(ssid)]))) \ + ; \ else /** @@ -1036,7 +1006,7 @@ static struct css_set *find_existing_css_set(struct css_set *old_cset, * @ss is in this hierarchy, so we want the * effective css from @cgrp. 
*/ - template[i] = cgroup_e_css_by_mask(cgrp, ss); + template[i] = cgroup_e_css(cgrp, ss); } else { /* * @ss is not in this hierarchy, so we don't want @@ -3053,7 +3023,7 @@ static int cgroup_apply_control(struct cgroup *cgrp) return ret; /* - * At this point, cgroup_e_css_by_mask() results reflect the new csses + * At this point, cgroup_e_css() results reflect the new csses * making the following cgroup_update_dfl_csses() properly update * css associations of all tasks in the subtree. */ diff --git a/kernel/trace/blktrace.c b/kernel/trace/blktrace.c index fac0ddf8a8e2..2868d85f1fb1 100644 --- a/kernel/trace/blktrace.c +++ b/kernel/trace/blktrace.c @@ -764,9 +764,9 @@ blk_trace_bio_get_cgid(struct request_queue *q, struct bio *bio) if (!bt || !(blk_tracer_flags.val & TRACE_BLK_OPT_CGROUP)) return NULL; - if (!bio->bi_blkg) + if (!bio->bi_css) return NULL; - return cgroup_get_kernfs_id(bio_blkcg(bio)->css.cgroup); + return cgroup_get_kernfs_id(bio->bi_css->cgroup); } #else static union kernfs_node_id * diff --git a/mm/page_io.c b/mm/page_io.c index 573d3663d846..aafd19ec1db4 100644 --- a/mm/page_io.c +++ b/mm/page_io.c @@ -339,7 +339,7 @@ int __swap_writepage(struct page *page, struct writeback_control *wbc, goto out; } bio->bi_opf = REQ_OP_WRITE | REQ_SWAP | wbc_to_write_flags(wbc); - bio_associate_blkg_from_page(bio, page); + bio_associate_blkcg_from_page(bio, page); count_swpout_vm_event(page); set_page_writeback(page); unlock_page(page); -- cgit v1.2.3-71-gd317 From 6d212db11947ae5464e4717536ed9faf61c01e86 Mon Sep 17 00:00:00 2001 From: Martin Schwidefsky Date: Mon, 15 Oct 2018 10:30:23 +0200 Subject: mm: add mm_pxd_folded checks to pgtable_bytes accounting functions The common mm code calls mm_dec_nr_pmds() and mm_dec_nr_puds() in free_pgtables() if the address range spans a full pud or pmd. If mm_dec_nr_puds/mm_dec_nr_pmds are non-empty due to configuration settings, they blindly subtract the size of the pmd or pud table from pgtable_bytes even if the pud or pmd page table layer is folded. Add explicit mm_[pmd|pud]_folded checks to the four pgtable_bytes accounting functions mm_inc_nr_puds, mm_inc_nr_pmds, mm_dec_nr_puds and mm_dec_nr_pmds. As the check for folded page tables can be overridden by the architecture, this allows keeping a correct pgtable_bytes value for platforms that use a dynamic number of page table levels. Acked-by: Kirill A.
Shutemov Signed-off-by: Martin Schwidefsky --- include/linux/mm.h | 8 ++++++++ 1 file changed, 8 insertions(+) (limited to 'include/linux') diff --git a/include/linux/mm.h b/include/linux/mm.h index daa2b8f1e9a8..a3701e91bb57 100644 --- a/include/linux/mm.h +++ b/include/linux/mm.h @@ -1742,11 +1742,15 @@ int __pud_alloc(struct mm_struct *mm, p4d_t *p4d, unsigned long address); static inline void mm_inc_nr_puds(struct mm_struct *mm) { + if (mm_pud_folded(mm)) + return; atomic_long_add(PTRS_PER_PUD * sizeof(pud_t), &mm->pgtables_bytes); } static inline void mm_dec_nr_puds(struct mm_struct *mm) { + if (mm_pud_folded(mm)) + return; atomic_long_sub(PTRS_PER_PUD * sizeof(pud_t), &mm->pgtables_bytes); } #endif @@ -1766,11 +1770,15 @@ int __pmd_alloc(struct mm_struct *mm, pud_t *pud, unsigned long address); static inline void mm_inc_nr_pmds(struct mm_struct *mm) { + if (mm_pmd_folded(mm)) + return; atomic_long_add(PTRS_PER_PMD * sizeof(pmd_t), &mm->pgtables_bytes); } static inline void mm_dec_nr_pmds(struct mm_struct *mm) { + if (mm_pmd_folded(mm)) + return; atomic_long_sub(PTRS_PER_PMD * sizeof(pmd_t), &mm->pgtables_bytes); } #endif -- cgit v1.2.3-71-gd317 From a7ad38b0dd3c1ba8d6e5a55241e875e9db8331ab Mon Sep 17 00:00:00 2001 From: Guo Ren Date: Sat, 3 Nov 2018 00:51:28 +0800 Subject: clocksource/drivers/c-sky: Add C-SKY SMP timer This driver is for the C-SKY SMP timer. It supports oneshot events only, and the clocksource wraps at 32 bits. Each CPU core has its own timer, and all timers share one clock counter input from the same clocksource. The driver uses the mfcr/mtcr instructions to access the timer registers. Signed-off-by: Guo Ren Cc: Daniel Lezcano Cc: Thomas Gleixner Signed-off-by: Daniel Lezcano --- drivers/clocksource/Kconfig | 10 +++ drivers/clocksource/Makefile | 1 + drivers/clocksource/timer-mp-csky.c | 173 ++++++++++++++++++++++++++++++++++++ include/linux/cpuhotplug.h | 1 + 4 files changed, 185 insertions(+) create mode 100644 drivers/clocksource/timer-mp-csky.c (limited to 'include/linux') diff --git a/drivers/clocksource/Kconfig b/drivers/clocksource/Kconfig index a11f4ba98b05..591c9a8649a5 100644 --- a/drivers/clocksource/Kconfig +++ b/drivers/clocksource/Kconfig @@ -620,4 +620,14 @@ config RISCV_TIMER is accessed via both the SBI and the rdcycle instruction. This is required for all RISC-V systems. +config CSKY_MP_TIMER + bool "SMP Timer for the C-SKY platform" if COMPILE_TEST + depends on CSKY + select TIMER_OF + help + Say yes here to enable C-SKY SMP timer driver used for C-SKY SMP + system. + csky,mptimer is not only used in SMP system, it also could be used + single core system. It's not a mmio reg and it use mtcr/mfcr instruction. + endmenu diff --git a/drivers/clocksource/Makefile b/drivers/clocksource/Makefile index db51b2427e8a..5ce82d39cda7 100644 --- a/drivers/clocksource/Makefile +++ b/drivers/clocksource/Makefile @@ -79,3 +79,4 @@ obj-$(CONFIG_CLKSRC_ST_LPC) += clksrc_st_lpc.o obj-$(CONFIG_X86_NUMACHIP) += numachip.o obj-$(CONFIG_ATCPIT100_TIMER) += timer-atcpit100.o obj-$(CONFIG_RISCV_TIMER) += riscv_timer.o +obj-$(CONFIG_CSKY_MP_TIMER) += timer-mp-csky.o diff --git a/drivers/clocksource/timer-mp-csky.c b/drivers/clocksource/timer-mp-csky.c new file mode 100644 index 000000000000..a8acc431a774 --- /dev/null +++ b/drivers/clocksource/timer-mp-csky.c @@ -0,0 +1,173 @@ +// SPDX-License-Identifier: GPL-2.0 +// Copyright (C) 2018 Hangzhou C-SKY Microsystems co.,ltd.
+ +#include +#include +#include +#include +#include +#include + +#include "timer-of.h" + +#define PTIM_CCVR "cr<3, 14>" +#define PTIM_CTLR "cr<0, 14>" +#define PTIM_LVR "cr<6, 14>" +#define PTIM_TSR "cr<1, 14>" + +static int csky_mptimer_irq; + +static int csky_mptimer_set_next_event(unsigned long delta, + struct clock_event_device *ce) +{ + mtcr(PTIM_LVR, delta); + + return 0; +} + +static int csky_mptimer_shutdown(struct clock_event_device *ce) +{ + mtcr(PTIM_CTLR, 0); + + return 0; +} + +static int csky_mptimer_oneshot(struct clock_event_device *ce) +{ + mtcr(PTIM_CTLR, 1); + + return 0; +} + +static int csky_mptimer_oneshot_stopped(struct clock_event_device *ce) +{ + mtcr(PTIM_CTLR, 0); + + return 0; +} + +static DEFINE_PER_CPU(struct timer_of, csky_to) = { + .flags = TIMER_OF_CLOCK, + .clkevt = { + .rating = 300, + .features = CLOCK_EVT_FEAT_PERCPU | + CLOCK_EVT_FEAT_ONESHOT, + .set_state_shutdown = csky_mptimer_shutdown, + .set_state_oneshot = csky_mptimer_oneshot, + .set_state_oneshot_stopped = csky_mptimer_oneshot_stopped, + .set_next_event = csky_mptimer_set_next_event, + }, +}; + +static irqreturn_t csky_timer_interrupt(int irq, void *dev) +{ + struct timer_of *to = this_cpu_ptr(&csky_to); + + mtcr(PTIM_TSR, 0); + + to->clkevt.event_handler(&to->clkevt); + + return IRQ_HANDLED; +} + +/* + * clock event for percpu + */ +static int csky_mptimer_starting_cpu(unsigned int cpu) +{ + struct timer_of *to = per_cpu_ptr(&csky_to, cpu); + + to->clkevt.cpumask = cpumask_of(cpu); + + clockevents_config_and_register(&to->clkevt, timer_of_rate(to), + 2, ULONG_MAX); + + enable_percpu_irq(csky_mptimer_irq, 0); + + return 0; +} + +static int csky_mptimer_dying_cpu(unsigned int cpu) +{ + disable_percpu_irq(csky_mptimer_irq); + + return 0; +} + +/* + * clock source + */ +static u64 sched_clock_read(void) +{ + return (u64)mfcr(PTIM_CCVR); +} + +static u64 clksrc_read(struct clocksource *c) +{ + return (u64)mfcr(PTIM_CCVR); +} + +struct clocksource csky_clocksource = { + .name = "csky", + .rating = 400, + .mask = CLOCKSOURCE_MASK(32), + .flags = CLOCK_SOURCE_IS_CONTINUOUS, + .read = clksrc_read, +}; + +static int __init csky_mptimer_init(struct device_node *np) +{ + int ret, cpu, cpu_rollback; + struct timer_of *to = NULL; + + /* + * Csky_mptimer is designed for C-SKY SMP multi-processors and + * every core has it's own private irq and regs for clkevt and + * clksrc. + * + * The regs is accessed by cpu instruction: mfcr/mtcr instead of + * mmio map style. So we needn't mmio-address in dts, but we still + * need to give clk and irq number. + * + * We use private irq for the mptimer and irq number is the same + * for every core. So we use request_percpu_irq() in timer_of_init. 
+ */ + csky_mptimer_irq = irq_of_parse_and_map(np, 0); + if (csky_mptimer_irq <= 0) + return -EINVAL; + + ret = request_percpu_irq(csky_mptimer_irq, csky_timer_interrupt, + "csky_mp_timer", &csky_to); + if (ret) + return -EINVAL; + + for_each_possible_cpu(cpu) { + to = per_cpu_ptr(&csky_to, cpu); + ret = timer_of_init(np, to); + if (ret) + goto rollback; + } + + clocksource_register_hz(&csky_clocksource, timer_of_rate(to)); + sched_clock_register(sched_clock_read, 32, timer_of_rate(to)); + + ret = cpuhp_setup_state(CPUHP_AP_CSKY_TIMER_STARTING, + "clockevents/csky/timer:starting", + csky_mptimer_starting_cpu, + csky_mptimer_dying_cpu); + if (ret) + return -EINVAL; + + return 0; + +rollback: + for_each_possible_cpu(cpu_rollback) { + if (cpu_rollback == cpu) + break; + + to = per_cpu_ptr(&csky_to, cpu_rollback); + timer_of_cleanup(to); + } + return -EINVAL; +} +TIMER_OF_DECLARE(csky_mptimer, "csky,mptimer", csky_mptimer_init); diff --git a/include/linux/cpuhotplug.h b/include/linux/cpuhotplug.h index caf40ad0bbc6..e0cd2baa8380 100644 --- a/include/linux/cpuhotplug.h +++ b/include/linux/cpuhotplug.h @@ -126,6 +126,7 @@ enum cpuhp_state { CPUHP_AP_MIPS_GIC_TIMER_STARTING, CPUHP_AP_ARC_TIMER_STARTING, CPUHP_AP_RISCV_TIMER_STARTING, + CPUHP_AP_CSKY_TIMER_STARTING, CPUHP_AP_KVM_STARTING, CPUHP_AP_KVM_ARM_VGIC_INIT_STARTING, CPUHP_AP_KVM_ARM_VGIC_STARTING, -- cgit v1.2.3-71-gd317 From 17b8b74c0f8dbf9b9e3301f9ca5b65dd1c079951 Mon Sep 17 00:00:00 2001 From: Jozsef Kadlecsik Date: Fri, 19 Oct 2018 19:35:19 +0200 Subject: netfilter: ipset: Correct rcu_dereference() call in ip_set_put_comment() The function is called when rcu_read_lock() is held and not when rcu_read_lock_bh() is held. Signed-off-by: Jozsef Kadlecsik Signed-off-by: Pablo Neira Ayuso --- include/linux/netfilter/ipset/ip_set_comment.h | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) (limited to 'include/linux') diff --git a/include/linux/netfilter/ipset/ip_set_comment.h b/include/linux/netfilter/ipset/ip_set_comment.h index 8e2bab1e8e90..70877f8de7e9 100644 --- a/include/linux/netfilter/ipset/ip_set_comment.h +++ b/include/linux/netfilter/ipset/ip_set_comment.h @@ -43,11 +43,11 @@ ip_set_init_comment(struct ip_set *set, struct ip_set_comment *comment, rcu_assign_pointer(comment->c, c); } -/* Used only when dumping a set, protected by rcu_read_lock_bh() */ +/* Used only when dumping a set, protected by rcu_read_lock() */ static inline int ip_set_put_comment(struct sk_buff *skb, const struct ip_set_comment *comment) { - struct ip_set_comment_rcu *c = rcu_dereference_bh(comment->c); + struct ip_set_comment_rcu *c = rcu_dereference(comment->c); if (!c) return 0; -- cgit v1.2.3-71-gd317 From 94e297c50b529f5d01cfd1dbc808d61e95180ab7 Mon Sep 17 00:00:00 2001 From: Sam Protsenko Date: Fri, 2 Nov 2018 15:47:53 -0700 Subject: include/linux/notifier.h: SRCU: fix ctags ctags indexing ("make tags" command) throws this warning: ctags: Warning: include/linux/notifier.h:125: null expansion of name pattern "\1" This is the result of DEFINE_PER_CPU() macro expansion. Fix that by getting rid of the line break. A similar fix was already done in commit 25528213fe9f ("tags: Fix DEFINE_PER_CPU expansions"), but this one probably wasn't noticed.
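For context, a minimal usage sketch of the SRCU notifier machinery this macro backs might look as follows; the names here are illustrative only and are not taken from this patch:

	/* Hypothetical example: define a chain, register a callback, notify. */
	SRCU_NOTIFIER_HEAD_STATIC(example_chain);

	static int example_event(struct notifier_block *nb,
				 unsigned long action, void *data)
	{
		/* React to the event here. */
		return NOTIFY_OK;
	}

	static struct notifier_block example_nb = {
		.notifier_call = example_event,
	};

	/* e.g. from module init and from an event path, respectively: */
	srcu_notifier_chain_register(&example_chain, &example_nb);
	srcu_notifier_call_chain(&example_chain, 0, NULL);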
Link: http://lkml.kernel.org/r/20181030202808.28027-1-semen.protsenko@linaro.org Fixes: 9c80172b902d ("kernel/SRCU: provide a static initializer") Signed-off-by: Sam Protsenko Cc: Sebastian Andrzej Siewior Cc: Andy Shevchenko Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/linux/notifier.h | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) (limited to 'include/linux') diff --git a/include/linux/notifier.h b/include/linux/notifier.h index f35c7bf76143..0096a05395e3 100644 --- a/include/linux/notifier.h +++ b/include/linux/notifier.h @@ -122,8 +122,7 @@ extern void srcu_init_notifier_head(struct srcu_notifier_head *nh); #ifdef CONFIG_TREE_SRCU #define _SRCU_NOTIFIER_HEAD(name, mod) \ - static DEFINE_PER_CPU(struct srcu_data, \ - name##_head_srcu_data); \ + static DEFINE_PER_CPU(struct srcu_data, name##_head_srcu_data); \ mod struct srcu_notifier_head name = \ SRCU_NOTIFIER_INIT(name, name##_head_srcu_data) -- cgit v1.2.3-71-gd317 From 89c83fb539f95491be80cdd5158e6f0ce329e317 Mon Sep 17 00:00:00 2001 From: Michal Hocko Date: Fri, 2 Nov 2018 15:48:31 -0700 Subject: mm, thp: consolidate THP gfp handling into alloc_hugepage_direct_gfpmask THP allocation mode is quite complex and depends on the defrag mode. Most of this complexity is currently hidden in alloc_hugepage_direct_gfpmask. The NUMA special casing (namely __GFP_THISNODE) is however independent and currently placed in alloc_pages_vma. This both adds an unnecessary branch to all vma-based page allocation requests and makes the code needlessly complex. Not to mention that, until recently, e.g. shmem THP used to do the node reclaim unconditionally, regardless of the defrag mode. This was not only unexpected behavior but also hardly a good default, and I strongly suspect it was just a side effect of the code sharing rather than a deliberate decision, which suggests that such a layering is wrong. Get rid of the THP special casing in alloc_pages_vma and move the logic to alloc_hugepage_direct_gfpmask. To preserve the current logic, __GFP_THISNODE is applied to the resulting gfp mask only when direct reclaim is not requested and there is no explicit NUMA binding. Please note that there is also a slight difference wrt MPOL_BIND now. The previous code would avoid using __GFP_THISNODE if the local node was outside of policy_nodemask(). After this patch, __GFP_THISNODE is avoided for all MPOL_BIND policies. So the difference is that if the local node is actually allowed by the bind policy's nodemask, __GFP_THISNODE used to be added but now it is not. From the behavior POV this is still correct, because the policy nodemask is used. Link: http://lkml.kernel.org/r/20180925120326.24392-3-mhocko@kernel.org Signed-off-by: Michal Hocko Acked-by: Vlastimil Babka Cc: Alex Williamson Cc: Andrea Arcangeli Cc: David Rientjes Cc: "Kirill A.
Shutemov" Cc: Mel Gorman Cc: Stefan Priebe - Profihost AG Cc: Zi Yan Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/linux/gfp.h | 12 +++------ include/linux/mempolicy.h | 2 ++ mm/huge_memory.c | 38 +++++++++++++++++++++------- mm/mempolicy.c | 63 +++-------------------------------------------- mm/shmem.c | 2 +- 5 files changed, 40 insertions(+), 77 deletions(-) (limited to 'include/linux') diff --git a/include/linux/gfp.h b/include/linux/gfp.h index 24bcc5eec6b4..76f8db0b0e71 100644 --- a/include/linux/gfp.h +++ b/include/linux/gfp.h @@ -510,22 +510,18 @@ alloc_pages(gfp_t gfp_mask, unsigned int order) } extern struct page *alloc_pages_vma(gfp_t gfp_mask, int order, struct vm_area_struct *vma, unsigned long addr, - int node, bool hugepage); -#define alloc_hugepage_vma(gfp_mask, vma, addr, order) \ - alloc_pages_vma(gfp_mask, order, vma, addr, numa_node_id(), true) + int node); #else #define alloc_pages(gfp_mask, order) \ alloc_pages_node(numa_node_id(), gfp_mask, order) -#define alloc_pages_vma(gfp_mask, order, vma, addr, node, false)\ - alloc_pages(gfp_mask, order) -#define alloc_hugepage_vma(gfp_mask, vma, addr, order) \ +#define alloc_pages_vma(gfp_mask, order, vma, addr, node)\ alloc_pages(gfp_mask, order) #endif #define alloc_page(gfp_mask) alloc_pages(gfp_mask, 0) #define alloc_page_vma(gfp_mask, vma, addr) \ - alloc_pages_vma(gfp_mask, 0, vma, addr, numa_node_id(), false) + alloc_pages_vma(gfp_mask, 0, vma, addr, numa_node_id()) #define alloc_page_vma_node(gfp_mask, vma, addr, node) \ - alloc_pages_vma(gfp_mask, 0, vma, addr, node, false) + alloc_pages_vma(gfp_mask, 0, vma, addr, node) extern unsigned long __get_free_pages(gfp_t gfp_mask, unsigned int order); extern unsigned long get_zeroed_page(gfp_t gfp_mask); diff --git a/include/linux/mempolicy.h b/include/linux/mempolicy.h index 5228c62af416..bac395f1d00a 100644 --- a/include/linux/mempolicy.h +++ b/include/linux/mempolicy.h @@ -139,6 +139,8 @@ struct mempolicy *mpol_shared_policy_lookup(struct shared_policy *sp, struct mempolicy *get_task_policy(struct task_struct *p); struct mempolicy *__get_vma_policy(struct vm_area_struct *vma, unsigned long addr); +struct mempolicy *get_vma_policy(struct vm_area_struct *vma, + unsigned long addr); bool vma_policy_mof(struct vm_area_struct *vma); extern void numa_default_policy(void); diff --git a/mm/huge_memory.c b/mm/huge_memory.c index 4e4ef8fa479d..55478ab3c83b 100644 --- a/mm/huge_memory.c +++ b/mm/huge_memory.c @@ -629,21 +629,40 @@ release: * available * never: never stall for any thp allocation */ -static inline gfp_t alloc_hugepage_direct_gfpmask(struct vm_area_struct *vma) +static inline gfp_t alloc_hugepage_direct_gfpmask(struct vm_area_struct *vma, unsigned long addr) { const bool vma_madvised = !!(vma->vm_flags & VM_HUGEPAGE); + gfp_t this_node = 0; + +#ifdef CONFIG_NUMA + struct mempolicy *pol; + /* + * __GFP_THISNODE is used only when __GFP_DIRECT_RECLAIM is not + * specified, to express a general desire to stay on the current + * node for optimistic allocation attempts. If the defrag mode + * and/or madvise hint requires the direct reclaim then we prefer + * to fallback to other node rather than node reclaim because that + * can lead to excessive reclaim even though there is free memory + * on other nodes. We expect that NUMA preferences are specified + * by memory policies. 
+ */ + pol = get_vma_policy(vma, addr); + if (pol->mode != MPOL_BIND) + this_node = __GFP_THISNODE; + mpol_cond_put(pol); +#endif if (test_bit(TRANSPARENT_HUGEPAGE_DEFRAG_DIRECT_FLAG, &transparent_hugepage_flags)) return GFP_TRANSHUGE | (vma_madvised ? 0 : __GFP_NORETRY); if (test_bit(TRANSPARENT_HUGEPAGE_DEFRAG_KSWAPD_FLAG, &transparent_hugepage_flags)) - return GFP_TRANSHUGE_LIGHT | __GFP_KSWAPD_RECLAIM; + return GFP_TRANSHUGE_LIGHT | __GFP_KSWAPD_RECLAIM | this_node; if (test_bit(TRANSPARENT_HUGEPAGE_DEFRAG_KSWAPD_OR_MADV_FLAG, &transparent_hugepage_flags)) return GFP_TRANSHUGE_LIGHT | (vma_madvised ? __GFP_DIRECT_RECLAIM : - __GFP_KSWAPD_RECLAIM); + __GFP_KSWAPD_RECLAIM | this_node); if (test_bit(TRANSPARENT_HUGEPAGE_DEFRAG_REQ_MADV_FLAG, &transparent_hugepage_flags)) return GFP_TRANSHUGE_LIGHT | (vma_madvised ? __GFP_DIRECT_RECLAIM : - 0); - return GFP_TRANSHUGE_LIGHT; + this_node); + return GFP_TRANSHUGE_LIGHT | this_node; } /* Caller must hold page table lock. */ @@ -715,8 +734,8 @@ vm_fault_t do_huge_pmd_anonymous_page(struct vm_fault *vmf) pte_free(vma->vm_mm, pgtable); return ret; } - gfp = alloc_hugepage_direct_gfpmask(vma); - page = alloc_hugepage_vma(gfp, vma, haddr, HPAGE_PMD_ORDER); + gfp = alloc_hugepage_direct_gfpmask(vma, haddr); + page = alloc_pages_vma(gfp, HPAGE_PMD_ORDER, vma, haddr, numa_node_id()); if (unlikely(!page)) { count_vm_event(THP_FAULT_FALLBACK); return VM_FAULT_FALLBACK; @@ -1286,8 +1305,9 @@ vm_fault_t do_huge_pmd_wp_page(struct vm_fault *vmf, pmd_t orig_pmd) alloc: if (transparent_hugepage_enabled(vma) && !transparent_hugepage_debug_cow()) { - huge_gfp = alloc_hugepage_direct_gfpmask(vma); - new_page = alloc_hugepage_vma(huge_gfp, vma, haddr, HPAGE_PMD_ORDER); + huge_gfp = alloc_hugepage_direct_gfpmask(vma, haddr); + new_page = alloc_pages_vma(huge_gfp, HPAGE_PMD_ORDER, vma, + haddr, numa_node_id()); } else new_page = NULL; diff --git a/mm/mempolicy.c b/mm/mempolicy.c index 58fb833fce0c..5837a067124d 100644 --- a/mm/mempolicy.c +++ b/mm/mempolicy.c @@ -1116,8 +1116,8 @@ static struct page *new_page(struct page *page, unsigned long start) } else if (PageTransHuge(page)) { struct page *thp; - thp = alloc_hugepage_vma(GFP_TRANSHUGE, vma, address, - HPAGE_PMD_ORDER); + thp = alloc_pages_vma(GFP_TRANSHUGE, HPAGE_PMD_ORDER, vma, + address, numa_node_id()); if (!thp) return NULL; prep_transhuge_page(thp); @@ -1662,7 +1662,7 @@ struct mempolicy *__get_vma_policy(struct vm_area_struct *vma, * freeing by another task. It is the caller's responsibility to free the * extra reference for shared policies. */ -static struct mempolicy *get_vma_policy(struct vm_area_struct *vma, +struct mempolicy *get_vma_policy(struct vm_area_struct *vma, unsigned long addr) { struct mempolicy *pol = __get_vma_policy(vma, addr); @@ -2011,7 +2011,6 @@ static struct page *alloc_page_interleave(gfp_t gfp, unsigned order, * @vma: Pointer to VMA or NULL if not available. * @addr: Virtual Address of the allocation. Must be inside the VMA. * @node: Which node to prefer for allocation (modulo policy). - * @hugepage: for hugepages try only the preferred node if possible * * This function allocates a page from the kernel page pool and applies * a NUMA policy associated with the VMA or the current process. 
@@ -2022,7 +2021,7 @@ static struct page *alloc_page_interleave(gfp_t gfp, unsigned order, */ struct page * alloc_pages_vma(gfp_t gfp, int order, struct vm_area_struct *vma, - unsigned long addr, int node, bool hugepage) + unsigned long addr, int node) { struct mempolicy *pol; struct page *page; @@ -2040,60 +2039,6 @@ alloc_pages_vma(gfp_t gfp, int order, struct vm_area_struct *vma, goto out; } - if (unlikely(IS_ENABLED(CONFIG_TRANSPARENT_HUGEPAGE) && hugepage)) { - int hpage_node = node; - - /* - * For hugepage allocation and non-interleave policy which - * allows the current node (or other explicitly preferred - * node) we only try to allocate from the current/preferred - * node and don't fall back to other nodes, as the cost of - * remote accesses would likely offset THP benefits. - * - * If the policy is interleave, or does not allow the current - * node in its nodemask, we allocate the standard way. - */ - if (pol->mode == MPOL_PREFERRED && - !(pol->flags & MPOL_F_LOCAL)) - hpage_node = pol->v.preferred_node; - - nmask = policy_nodemask(gfp, pol); - if (!nmask || node_isset(hpage_node, *nmask)) { - mpol_cond_put(pol); - /* - * We cannot invoke reclaim if __GFP_THISNODE - * is set. Invoking reclaim with - * __GFP_THISNODE set, would cause THP - * allocations to trigger heavy swapping - * despite there may be tons of free memory - * (including potentially plenty of THP - * already available in the buddy) on all the - * other NUMA nodes. - * - * At most we could invoke compaction when - * __GFP_THISNODE is set (but we would need to - * refrain from invoking reclaim even if - * compaction returned COMPACT_SKIPPED because - * there wasn't not enough memory to succeed - * compaction). For now just avoid - * __GFP_THISNODE instead of limiting the - * allocation path to a strict and single - * compaction invocation. - * - * Supposedly if direct reclaim was enabled by - * the caller, the app prefers THP regardless - * of the node it comes from so this would be - * more desiderable behavior than only - * providing THP originated from the local - * node in such case. - */ - if (!(gfp & __GFP_DIRECT_RECLAIM)) - gfp |= __GFP_THISNODE; - page = __alloc_pages_node(hpage_node, gfp, order); - goto out; - } - } - nmask = policy_nodemask(gfp, pol); preferred_nid = policy_node(gfp, pol, node); page = __alloc_pages_nodemask(gfp, order, preferred_nid, nmask); diff --git a/mm/shmem.c b/mm/shmem.c index 56bf122e0bb4..ea26d7a0342d 100644 --- a/mm/shmem.c +++ b/mm/shmem.c @@ -1435,7 +1435,7 @@ static struct page *shmem_alloc_hugepage(gfp_t gfp, shmem_pseudo_vma_init(&pvma, info, hindex); page = alloc_pages_vma(gfp | __GFP_COMP | __GFP_NORETRY | __GFP_NOWARN, - HPAGE_PMD_ORDER, &pvma, 0, numa_node_id(), true); + HPAGE_PMD_ORDER, &pvma, 0, numa_node_id()); shmem_pseudo_vma_destroy(&pvma); if (page) prep_transhuge_page(page); -- cgit v1.2.3-71-gd317 From 3e59020abf0f88182730527ee5b862e786eb485a Mon Sep 17 00:00:00 2001 From: Eric Dumazet Date: Wed, 31 Oct 2018 08:39:12 -0700 Subject: net: bql: add __netdev_tx_sent_queue() When qdisc_run() tries to use BQL budget to bulk-dequeue a batch of packets, GSO can later transform this list into another list of skbs, and each skb is sent to the device ndo_start_xmit(), one at a time, with skb->xmit_more set for all but the last skb. The problem is that, very often, the BQL limit is hit in the middle of the packet train, forcing dev_hard_start_xmit() to stop the bulk send and requeue the end of the list.
BQL's role is to avoid head-of-line blocking, making sure a qdisc can deliver high priority packets before low priority ones. But there is no way requeued packets can be bypassed by fresh packets in the qdisc. Aborting the bulk send increases TX softirqs, and hot cache lines (after skb_segment()) are wasted. Note that for TSO packets, we never split a packet in the middle because of the BQL limit being hit. Drivers should be able to update BQL counters without flipping/caring about BQL status, if the current skb has xmit_more set. Upper layers are ultimately responsible for stopping the next packet train when the BQL limit is hit. The code template in a driver might look like the following: send_doorbell = __netdev_tx_sent_queue(tx_queue, nr_bytes, skb->xmit_more); Note that use of __netdev_tx_sent_queue() is not mandatory, since a following patch will change dev_hard_start_xmit() to not care about BQL status. But it is highly recommended so that the full benefits of xmit_more can be reached (fewer doorbells sent, and fewer atomic operations as well) Signed-off-by: Eric Dumazet Signed-off-by: David S. Miller --- include/linux/netdevice.h | 20 ++++++++++++++++++++ 1 file changed, 20 insertions(+) (limited to 'include/linux') diff --git a/include/linux/netdevice.h b/include/linux/netdevice.h index dc1d9ed33b31..857f8abf7b91 100644 --- a/include/linux/netdevice.h +++ b/include/linux/netdevice.h @@ -3190,6 +3190,26 @@ static inline void netdev_tx_sent_queue(struct netdev_queue *dev_queue, #endif } +/* Variant of netdev_tx_sent_queue() for drivers that are aware + * that they should not test BQL status themselves. + * We do want to change __QUEUE_STATE_STACK_XOFF only for the last + * skb of a batch. + * Returns true if the doorbell must be used to kick the NIC. + */ +static inline bool __netdev_tx_sent_queue(struct netdev_queue *dev_queue, + unsigned int bytes, + bool xmit_more) +{ + if (xmit_more) { +#ifdef CONFIG_BQL + dql_queued(&dev_queue->dql, bytes); +#endif + return netif_tx_queue_stopped(dev_queue); + } + netdev_tx_sent_queue(dev_queue, bytes); + return true; +} + /** * netdev_sent_queue - report the number of bytes queued to hardware * @dev: network device -- cgit v1.2.3-71-gd317 From 35b69a420bfb56b7b74cb635ea903db05e357bec Mon Sep 17 00:00:00 2001 From: Michael Kelley Date: Sun, 4 Nov 2018 03:48:54 +0000 Subject: clockevents/drivers/i8253: Add support for PIT shutdown quirk Add support for platforms where pit_shutdown() doesn't work because of a quirk in the PIT emulation. On these platforms, setting the counter register to zero causes the PIT to start running again, negating the shutdown. Provide a global variable that controls whether the counter register is zeroed, which platform-specific code can override.
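As an illustration (not part of this patch), opting out of the counter clear from platform code is a one-line assignment; a hypothetical quirk handler might look like this, with the function name chosen for the example:

	/* Sketch: the emulated PIT restarts when its counter is zeroed,
	 * so tell pit_shutdown() to skip the clear on this platform.
	 */
	static void __init example_platform_quirks(void)
	{
		i8253_clear_counter_on_shutdown = false;
	}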
Signed-off-by: Michael Kelley Signed-off-by: Thomas Gleixner Cc: "gregkh@linuxfoundation.org" Cc: "devel@linuxdriverproject.org" Cc: "daniel.lezcano@linaro.org" Cc: "virtualization@lists.linux-foundation.org" Cc: "jgross@suse.com" Cc: "akataria@vmware.com" Cc: "olaf@aepfle.de" Cc: "apw@canonical.com" Cc: vkuznets Cc: "jasowang@redhat.com" Cc: "marcelo.cerri@canonical.com" Cc: KY Srinivasan Cc: stable@vger.kernel.org Link: https://lkml.kernel.org/r/1541303219-11142-2-git-send-email-mikelley@microsoft.com --- drivers/clocksource/i8253.c | 14 ++++++++++++-- include/linux/i8253.h | 1 + 2 files changed, 13 insertions(+), 2 deletions(-) (limited to 'include/linux') diff --git a/drivers/clocksource/i8253.c b/drivers/clocksource/i8253.c index 9c38895542f4..d4350bb10b83 100644 --- a/drivers/clocksource/i8253.c +++ b/drivers/clocksource/i8253.c @@ -20,6 +20,13 @@ DEFINE_RAW_SPINLOCK(i8253_lock); EXPORT_SYMBOL(i8253_lock); +/* + * Handle PIT quirk in pit_shutdown() where zeroing the counter register + * restarts the PIT, negating the shutdown. On platforms with the quirk, + * platform specific code can set this to false. + */ +bool i8253_clear_counter_on_shutdown __ro_after_init = true; + #ifdef CONFIG_CLKSRC_I8253 /* * Since the PIT overflows every tick, its not very useful @@ -109,8 +116,11 @@ static int pit_shutdown(struct clock_event_device *evt) raw_spin_lock(&i8253_lock); outb_p(0x30, PIT_MODE); - outb_p(0, PIT_CH0); - outb_p(0, PIT_CH0); + + if (i8253_clear_counter_on_shutdown) { + outb_p(0, PIT_CH0); + outb_p(0, PIT_CH0); + } raw_spin_unlock(&i8253_lock); return 0; diff --git a/include/linux/i8253.h b/include/linux/i8253.h index e6bb36a97519..8336b2f6f834 100644 --- a/include/linux/i8253.h +++ b/include/linux/i8253.h @@ -21,6 +21,7 @@ #define PIT_LATCH ((PIT_TICK_RATE + HZ/2) / HZ) extern raw_spinlock_t i8253_lock; +extern bool i8253_clear_counter_on_shutdown; extern struct clock_event_device i8253_clockevent; extern void clockevent_i8253_init(bool oneshot); -- cgit v1.2.3-71-gd317 From d098093ba06eb032057d1aca1c2e45889e099d00 Mon Sep 17 00:00:00 2001 From: Boris Brezillon Date: Sun, 28 Oct 2018 12:29:55 +0100 Subject: mtd: nand: Fix nanddev_neraseblocks() nanddev_neraseblocks() currently returns the number of pages per LUN instead of the total number of eraseblocks. Fixes: 9c3736a3de21 ("mtd: nand: Add core infrastructure to deal with NAND devices") Cc: Signed-off-by: Boris Brezillon Reviewed-by: Miquel Raynal --- include/linux/mtd/nand.h | 5 ++--- 1 file changed, 2 insertions(+), 3 deletions(-) (limited to 'include/linux') diff --git a/include/linux/mtd/nand.h b/include/linux/mtd/nand.h index abe975c87b90..78b86dea2f29 100644 --- a/include/linux/mtd/nand.h +++ b/include/linux/mtd/nand.h @@ -324,9 +324,8 @@ static inline unsigned int nanddev_ntargets(const struct nand_device *nand) */ static inline unsigned int nanddev_neraseblocks(const struct nand_device *nand) { - return (u64)nand->memorg.luns_per_target * - nand->memorg.eraseblocks_per_lun * - nand->memorg.pages_per_eraseblock; + return nand->memorg.ntargets * nand->memorg.luns_per_target * + nand->memorg.eraseblocks_per_lun; } /** -- cgit v1.2.3-71-gd317 From 163c8d54a997153ee1a1e07fcac087492ad85b37 Mon Sep 17 00:00:00 2001 From: Martin Schwidefsky Date: Mon, 5 Nov 2018 07:36:28 +0100 Subject: compiler: remove __no_sanitize_address_or_inline again The __no_sanitize_address_or_inline and __no_kasan_or_inline defines are almost identical. The only difference is that __no_kasan_or_inline does not have the 'notrace' attribute.
To be able to replace __no_sanitize_address_or_inline with the older definition, add 'notrace' to __no_kasan_or_inline and change the two users of __no_sanitize_address_or_inline in the s390 code. The 'notrace' option is necessary for e.g. the __load_psw_mask function in arch/s390/include/asm/processor.h. Without it, __load_psw_mask can be traced, which leads to a kernel stack overflow. Signed-off-by: Martin Schwidefsky Pointed-out-by: Andrey Ryabinin Acked-by: Steven Rostedt (VMware) Signed-off-by: Linus Torvalds --- arch/s390/include/asm/processor.h | 4 ++-- include/linux/compiler-gcc.h | 12 ------------ include/linux/compiler.h | 2 +- 3 files changed, 3 insertions(+), 15 deletions(-) (limited to 'include/linux') diff --git a/arch/s390/include/asm/processor.h b/arch/s390/include/asm/processor.h index 302795c47c06..81038ab357ce 100644 --- a/arch/s390/include/asm/processor.h +++ b/arch/s390/include/asm/processor.h @@ -236,7 +236,7 @@ static inline unsigned long current_stack_pointer(void) return sp; } -static __no_sanitize_address_or_inline unsigned short stap(void) +static __no_kasan_or_inline unsigned short stap(void) { unsigned short cpu_address; @@ -330,7 +330,7 @@ static inline void __load_psw(psw_t psw) * Set PSW mask to specified value, while leaving the * PSW addr pointing to the next instruction. */ -static __no_sanitize_address_or_inline void __load_psw_mask(unsigned long mask) +static __no_kasan_or_inline void __load_psw_mask(unsigned long mask) { unsigned long addr; psw_t psw; diff --git a/include/linux/compiler-gcc.h b/include/linux/compiler-gcc.h index c0f5db3a9621..2010493e1040 100644 --- a/include/linux/compiler-gcc.h +++ b/include/linux/compiler-gcc.h @@ -143,18 +143,6 @@ #define KASAN_ABI_VERSION 3 #endif -/* - * Because __no_sanitize_address conflicts with inlining: - * https://gcc.gnu.org/bugzilla/show_bug.cgi?id=67368 - * we do one or the other. - */ -#ifdef CONFIG_KASAN -#define __no_sanitize_address_or_inline \ - __no_sanitize_address __maybe_unused notrace -#else -#define __no_sanitize_address_or_inline inline -#endif - #if GCC_VERSION >= 50100 #define COMPILER_HAS_GENERIC_BUILTIN_OVERFLOW 1 #endif diff --git a/include/linux/compiler.h b/include/linux/compiler.h index 18c80cfa4fc4..06396c1cf127 100644 --- a/include/linux/compiler.h +++ b/include/linux/compiler.h @@ -189,7 +189,7 @@ void __read_once_size(const volatile void *p, void *res, int size) * https://gcc.gnu.org/bugzilla/show_bug.cgi?id=67368 * '__maybe_unused' allows us to avoid defined-but-not-used warnings. */ -# define __no_kasan_or_inline __no_sanitize_address __maybe_unused +# define __no_kasan_or_inline __no_sanitize_address notrace __maybe_unused #else # define __no_kasan_or_inline __always_inline #endif -- cgit v1.2.3-71-gd317 From 4c0608f4a0e76dfb82d3accd20081f4bf47ed143 Mon Sep 17 00:00:00 2001 From: Matthew Wilcox Date: Tue, 30 Oct 2018 09:45:55 -0400 Subject: XArray: Regularise xa_reserve The xa_reserve() function was a little unusual in that it attempted to be callable for all kinds of locking scenarios. Make it look like the other APIs with __xa_reserve, xa_reserve_bh and xa_reserve_irq variants.
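For illustration, the intended calling pattern of the reserve API is roughly the following; a sketch only, assuming an XArray called xa and an index the caller wants to prepopulate:

	/* Reserve the slot while sleeping allocations are still allowed. */
	if (xa_reserve(&xa, index, GFP_KERNEL))
		return -ENOMEM;

	/* Later, possibly from a context that must not allocate memory: */
	xa_store(&xa, index, entry, GFP_NOWAIT);

	/* Or, if the reserved slot ends up unused: */
	xa_release(&xa, index);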
Signed-off-by: Matthew Wilcox --- Documentation/core-api/xarray.rst | 13 +++++++ include/linux/xarray.h | 80 ++++++++++++++++++++++++++++++++++++++- lib/test_xarray.c | 6 +++ lib/xarray.c | 18 ++++----- 4 files changed, 105 insertions(+), 12 deletions(-) (limited to 'include/linux') diff --git a/Documentation/core-api/xarray.rst b/Documentation/core-api/xarray.rst index a4e705108f42..65c77a81b689 100644 --- a/Documentation/core-api/xarray.rst +++ b/Documentation/core-api/xarray.rst @@ -105,6 +105,15 @@ may result in the entry being marked at some, but not all of the other indices. Storing into one index may result in the entry retrieved by some, but not all of the other indices changing. +Sometimes you need to ensure that a subsequent call to :c:func:`xa_store` +will not need to allocate memory. The :c:func:`xa_reserve` function +will store a reserved entry at the indicated index. Users of the normal +API will see this entry as containing ``NULL``. If you do not need to +use the reserved entry, you can call :c:func:`xa_release` to remove the +unused entry. If another user has stored to the entry in the meantime, +:c:func:`xa_release` will do nothing; if instead you want the entry to +become ``NULL``, you should use :c:func:`xa_erase`. + Finally, you can remove all entries from an XArray by calling :c:func:`xa_destroy`. If the XArray entries are pointers, you may wish to free the entries first. You can do this by iterating over all present @@ -167,6 +176,9 @@ Takes xa_lock internally: * :c:func:`xa_alloc` * :c:func:`xa_alloc_bh` * :c:func:`xa_alloc_irq` + * :c:func:`xa_reserve` + * :c:func:`xa_reserve_bh` + * :c:func:`xa_reserve_irq` * :c:func:`xa_destroy` * :c:func:`xa_set_mark` * :c:func:`xa_clear_mark` @@ -177,6 +189,7 @@ Assumes xa_lock held on entry: * :c:func:`__xa_erase` * :c:func:`__xa_cmpxchg` * :c:func:`__xa_alloc` + * :c:func:`__xa_reserve` * :c:func:`__xa_set_mark` * :c:func:`__xa_clear_mark` diff --git a/include/linux/xarray.h b/include/linux/xarray.h index d9514928ddac..c2cb0426c60c 100644 --- a/include/linux/xarray.h +++ b/include/linux/xarray.h @@ -291,7 +291,6 @@ void *xa_load(struct xarray *, unsigned long index); void *xa_store(struct xarray *, unsigned long index, void *entry, gfp_t); void *xa_cmpxchg(struct xarray *, unsigned long index, void *old, void *entry, gfp_t); -int xa_reserve(struct xarray *, unsigned long index, gfp_t); void *xa_store_range(struct xarray *, unsigned long first, unsigned long last, void *entry, gfp_t); bool xa_get_mark(struct xarray *, unsigned long index, xa_mark_t); @@ -455,6 +454,7 @@ void *__xa_store(struct xarray *, unsigned long index, void *entry, gfp_t); void *__xa_cmpxchg(struct xarray *, unsigned long index, void *old, void *entry, gfp_t); int __xa_alloc(struct xarray *, u32 *id, u32 max, void *entry, gfp_t); +int __xa_reserve(struct xarray *, unsigned long index, gfp_t); void __xa_set_mark(struct xarray *, unsigned long index, xa_mark_t); void __xa_clear_mark(struct xarray *, unsigned long index, xa_mark_t); @@ -621,6 +621,84 @@ static inline int xa_alloc_irq(struct xarray *xa, u32 *id, u32 max, void *entry, return err; } +/** + * xa_reserve() - Reserve this index in the XArray. + * @xa: XArray. + * @index: Index into array. + * @gfp: Memory allocation flags. + * + * Ensures there is somewhere to store an entry at @index in the array. + * If there is already something stored at @index, this function does + * nothing. If there was nothing there, the entry is marked as reserved. 
+ * Loading from a reserved entry returns a %NULL pointer. + * + * If you do not use the entry that you have reserved, call xa_release() + * or xa_erase() to free any unnecessary memory. + * + * Context: Any context. Takes and releases the xa_lock. + * May sleep if the @gfp flags permit. + * Return: 0 if the reservation succeeded or -ENOMEM if it failed. + */ +static inline +int xa_reserve(struct xarray *xa, unsigned long index, gfp_t gfp) +{ + int ret; + + xa_lock(xa); + ret = __xa_reserve(xa, index, gfp); + xa_unlock(xa); + + return ret; +} + +/** + * xa_reserve_bh() - Reserve this index in the XArray. + * @xa: XArray. + * @index: Index into array. + * @gfp: Memory allocation flags. + * + * A softirq-disabling version of xa_reserve(). + * + * Context: Any context. Takes and releases the xa_lock while + * disabling softirqs. + * Return: 0 if the reservation succeeded or -ENOMEM if it failed. + */ +static inline +int xa_reserve_bh(struct xarray *xa, unsigned long index, gfp_t gfp) +{ + int ret; + + xa_lock_bh(xa); + ret = __xa_reserve(xa, index, gfp); + xa_unlock_bh(xa); + + return ret; +} + +/** + * xa_reserve_irq() - Reserve this index in the XArray. + * @xa: XArray. + * @index: Index into array. + * @gfp: Memory allocation flags. + * + * An interrupt-disabling version of xa_reserve(). + * + * Context: Process context. Takes and releases the xa_lock while + * disabling interrupts. + * Return: 0 if the reservation succeeded or -ENOMEM if it failed. + */ +static inline +int xa_reserve_irq(struct xarray *xa, unsigned long index, gfp_t gfp) +{ + int ret; + + xa_lock_irq(xa); + ret = __xa_reserve(xa, index, gfp); + xa_unlock_irq(xa); + + return ret; +} + /* Everything below here is the Advanced API. Proceed with caution. */ /* diff --git a/lib/test_xarray.c b/lib/test_xarray.c index 126127658b49..e5294b20b52f 100644 --- a/lib/test_xarray.c +++ b/lib/test_xarray.c @@ -373,6 +373,12 @@ static noinline void check_reserve(struct xarray *xa) xa_erase_index(xa, 12345678); XA_BUG_ON(xa, !xa_empty(xa)); + /* And so does xa_insert */ + xa_reserve(xa, 12345678, GFP_KERNEL); + XA_BUG_ON(xa, xa_insert(xa, 12345678, xa_mk_value(12345678), 0) != 0); + xa_erase_index(xa, 12345678); + XA_BUG_ON(xa, !xa_empty(xa)); + /* Can iterate through a reserved entry */ xa_store_index(xa, 5, GFP_KERNEL); xa_reserve(xa, 6, GFP_KERNEL); diff --git a/lib/xarray.c b/lib/xarray.c index e7be4e47c6a9..9cab8cfef8a8 100644 --- a/lib/xarray.c +++ b/lib/xarray.c @@ -1488,7 +1488,7 @@ void *__xa_cmpxchg(struct xarray *xa, unsigned long index, EXPORT_SYMBOL(__xa_cmpxchg); /** - * xa_reserve() - Reserve this index in the XArray. + * __xa_reserve() - Reserve this index in the XArray. * @xa: XArray. * @index: Index into array. * @gfp: Memory allocation flags. @@ -1496,33 +1496,29 @@ EXPORT_SYMBOL(__xa_cmpxchg); * Ensures there is somewhere to store an entry at @index in the array. * If there is already something stored at @index, this function does * nothing. If there was nothing there, the entry is marked as reserved. - * Loads from @index will continue to see a %NULL pointer until a - * subsequent store to @index. + * Loading from a reserved entry returns a %NULL pointer. * * If you do not use the entry that you have reserved, call xa_release() * or xa_erase() to free any unnecessary memory. * - * Context: Process context. Takes and releases the xa_lock, IRQ or BH safe - * if specified in XArray flags. May sleep if the @gfp flags permit. + * Context: Any context. Expects the xa_lock to be held on entry. 
May + * release the lock, sleep and reacquire the lock if the @gfp flags permit. * Return: 0 if the reservation succeeded or -ENOMEM if it failed. */ -int xa_reserve(struct xarray *xa, unsigned long index, gfp_t gfp) +int __xa_reserve(struct xarray *xa, unsigned long index, gfp_t gfp) { XA_STATE(xas, xa, index); - unsigned int lock_type = xa_lock_type(xa); void *curr; do { - xas_lock_type(&xas, lock_type); curr = xas_load(&xas); if (!curr) xas_store(&xas, XA_ZERO_ENTRY); - xas_unlock_type(&xas, lock_type); - } while (xas_nomem(&xas, gfp)); + } while (__xas_nomem(&xas, gfp)); return xas_error(&xas); } -EXPORT_SYMBOL(xa_reserve); +EXPORT_SYMBOL(__xa_reserve); #ifdef CONFIG_XARRAY_MULTI static void xas_set_range(struct xa_state *xas, unsigned long first, -- cgit v1.2.3-71-gd317 From c5beb07e7a06b24f4f27304f6282b5dbd929543b Mon Sep 17 00:00:00 2001 From: Matthew Wilcox Date: Wed, 31 Oct 2018 14:39:28 -0400 Subject: XArray: Unify xa_cmpxchg and __xa_cmpxchg xa_cmpxchg() was one of the largest functions in the xarray implementation. By turning it into a wrapper and having the callers take the lock (like several other functions), we save 160 bytes on a tinyconfig build and reduce the duplication in xarray.c. Signed-off-by: Matthew Wilcox --- include/linux/xarray.h | 113 ++++++++++++++++++++++++++++++------------------- lib/xarray.c | 41 ------------------ 2 files changed, 69 insertions(+), 85 deletions(-) (limited to 'include/linux') diff --git a/include/linux/xarray.h b/include/linux/xarray.h index c2cb0426c60c..8e59d4fbd55e 100644 --- a/include/linux/xarray.h +++ b/include/linux/xarray.h @@ -289,8 +289,6 @@ struct xarray { void xa_init_flags(struct xarray *, gfp_t flags); void *xa_load(struct xarray *, unsigned long index); void *xa_store(struct xarray *, unsigned long index, void *entry, gfp_t); -void *xa_cmpxchg(struct xarray *, unsigned long index, - void *old, void *entry, gfp_t); void *xa_store_range(struct xarray *, unsigned long first, unsigned long last, void *entry, gfp_t); bool xa_get_mark(struct xarray *, unsigned long index, xa_mark_t); @@ -359,48 +357,6 @@ static inline void *xa_erase(struct xarray *xa, unsigned long index) return xa_store(xa, index, NULL, 0); } -/** - * xa_insert() - Store this entry in the XArray unless another entry is - * already present. - * @xa: XArray. - * @index: Index into array. - * @entry: New entry. - * @gfp: Memory allocation flags. - * - * If you would rather see the existing entry in the array, use xa_cmpxchg(). - * This function is for users who don't care what the entry is, only that - * one is present. - * - * Context: Process context. Takes and releases the xa_lock. - * May sleep if the @gfp flags permit. - * Return: 0 if the store succeeded. -EEXIST if another entry was present. - * -ENOMEM if memory could not be allocated. - */ -static inline int xa_insert(struct xarray *xa, unsigned long index, - void *entry, gfp_t gfp) -{ - void *curr = xa_cmpxchg(xa, index, NULL, entry, gfp); - if (!curr) - return 0; - if (xa_is_err(curr)) - return xa_err(curr); - return -EEXIST; -} - -/** - * xa_release() - Release a reserved entry. - * @xa: XArray. - * @index: Index of entry. - * - * After calling xa_reserve(), you can call this function to release the - * reservation. If the entry at @index has been stored to, this function - * will do nothing. - */ -static inline void xa_release(struct xarray *xa, unsigned long index) -{ - xa_cmpxchg(xa, index, NULL, NULL, 0); -} - /** * xa_for_each() - Iterate over a portion of an XArray. * @xa: XArray. 
@@ -534,6 +490,61 @@ static inline void *xa_erase_irq(struct xarray *xa, unsigned long index) return entry; } +/** + * xa_cmpxchg() - Conditionally replace an entry in the XArray. + * @xa: XArray. + * @index: Index into array. + * @old: Old value to test against. + * @entry: New value to place in array. + * @gfp: Memory allocation flags. + * + * If the entry at @index is the same as @old, replace it with @entry. + * If the return value is equal to @old, then the exchange was successful. + * + * Context: Any context. Takes and releases the xa_lock. May sleep + * if the @gfp flags permit. + * Return: The old value at this index or xa_err() if an error happened. + */ +static inline void *xa_cmpxchg(struct xarray *xa, unsigned long index, + void *old, void *entry, gfp_t gfp) +{ + void *curr; + + xa_lock(xa); + curr = __xa_cmpxchg(xa, index, old, entry, gfp); + xa_unlock(xa); + + return curr; +} + +/** + * xa_insert() - Store this entry in the XArray unless another entry is + * already present. + * @xa: XArray. + * @index: Index into array. + * @entry: New entry. + * @gfp: Memory allocation flags. + * + * If you would rather see the existing entry in the array, use xa_cmpxchg(). + * This function is for users who don't care what the entry is, only that + * one is present. + * + * Context: Process context. Takes and releases the xa_lock. + * May sleep if the @gfp flags permit. + * Return: 0 if the store succeeded. -EEXIST if another entry was present. + * -ENOMEM if memory could not be allocated. + */ +static inline int xa_insert(struct xarray *xa, unsigned long index, + void *entry, gfp_t gfp) +{ + void *curr = xa_cmpxchg(xa, index, NULL, entry, gfp); + if (!curr) + return 0; + if (xa_is_err(curr)) + return xa_err(curr); + return -EEXIST; +} + /** * xa_alloc() - Find somewhere to store this entry in the XArray. * @xa: XArray. @@ -699,6 +710,20 @@ int xa_reserve_irq(struct xarray *xa, unsigned long index, gfp_t gfp) return ret; } +/** + * xa_release() - Release a reserved entry. + * @xa: XArray. + * @index: Index of entry. + * + * After calling xa_reserve(), you can call this function to release the + * reservation. If the entry at @index has been stored to, this function + * will do nothing. + */ +static inline void xa_release(struct xarray *xa, unsigned long index) +{ + xa_cmpxchg(xa, index, NULL, NULL, 0); +} + /* Everything below here is the Advanced API. Proceed with caution. */ /* diff --git a/lib/xarray.c b/lib/xarray.c index 9cab8cfef8a8..77671d4a7910 100644 --- a/lib/xarray.c +++ b/lib/xarray.c @@ -1406,47 +1406,6 @@ void *__xa_store(struct xarray *xa, unsigned long index, void *entry, gfp_t gfp) } EXPORT_SYMBOL(__xa_store); -/** - * xa_cmpxchg() - Conditionally replace an entry in the XArray. - * @xa: XArray. - * @index: Index into array. - * @old: Old value to test against. - * @entry: New value to place in array. - * @gfp: Memory allocation flags. - * - * If the entry at @index is the same as @old, replace it with @entry. - * If the return value is equal to @old, then the exchange was successful. - * - * Context: Process context. Takes and releases the xa_lock. May sleep - * if the @gfp flags permit. - * Return: The old value at this index or xa_err() if an error happened. 
- */ -void *xa_cmpxchg(struct xarray *xa, unsigned long index, - void *old, void *entry, gfp_t gfp) -{ - XA_STATE(xas, xa, index); - void *curr; - - if (WARN_ON_ONCE(xa_is_internal(entry))) - return XA_ERROR(-EINVAL); - - do { - xas_lock(&xas); - curr = xas_load(&xas); - if (curr == XA_ZERO_ENTRY) - curr = NULL; - if (curr == old) { - xas_store(&xas, entry); - if (xa_track_free(xa) && entry) - xas_clear_mark(&xas, XA_FREE_MARK); - } - xas_unlock(&xas); - } while (xas_nomem(&xas, gfp)); - - return xas_result(&xas, curr); -} -EXPORT_SYMBOL(xa_cmpxchg); - /** * __xa_cmpxchg() - Store this entry in the XArray. * @xa: XArray. -- cgit v1.2.3-71-gd317 From 9c16bb88905456a9b1299338041f05fa7699971b Mon Sep 17 00:00:00 2001 From: Matthew Wilcox Date: Mon, 5 Nov 2018 15:48:49 -0500 Subject: XArray: Turn xa_erase into an exported function Make xa_erase() take the spinlock and then call __xa_erase(), but make it out of line since it's such a common function. Signed-off-by: Matthew Wilcox --- include/linux/xarray.h | 18 +----------------- lib/xarray.c | 24 ++++++++++++++++++++++++ 2 files changed, 25 insertions(+), 17 deletions(-) (limited to 'include/linux') diff --git a/include/linux/xarray.h b/include/linux/xarray.h index 8e59d4fbd55e..4c839c17a99b 100644 --- a/include/linux/xarray.h +++ b/include/linux/xarray.h @@ -289,6 +289,7 @@ struct xarray { void xa_init_flags(struct xarray *, gfp_t flags); void *xa_load(struct xarray *, unsigned long index); void *xa_store(struct xarray *, unsigned long index, void *entry, gfp_t); +void *xa_erase(struct xarray *, unsigned long index); void *xa_store_range(struct xarray *, unsigned long first, unsigned long last, void *entry, gfp_t); bool xa_get_mark(struct xarray *, unsigned long index, xa_mark_t); @@ -340,23 +341,6 @@ static inline bool xa_marked(const struct xarray *xa, xa_mark_t mark) return xa->xa_flags & XA_FLAGS_MARK(mark); } -/** - * xa_erase() - Erase this entry from the XArray. - * @xa: XArray. - * @index: Index of entry. - * - * This function is the equivalent of calling xa_store() with %NULL as - * the third argument. The XArray does not need to allocate memory, so - * the user does not need to provide GFP flags. - * - * Context: Process context. Takes and releases the xa_lock. - * Return: The entry which used to be at this index. - */ -static inline void *xa_erase(struct xarray *xa, unsigned long index) -{ - return xa_store(xa, index, NULL, 0); -} - /** * xa_for_each() - Iterate over a portion of an XArray. * @xa: XArray. diff --git a/lib/xarray.c b/lib/xarray.c index 77671d4a7910..b55aa8c1c20f 100644 --- a/lib/xarray.c +++ b/lib/xarray.c @@ -1336,6 +1336,30 @@ void *__xa_erase(struct xarray *xa, unsigned long index) } EXPORT_SYMBOL(__xa_erase); +/** + * xa_erase() - Erase this entry from the XArray. + * @xa: XArray. + * @index: Index of entry. + * + * This function is the equivalent of calling xa_store() with %NULL as + * the third argument. The XArray does not need to allocate memory, so + * the user does not need to provide GFP flags. + * + * Context: Any context. Takes and releases the xa_lock. + * Return: The entry which used to be at this index. + */ +void *xa_erase(struct xarray *xa, unsigned long index) +{ + void *entry; + + xa_lock(xa); + entry = __xa_erase(xa, index); + xa_unlock(xa); + + return entry; +} +EXPORT_SYMBOL(xa_erase); + /** * xa_store() - Store this entry in the XArray. * @xa: XArray. 
-- cgit v1.2.3-71-gd317 From 84e5acb76dacb8ebd648a86a53907ce0dd616534 Mon Sep 17 00:00:00 2001 From: Matthew Wilcox Date: Fri, 26 Oct 2018 14:41:29 -0400 Subject: XArray: Add xa_store_bh() and xa_store_irq() These convenience wrappers disable interrupts while taking the spinlock. A number of drivers would otherwise have to open-code these functions. Signed-off-by: Matthew Wilcox --- Documentation/core-api/xarray.rst | 5 +++- include/linux/xarray.h | 52 +++++++++++++++++++++++++++++++++++++++ 2 files changed, 56 insertions(+), 1 deletion(-) (limited to 'include/linux') diff --git a/Documentation/core-api/xarray.rst b/Documentation/core-api/xarray.rst index 65c77a81b689..8a6e2087de77 100644 --- a/Documentation/core-api/xarray.rst +++ b/Documentation/core-api/xarray.rst @@ -167,6 +167,8 @@ Takes RCU read lock: Takes xa_lock internally: * :c:func:`xa_store` + * :c:func:`xa_store_bh` + * :c:func:`xa_store_irq` * :c:func:`xa_insert` * :c:func:`xa_erase` * :c:func:`xa_erase_bh` @@ -247,7 +249,8 @@ Sharing the XArray with interrupt context is also possible, either using :c:func:`xa_lock_irqsave` in both the interrupt handler and process context, or :c:func:`xa_lock_irq` in process context and :c:func:`xa_lock` in the interrupt handler. Some of the more common patterns have helper -functions such as :c:func:`xa_erase_bh` and :c:func:`xa_erase_irq`. +functions such as :c:func:`xa_store_bh`, :c:func:`xa_store_irq`, +:c:func:`xa_erase_bh` and :c:func:`xa_erase_irq`. Sometimes you need to protect access to the XArray with a mutex because that lock sits above another mutex in the locking hierarchy. That does diff --git a/include/linux/xarray.h b/include/linux/xarray.h index 4c839c17a99b..52d9732e4ec4 100644 --- a/include/linux/xarray.h +++ b/include/linux/xarray.h @@ -426,6 +426,58 @@ static inline int __xa_insert(struct xarray *xa, unsigned long index, return -EEXIST; } +/** + * xa_store_bh() - Store this entry in the XArray. + * @xa: XArray. + * @index: Index into array. + * @entry: New entry. + * @gfp: Memory allocation flags. + * + * This function is like calling xa_store() except it disables softirqs + * while holding the array lock. + * + * Context: Any context. Takes and releases the xa_lock while + * disabling softirqs. + * Return: The entry which used to be at this index. + */ +static inline void *xa_store_bh(struct xarray *xa, unsigned long index, + void *entry, gfp_t gfp) +{ + void *curr; + + xa_lock_bh(xa); + curr = __xa_store(xa, index, entry, gfp); + xa_unlock_bh(xa); + + return curr; +} + +/** + * xa_store_irq() - Erase this entry from the XArray. + * @xa: XArray. + * @index: Index into array. + * @entry: New entry. + * @gfp: Memory allocation flags. + * + * This function is like calling xa_store() except it disables interrupts + * while holding the array lock. + * + * Context: Process context. Takes and releases the xa_lock while + * disabling interrupts. + * Return: The entry which used to be at this index. + */ +static inline void *xa_store_irq(struct xarray *xa, unsigned long index, + void *entry, gfp_t gfp) +{ + void *curr; + + xa_lock_irq(xa); + curr = __xa_store(xa, index, entry, gfp); + xa_unlock_irq(xa); + + return curr; +} + /** * xa_erase_bh() - Erase this entry from the XArray. * @xa: XArray. -- cgit v1.2.3-71-gd317 From 804dfaf01bcc9daa4298c608ba9018abf616ec48 Mon Sep 17 00:00:00 2001 From: Matthew Wilcox Date: Mon, 5 Nov 2018 16:37:15 -0500 Subject: XArray: Fix Documentation Minor fixes. 
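The documentation change below spells out that a never-stored entry, an erased entry and an explicit NULL store are indistinguishable to readers; a short illustrative sequence, assuming DEFINE_XARRAY(xa):

	xa_store(&xa, 0, NULL, GFP_KERNEL);	/* explicit NULL store */
	xa_erase(&xa, 1);			/* erase; nothing was there */
						/* index 2 was never touched */

	/* All three indices load as NULL and the array is still empty. */
	WARN_ON(xa_load(&xa, 0) || xa_load(&xa, 1) || xa_load(&xa, 2));
	WARN_ON(!xa_empty(&xa));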
Signed-off-by: Matthew Wilcox --- Documentation/core-api/xarray.rst | 6 +++++- include/linux/xarray.h | 4 ++-- lib/xarray.c | 10 +++++----- 3 files changed, 12 insertions(+), 8 deletions(-) (limited to 'include/linux') diff --git a/Documentation/core-api/xarray.rst b/Documentation/core-api/xarray.rst index 616ac406bf86..dbe96cb5558e 100644 --- a/Documentation/core-api/xarray.rst +++ b/Documentation/core-api/xarray.rst @@ -74,7 +74,8 @@ using :c:func:`xa_load`. xa_store will overwrite any entry with the new entry and return the previous entry stored at that index. You can use :c:func:`xa_erase` instead of calling :c:func:`xa_store` with a ``NULL`` entry. There is no difference between an entry that has never -been stored to and one that has most recently had ``NULL`` stored to it. +been stored to, one that has been erased and one that has most recently +had ``NULL`` stored to it. You can conditionally replace an entry at an index by using :c:func:`xa_cmpxchg`. Like :c:func:`cmpxchg`, it will only succeed if @@ -114,6 +115,9 @@ unused entry. If another user has stored to the entry in the meantime, :c:func:`xa_release` will do nothing; if instead you want the entry to become ``NULL``, you should use :c:func:`xa_erase`. +If all entries in the array are ``NULL``, the :c:func:`xa_empty` function +will return ``true``. + Finally, you can remove all entries from an XArray by calling :c:func:`xa_destroy`. If the XArray entries are pointers, you may wish to free the entries first. You can do this by iterating over all present diff --git a/include/linux/xarray.h b/include/linux/xarray.h index 52d9732e4ec4..564892e19f8c 100644 --- a/include/linux/xarray.h +++ b/include/linux/xarray.h @@ -487,7 +487,7 @@ static inline void *xa_store_irq(struct xarray *xa, unsigned long index, * the third argument. The XArray does not need to allocate memory, so * the user does not need to provide GFP flags. * - * Context: Process context. Takes and releases the xa_lock while + * Context: Any context. Takes and releases the xa_lock while * disabling softirqs. * Return: The entry which used to be at this index. */ @@ -622,7 +622,7 @@ static inline int xa_alloc(struct xarray *xa, u32 *id, u32 max, void *entry, * Updates the @id pointer with the index, then stores the entry at that * index. A concurrent lookup will not see an uninitialised @id. * - * Context: Process context. Takes and releases the xa_lock while + * Context: Any context. Takes and releases the xa_lock while * disabling softirqs. May sleep if the @gfp flags permit. * Return: 0 on success, -ENOMEM if memory allocation fails or -ENOSPC if * there is no more space in the XArray. diff --git a/lib/xarray.c b/lib/xarray.c index c3e2084aa313..7946380cd6c9 100644 --- a/lib/xarray.c +++ b/lib/xarray.c @@ -610,8 +610,8 @@ static int xas_expand(struct xa_state *xas, void *head) * (see the xa_cmpxchg() implementation for an example). * * Return: If the slot already existed, returns the contents of this slot. - * If the slot was newly created, returns NULL. If it failed to create the - * slot, returns NULL and indicates the error in @xas. + * If the slot was newly created, returns %NULL. If it failed to create the + * slot, returns %NULL and indicates the error in @xas. */ static void *xas_create(struct xa_state *xas) { @@ -1640,7 +1640,7 @@ EXPORT_SYMBOL(__xa_alloc); * @index: Index of entry. * @mark: Mark number. * - * Attempting to set a mark on a NULL entry does not succeed. + * Attempting to set a mark on a %NULL entry does not succeed. * * Context: Any context. 
Expects xa_lock to be held on entry. */ @@ -1710,7 +1710,7 @@ EXPORT_SYMBOL(xa_get_mark); * @index: Index of entry. * @mark: Mark number. * - * Attempting to set a mark on a NULL entry does not succeed. + * Attempting to set a mark on a %NULL entry does not succeed. * * Context: Process context. Takes and releases the xa_lock. */ @@ -1879,7 +1879,7 @@ static unsigned int xas_extract_marked(struct xa_state *xas, void **dst, * * The @filter may be an XArray mark value, in which case entries which are * marked with that mark will be copied. It may also be %XA_PRESENT, in - * which case all entries which are not NULL will be copied. + * which case all entries which are not %NULL will be copied. * * The entries returned may not represent a snapshot of the XArray at a * moment in time. For example, if another thread stores to index 5, then -- cgit v1.2.3-71-gd317 From 8bd66d147c88bd441178c7b4c774ae5a185f19b8 Mon Sep 17 00:00:00 2001 From: "ndesaulniers@google.com" Date: Wed, 31 Oct 2018 12:39:01 -0700 Subject: include/linux/compiler*.h: define asm_volatile_goto asm_volatile_goto should also be defined for other compilers that support asm goto. Fixes commit 815f0ddb346c ("include/linux/compiler*.h: make compiler-*.h mutually exclusive"). Signed-off-by: Nick Desaulniers Signed-off-by: Miguel Ojeda --- include/linux/compiler_types.h | 4 ++++ 1 file changed, 4 insertions(+) (limited to 'include/linux') diff --git a/include/linux/compiler_types.h b/include/linux/compiler_types.h index 3439d7d0249a..4a3f9c09c92d 100644 --- a/include/linux/compiler_types.h +++ b/include/linux/compiler_types.h @@ -130,6 +130,10 @@ struct ftrace_likely_data { # define randomized_struct_fields_end #endif +#ifndef asm_volatile_goto +#define asm_volatile_goto(x...) asm goto(x) +#endif + /* Are two types/vars the same type (ignoring qualifiers)? */ #define __same_type(a, b) __builtin_types_compatible_p(typeof(a), typeof(b)) -- cgit v1.2.3-71-gd317 From 98ee3fc7ef8395f8b7a379e6608aee91efc66d48 Mon Sep 17 00:00:00 2001 From: Boris Brezillon Date: Tue, 6 Nov 2018 17:25:37 +0100 Subject: mtd: nand: Fix nanddev_pos_next_page() kernel-doc header Function name is wrong in the kernel-doc header. 
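A hedged sketch of what the asm_volatile_goto() fallback in the previous patch enables: branching from inline assembly to a C label, the mechanism underlying static keys (the function and the x86 mnemonic are illustrative, not from the patch):

    static inline bool feature_enabled(void)
    {
            /* The fourth colon section lists C labels the asm may jump to. */
            asm_volatile_goto("jmp %l[l_no]" : : : : l_no);
            return true;
    l_no:
            return false;
    }

Without the common definition, compilers other than gcc that do support asm goto would fail to build any code spelled with the kernel's asm_volatile_goto() macro.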
Fixes: 9c3736a3de21 ("mtd: nand: Add core infrastructure to deal with NAND devices") Signed-off-by: Boris Brezillon Reviewed-by: Miquel Raynal --- include/linux/mtd/nand.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'include/linux') diff --git a/include/linux/mtd/nand.h b/include/linux/mtd/nand.h index 78b86dea2f29..7f53ece2c039 100644 --- a/include/linux/mtd/nand.h +++ b/include/linux/mtd/nand.h @@ -568,7 +568,7 @@ static inline void nanddev_pos_next_eraseblock(struct nand_device *nand, } /** - * nanddev_pos_next_eraseblock() - Move a position to the next page + * nanddev_pos_next_page() - Move a position to the next page * @nand: NAND device * @pos: the position to update * -- cgit v1.2.3-71-gd317 From 81bd415c91eb966118d773dddf254aebf3022411 Mon Sep 17 00:00:00 2001 From: Mathieu Malaterre Date: Wed, 6 Jun 2018 21:42:32 +0200 Subject: watchdog/core: Add missing prototypes for weak functions MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit The split out of the hard lockup detector exposed two new weak functions, but no prototypes for them, which triggers the build warning: kernel/watchdog.c:109:12: warning: no previous prototype for ‘watchdog_nmi_enable’ [-Wmissing-prototypes] kernel/watchdog.c:115:13: warning: no previous prototype for ‘watchdog_nmi_disable’ [-Wmissing-prototypes] Add the prototypes. Fixes: 73ce0511c436 ("kernel/watchdog.c: move hardlockup detector to separate file") Signed-off-by: Mathieu Malaterre Signed-off-by: Thomas Gleixner Cc: Babu Moger Cc: stable@vger.kernel.org Link: https://lkml.kernel.org/r/20180606194232.17653-1-malat@debian.org --- include/linux/nmi.h | 2 ++ 1 file changed, 2 insertions(+) (limited to 'include/linux') diff --git a/include/linux/nmi.h b/include/linux/nmi.h index 08f9247e9827..9003e29cde46 100644 --- a/include/linux/nmi.h +++ b/include/linux/nmi.h @@ -119,6 +119,8 @@ static inline int hardlockup_detector_perf_init(void) { return 0; } void watchdog_nmi_stop(void); void watchdog_nmi_start(void); int watchdog_nmi_probe(void); +int watchdog_nmi_enable(unsigned int cpu); +void watchdog_nmi_disable(unsigned int cpu); /** * touch_nmi_watchdog - restart NMI watchdog timeout. -- cgit v1.2.3-71-gd317 From 99b77fef3c6c69bb7664f1ac97a69d5b17968dae Mon Sep 17 00:00:00 2001 From: Yishai Hadas Date: Wed, 31 Oct 2018 12:20:28 +0200 Subject: net/mlx5: Fix XRC SRQ umem valid bits Adapt XRC SRQ to the latest HW specification with fixed definition around umem valid bits. The previous definition relied on a bit which was taken for other purposes in legacy FW. 
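A hedged sketch of how a caller might set the relocated bits through the existing MLX5_SET()/MLX5_ADDR_OF() accessors (the buffer name `in` is hypothetical; the struct and field names are taken from the hunk below):

    /* Offsets are derived from the mlx5_ifc bit layout at compile time,
     * so correcting the structure definition moves these writes with it.
     */
    void *xrc_srqc;

    MLX5_SET(create_xrc_srq_in, in, xrc_srq_umem_valid, 1);
    xrc_srqc = MLX5_ADDR_OF(create_xrc_srq_in, in, xrc_srq_context_entry);
    MLX5_SET(xrc_srqc, xrc_srqc, dbr_umem_valid, 1);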
Fixes: bd37197554eb ("net/mlx5: Update mlx5_ifc with DEVX UID bits") Signed-off-by: Yishai Hadas Reviewed-by: Artemy Kovalyov Signed-off-by: Leon Romanovsky --- include/linux/mlx5/mlx5_ifc.h | 12 ++++++++---- 1 file changed, 8 insertions(+), 4 deletions(-) (limited to 'include/linux') diff --git a/include/linux/mlx5/mlx5_ifc.h b/include/linux/mlx5/mlx5_ifc.h index dbff9ff28f2c..34e17e6f8942 100644 --- a/include/linux/mlx5/mlx5_ifc.h +++ b/include/linux/mlx5/mlx5_ifc.h @@ -2473,14 +2473,15 @@ struct mlx5_ifc_xrc_srqc_bits { u8 wq_signature[0x1]; u8 cont_srq[0x1]; - u8 dbr_umem_valid[0x1]; + u8 reserved_at_22[0x1]; u8 rlky[0x1]; u8 basic_cyclic_rcv_wqe[0x1]; u8 log_rq_stride[0x3]; u8 xrcd[0x18]; u8 page_offset[0x6]; - u8 reserved_at_46[0x2]; + u8 reserved_at_46[0x1]; + u8 dbr_umem_valid[0x1]; u8 cqn[0x18]; u8 reserved_at_60[0x20]; @@ -6689,9 +6690,12 @@ struct mlx5_ifc_create_xrc_srq_in_bits { struct mlx5_ifc_xrc_srqc_bits xrc_srq_context_entry; - u8 reserved_at_280[0x40]; + u8 reserved_at_280[0x60]; + u8 xrc_srq_umem_valid[0x1]; - u8 reserved_at_2c1[0x5bf]; + u8 reserved_at_2e1[0x1f]; + + u8 reserved_at_300[0x580]; u8 pas[0][0x40]; }; -- cgit v1.2.3-71-gd317 From 781f0766cc41a9dd2e5d118ef4b1d5d89430257b Mon Sep 17 00:00:00 2001 From: Kai-Heng Feng Date: Fri, 19 Oct 2018 16:14:50 +0800 Subject: USB: Wait for extra delay time after USB_PORT_FEAT_RESET for quirky hub Devices connected under Terminus Technology Inc. Hub (1a40:0101) may fail to work after the system resumes from suspend: [ 206.063325] usb 3-2.4: reset full-speed USB device number 4 using xhci_hcd [ 206.143691] usb 3-2.4: device descriptor read/64, error -32 [ 206.351671] usb 3-2.4: device descriptor read/64, error -32 Info for this hub: T: Bus=03 Lev=01 Prnt=01 Port=01 Cnt=01 Dev#= 2 Spd=480 MxCh= 4 D: Ver= 2.00 Cls=09(hub ) Sub=00 Prot=01 MxPS=64 #Cfgs= 1 P: Vendor=1a40 ProdID=0101 Rev=01.11 S: Product=USB 2.0 Hub C: #Ifs= 1 Cfg#= 1 Atr=e0 MxPwr=100mA I: If#= 0 Alt= 0 #EPs= 1 Cls=09(hub ) Sub=00 Prot=00 Driver=hub Some expirements indicate that the USB devices connected to the hub are innocent, it's the hub itself is to blame. The hub needs extra delay time after it resets its port. Hence wait for extra delay, if the device is connected to this quirky hub. 
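Until a kernel carrying this quirk table entry is available, the same behaviour can be requested at boot time through the new 'o' flag documented in the kernel-parameters hunk below; for the hub in question:

    usbcore.quirks=1a40:0101:o

This sets USB_QUIRK_HUB_SLOW_RESET and adds the extra 100 ms of reset recovery time in hub_port_reset().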
Signed-off-by: Kai-Heng Feng Cc: stable Acked-by: Alan Stern Signed-off-by: Greg Kroah-Hartman --- Documentation/admin-guide/kernel-parameters.txt | 2 ++ drivers/usb/core/hub.c | 14 +++++++++++--- drivers/usb/core/quirks.c | 6 ++++++ include/linux/usb/quirks.h | 3 +++ 4 files changed, 22 insertions(+), 3 deletions(-) (limited to 'include/linux') diff --git a/Documentation/admin-guide/kernel-parameters.txt b/Documentation/admin-guide/kernel-parameters.txt index 81d1d5a74728..19f4423e70d9 100644 --- a/Documentation/admin-guide/kernel-parameters.txt +++ b/Documentation/admin-guide/kernel-parameters.txt @@ -4713,6 +4713,8 @@ prevent spurious wakeup); n = USB_QUIRK_DELAY_CTRL_MSG (Device needs a pause after every control message); + o = USB_QUIRK_HUB_SLOW_RESET (Hub needs extra + delay after resetting its port); Example: quirks=0781:5580:bk,0a5c:5834:gij usbhid.mousepoll= diff --git a/drivers/usb/core/hub.c b/drivers/usb/core/hub.c index c6077d582d29..d9bd7576786a 100644 --- a/drivers/usb/core/hub.c +++ b/drivers/usb/core/hub.c @@ -2794,6 +2794,7 @@ static int hub_port_reset(struct usb_hub *hub, int port1, int i, status; u16 portchange, portstatus; struct usb_port *port_dev = hub->ports[port1 - 1]; + int reset_recovery_time; if (!hub_is_superspeed(hub->hdev)) { if (warm) { @@ -2885,11 +2886,18 @@ static int hub_port_reset(struct usb_hub *hub, int port1, done: if (status == 0) { - /* TRSTRCY = 10 ms; plus some extra */ if (port_dev->quirks & USB_PORT_QUIRK_FAST_ENUM) usleep_range(10000, 12000); - else - msleep(10 + 40); + else { + /* TRSTRCY = 10 ms; plus some extra */ + reset_recovery_time = 10 + 40; + + /* Hub needs extra delay after resetting its port. */ + if (hub->hdev->quirks & USB_QUIRK_HUB_SLOW_RESET) + reset_recovery_time += 100; + + msleep(reset_recovery_time); + } if (udev) { struct usb_hcd *hcd = bus_to_hcd(udev->bus); diff --git a/drivers/usb/core/quirks.c b/drivers/usb/core/quirks.c index 178d6c6063c0..4d7d948eae63 100644 --- a/drivers/usb/core/quirks.c +++ b/drivers/usb/core/quirks.c @@ -128,6 +128,9 @@ static int quirks_param_set(const char *val, const struct kernel_param *kp) case 'n': flags |= USB_QUIRK_DELAY_CTRL_MSG; break; + case 'o': + flags |= USB_QUIRK_HUB_SLOW_RESET; + break; /* Ignore unrecognized flag characters */ } } @@ -380,6 +383,9 @@ static const struct usb_device_id usb_quirk_list[] = { { USB_DEVICE(0x1a0a, 0x0200), .driver_info = USB_QUIRK_LINEAR_UFRAME_INTR_BINTERVAL }, + /* Terminus Technology Inc. Hub */ + { USB_DEVICE(0x1a40, 0x0101), .driver_info = USB_QUIRK_HUB_SLOW_RESET }, + /* Corsair K70 RGB */ { USB_DEVICE(0x1b1c, 0x1b13), .driver_info = USB_QUIRK_DELAY_INIT }, diff --git a/include/linux/usb/quirks.h b/include/linux/usb/quirks.h index b7a99ce56bc9..a1be64c9940f 100644 --- a/include/linux/usb/quirks.h +++ b/include/linux/usb/quirks.h @@ -66,4 +66,7 @@ /* Device needs a pause after every control message. */ #define USB_QUIRK_DELAY_CTRL_MSG BIT(13) +/* Hub needs extra delay after resetting its port. */ +#define USB_QUIRK_HUB_SLOW_RESET BIT(14) + #endif /* __LINUX_USB_QUIRKS_H */ -- cgit v1.2.3-71-gd317 From 24efee412c75843755da0ddd7bbc2db2ce9129f5 Mon Sep 17 00:00:00 2001 From: Miguel Ojeda Date: Tue, 6 Nov 2018 22:52:11 +0100 Subject: Compiler Attributes: improve explanation of header Explain better what "optional" attributes are, and avoid calling them so to avoid confusion. Simply retain "Optional" as a word to look for in the comments. Moreover, add a couple sentences to explain a bit more the intention and the documentation links. 
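A hedged example of the convention the revised comment describes, mirroring an entry that already exists in the file (version bound and link follow the in-file comment style):

    /*
     * Optional: only supported since gcc >= 5.1
     *
     *  gcc: https://gcc.gnu.org/onlinedocs/gcc/Common-Type-Attributes.html#index-designated_005finit-type-attribute
     */
    #if __has_attribute(__designated_init__)
    # define __designated_init  __attribute__((__designated_init__))
    #else
    # define __designated_init
    #endif

Attributes that every supported compiler understands skip the guard and become plain one-line defines.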
Signed-off-by: Miguel Ojeda --- include/linux/compiler_attributes.h | 14 +++++++++----- 1 file changed, 9 insertions(+), 5 deletions(-) (limited to 'include/linux') diff --git a/include/linux/compiler_attributes.h b/include/linux/compiler_attributes.h index 6b28c1b7310c..f8c400ba1929 100644 --- a/include/linux/compiler_attributes.h +++ b/include/linux/compiler_attributes.h @@ -4,22 +4,26 @@ /* * The attributes in this file are unconditionally defined and they directly - * map to compiler attribute(s) -- except those that are optional. + * map to compiler attribute(s), unless one of the compilers does not support + * the attribute. In that case, __has_attribute is used to check for support + * and the reason is stated in its comment ("Optional: ..."). * * Any other "attributes" (i.e. those that depend on a configuration option, * on a compiler, on an architecture, on plugins, on other attributes...) * should be defined elsewhere (e.g. compiler_types.h or compiler-*.h). + * The intention is to keep this file as simple as possible, as well as + * compiler- and version-agnostic (e.g. avoiding GCC_VERSION checks). * * This file is meant to be sorted (by actual attribute name, * not by #define identifier). Use the __attribute__((__name__)) syntax * (i.e. with underscores) to avoid future collisions with other macros. - * If an attribute is optional, state the reason in the comment. + * Provide links to the documentation of each supported compiler, if it exists. */ /* - * To check for optional attributes, we use __has_attribute, which is supported - * on gcc >= 5, clang >= 2.9 and icc >= 17. In the meantime, to support - * 4.6 <= gcc < 5, we implement __has_attribute by hand. + * __has_attribute is supported on gcc >= 5, clang >= 2.9 and icc >= 17. + * In the meantime, to support 4.6 <= gcc < 5, we implement __has_attribute + * by hand. * * sparse does not support __has_attribute (yet) and defines __GNUC_MINOR__ * depending on the compiler used to build it; however, these attributes have -- cgit v1.2.3-71-gd317 From 23c625ce3065e40c933a4239efb9b11f1194a343 Mon Sep 17 00:00:00 2001 From: Ilya Dryomov Date: Thu, 8 Nov 2018 14:55:21 +0100 Subject: libceph: assume argonaut on the server side No one is running pre-argonaut. In addition one of the argonaut features (NOSRCADDR) has been required since day one (and a half, 2.6.34 vs 2.6.35) of the kernel client. Allow for the possibility of reusing these feature bits later. 
Signed-off-by: Ilya Dryomov Reviewed-by: Sage Weil --- fs/ceph/mds_client.c | 12 +++--------- include/linux/ceph/ceph_features.h | 8 +------- 2 files changed, 4 insertions(+), 16 deletions(-) (limited to 'include/linux') diff --git a/fs/ceph/mds_client.c b/fs/ceph/mds_client.c index 67a9aeb2f4ec..bd13a3267ae0 100644 --- a/fs/ceph/mds_client.c +++ b/fs/ceph/mds_client.c @@ -80,12 +80,8 @@ static int parse_reply_info_in(void **p, void *end, info->symlink = *p; *p += info->symlink_len; - if (features & CEPH_FEATURE_DIRLAYOUTHASH) - ceph_decode_copy_safe(p, end, &info->dir_layout, - sizeof(info->dir_layout), bad); - else - memset(&info->dir_layout, 0, sizeof(info->dir_layout)); - + ceph_decode_copy_safe(p, end, &info->dir_layout, + sizeof(info->dir_layout), bad); ceph_decode_32_safe(p, end, info->xattr_len, bad); ceph_decode_need(p, end, info->xattr_len, bad); info->xattr_data = *p; @@ -3182,10 +3178,8 @@ static void send_mds_reconnect(struct ceph_mds_client *mdsc, recon_state.pagelist = pagelist; if (session->s_con.peer_features & CEPH_FEATURE_MDSENC) recon_state.msg_version = 3; - else if (session->s_con.peer_features & CEPH_FEATURE_FLOCK) - recon_state.msg_version = 2; else - recon_state.msg_version = 1; + recon_state.msg_version = 2; err = iterate_session_caps(session, encode_caps_cb, &recon_state); if (err < 0) goto fail; diff --git a/include/linux/ceph/ceph_features.h b/include/linux/ceph/ceph_features.h index 6b92b3395fa9..65a38c4a02a1 100644 --- a/include/linux/ceph/ceph_features.h +++ b/include/linux/ceph/ceph_features.h @@ -213,12 +213,6 @@ DEFINE_CEPH_FEATURE_DEPRECATED(63, 1, RESERVED_BROKEN, LUMINOUS) // client-facin CEPH_FEATURE_NEW_OSDOPREPLY_ENCODING | \ CEPH_FEATURE_CEPHX_V2) -#define CEPH_FEATURES_REQUIRED_DEFAULT \ - (CEPH_FEATURE_NOSRCADDR | \ - CEPH_FEATURE_SUBSCRIBE2 | \ - CEPH_FEATURE_RECONNECT_SEQ | \ - CEPH_FEATURE_PGID64 | \ - CEPH_FEATURE_PGPOOL3 | \ - CEPH_FEATURE_OSDENC) +#define CEPH_FEATURES_REQUIRED_DEFAULT 0 #endif -- cgit v1.2.3-71-gd317 From a4310fa2f24687888ce80fdb0e88583561a23700 Mon Sep 17 00:00:00 2001 From: Marc Kleine-Budde Date: Wed, 31 Oct 2018 10:37:46 +0100 Subject: can: dev: can_get_echo_skb(): factor out non sending code to __can_get_echo_skb() This patch factors out all non sending parts of can_get_echo_skb() into a seperate function __can_get_echo_skb(), so that it can be re-used in an upcoming patch. Cc: linux-stable Signed-off-by: Marc Kleine-Budde --- drivers/net/can/dev.c | 36 +++++++++++++++++++++++++----------- include/linux/can/dev.h | 1 + 2 files changed, 26 insertions(+), 11 deletions(-) (limited to 'include/linux') diff --git a/drivers/net/can/dev.c b/drivers/net/can/dev.c index 49163570a63a..80530ab37b1e 100644 --- a/drivers/net/can/dev.c +++ b/drivers/net/can/dev.c @@ -477,14 +477,7 @@ void can_put_echo_skb(struct sk_buff *skb, struct net_device *dev, } EXPORT_SYMBOL_GPL(can_put_echo_skb); -/* - * Get the skb from the stack and loop it back locally - * - * The function is typically called when the TX done interrupt - * is handled in the device driver. The driver must protect - * access to priv->echo_skb, if necessary. 
- */ -unsigned int can_get_echo_skb(struct net_device *dev, unsigned int idx) +struct sk_buff *__can_get_echo_skb(struct net_device *dev, unsigned int idx, u8 *len_ptr) { struct can_priv *priv = netdev_priv(dev); @@ -495,13 +488,34 @@ unsigned int can_get_echo_skb(struct net_device *dev, unsigned int idx) struct can_frame *cf = (struct can_frame *)skb->data; u8 dlc = cf->can_dlc; - netif_rx(priv->echo_skb[idx]); + *len_ptr = dlc; priv->echo_skb[idx] = NULL; - return dlc; + return skb; } - return 0; + return NULL; +} + +/* + * Get the skb from the stack and loop it back locally + * + * The function is typically called when the TX done interrupt + * is handled in the device driver. The driver must protect + * access to priv->echo_skb, if necessary. + */ +unsigned int can_get_echo_skb(struct net_device *dev, unsigned int idx) +{ + struct sk_buff *skb; + u8 len; + + skb = __can_get_echo_skb(dev, idx, &len); + if (!skb) + return 0; + + netif_rx(skb); + + return len; } EXPORT_SYMBOL_GPL(can_get_echo_skb); diff --git a/include/linux/can/dev.h b/include/linux/can/dev.h index a83e1f632eb7..f01623aef2f7 100644 --- a/include/linux/can/dev.h +++ b/include/linux/can/dev.h @@ -169,6 +169,7 @@ void can_change_state(struct net_device *dev, struct can_frame *cf, void can_put_echo_skb(struct sk_buff *skb, struct net_device *dev, unsigned int idx); +struct sk_buff *__can_get_echo_skb(struct net_device *dev, unsigned int idx, u8 *len_ptr); unsigned int can_get_echo_skb(struct net_device *dev, unsigned int idx); void can_free_echo_skb(struct net_device *dev, unsigned int idx); -- cgit v1.2.3-71-gd317 From 55059f2b7f868cd43b3ad30e28e18347e1b46ace Mon Sep 17 00:00:00 2001 From: Oleksij Rempel Date: Tue, 18 Sep 2018 11:40:38 +0200 Subject: can: rx-offload: introduce can_rx_offload_get_echo_skb() and can_rx_offload_queue_sorted() functions Current CAN framework can't guarantee proper/chronological order of RX and TX-ECHO messages. To make this possible, drivers should use this functions instead of can_get_echo_skb(). 
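A hedged sketch of the intended call site in a driver's TX-complete path (the priv layout, register read and index handling are hypothetical; the helper itself comes from this patch):

    static void my_can_tx_done_irq(struct my_can_priv *priv, unsigned int idx)
    {
            u32 timestamp = my_can_read_hw_timestamp(priv); /* hypothetical */
            struct net_device_stats *stats = &priv->offload.dev->stats;

            /* Queue the echo skb sorted by hardware timestamp instead of
             * looping it back immediately, preserving RX/TX-ECHO order.
             */
            stats->tx_packets++;
            stats->tx_bytes += can_rx_offload_get_echo_skb(&priv->offload,
                                                           idx, timestamp);
    }

The return value is the echoed frame's data length (0 if the echo slot was empty), keeping the can_get_echo_skb() convention.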
Signed-off-by: Oleksij Rempel Cc: linux-stable Signed-off-by: Marc Kleine-Budde --- drivers/net/can/rx-offload.c | 46 ++++++++++++++++++++++++++++++++++++++++++ include/linux/can/rx-offload.h | 4 ++++ 2 files changed, 50 insertions(+) (limited to 'include/linux') diff --git a/drivers/net/can/rx-offload.c b/drivers/net/can/rx-offload.c index c7d05027a7a0..c368686e2164 100644 --- a/drivers/net/can/rx-offload.c +++ b/drivers/net/can/rx-offload.c @@ -211,6 +211,52 @@ int can_rx_offload_irq_offload_fifo(struct can_rx_offload *offload) } EXPORT_SYMBOL_GPL(can_rx_offload_irq_offload_fifo); +int can_rx_offload_queue_sorted(struct can_rx_offload *offload, + struct sk_buff *skb, u32 timestamp) +{ + struct can_rx_offload_cb *cb; + unsigned long flags; + + if (skb_queue_len(&offload->skb_queue) > + offload->skb_queue_len_max) + return -ENOMEM; + + cb = can_rx_offload_get_cb(skb); + cb->timestamp = timestamp; + + spin_lock_irqsave(&offload->skb_queue.lock, flags); + __skb_queue_add_sort(&offload->skb_queue, skb, can_rx_offload_compare); + spin_unlock_irqrestore(&offload->skb_queue.lock, flags); + + can_rx_offload_schedule(offload); + + return 0; +} +EXPORT_SYMBOL_GPL(can_rx_offload_queue_sorted); + +unsigned int can_rx_offload_get_echo_skb(struct can_rx_offload *offload, + unsigned int idx, u32 timestamp) +{ + struct net_device *dev = offload->dev; + struct net_device_stats *stats = &dev->stats; + struct sk_buff *skb; + u8 len; + int err; + + skb = __can_get_echo_skb(dev, idx, &len); + if (!skb) + return 0; + + err = can_rx_offload_queue_sorted(offload, skb, timestamp); + if (err) { + stats->rx_errors++; + stats->tx_fifo_errors++; + } + + return len; +} +EXPORT_SYMBOL_GPL(can_rx_offload_get_echo_skb); + int can_rx_offload_irq_queue_err_skb(struct can_rx_offload *offload, struct sk_buff *skb) { if (skb_queue_len(&offload->skb_queue) > diff --git a/include/linux/can/rx-offload.h b/include/linux/can/rx-offload.h index cb31683bbe15..01a7c9e5d8d8 100644 --- a/include/linux/can/rx-offload.h +++ b/include/linux/can/rx-offload.h @@ -41,6 +41,10 @@ int can_rx_offload_add_timestamp(struct net_device *dev, struct can_rx_offload * int can_rx_offload_add_fifo(struct net_device *dev, struct can_rx_offload *offload, unsigned int weight); int can_rx_offload_irq_offload_timestamp(struct can_rx_offload *offload, u64 reg); int can_rx_offload_irq_offload_fifo(struct can_rx_offload *offload); +int can_rx_offload_queue_sorted(struct can_rx_offload *offload, + struct sk_buff *skb, u32 timestamp); +unsigned int can_rx_offload_get_echo_skb(struct can_rx_offload *offload, + unsigned int idx, u32 timestamp); int can_rx_offload_irq_queue_err_skb(struct can_rx_offload *offload, struct sk_buff *skb); void can_rx_offload_reset(struct can_rx_offload *offload); void can_rx_offload_del(struct can_rx_offload *offload); -- cgit v1.2.3-71-gd317 From 4530ec36bb1e0d24f41c33229694adacda3d5d89 Mon Sep 17 00:00:00 2001 From: Oleksij Rempel Date: Tue, 18 Sep 2018 11:40:40 +0200 Subject: can: rx-offload: rename can_rx_offload_irq_queue_err_skb() to can_rx_offload_queue_tail() This function has nothing todo with error. 
Signed-off-by: Oleksij Rempel Cc: linux-stable Signed-off-by: Marc Kleine-Budde --- drivers/net/can/flexcan.c | 4 ++-- drivers/net/can/rx-offload.c | 5 +++-- include/linux/can/rx-offload.h | 3 ++- 3 files changed, 7 insertions(+), 5 deletions(-) (limited to 'include/linux') diff --git a/drivers/net/can/flexcan.c b/drivers/net/can/flexcan.c index 41a175f80c4b..5923bd0ec118 100644 --- a/drivers/net/can/flexcan.c +++ b/drivers/net/can/flexcan.c @@ -610,7 +610,7 @@ static void flexcan_irq_bus_err(struct net_device *dev, u32 reg_esr) if (tx_errors) dev->stats.tx_errors++; - can_rx_offload_irq_queue_err_skb(&priv->offload, skb); + can_rx_offload_queue_tail(&priv->offload, skb); } static void flexcan_irq_state(struct net_device *dev, u32 reg_esr) @@ -650,7 +650,7 @@ static void flexcan_irq_state(struct net_device *dev, u32 reg_esr) if (unlikely(new_state == CAN_STATE_BUS_OFF)) can_bus_off(dev); - can_rx_offload_irq_queue_err_skb(&priv->offload, skb); + can_rx_offload_queue_tail(&priv->offload, skb); } static inline struct flexcan_priv *rx_offload_to_priv(struct can_rx_offload *offload) diff --git a/drivers/net/can/rx-offload.c b/drivers/net/can/rx-offload.c index c368686e2164..2ce4fa8698c7 100644 --- a/drivers/net/can/rx-offload.c +++ b/drivers/net/can/rx-offload.c @@ -257,7 +257,8 @@ unsigned int can_rx_offload_get_echo_skb(struct can_rx_offload *offload, } EXPORT_SYMBOL_GPL(can_rx_offload_get_echo_skb); -int can_rx_offload_irq_queue_err_skb(struct can_rx_offload *offload, struct sk_buff *skb) +int can_rx_offload_queue_tail(struct can_rx_offload *offload, + struct sk_buff *skb) { if (skb_queue_len(&offload->skb_queue) > offload->skb_queue_len_max) @@ -268,7 +269,7 @@ int can_rx_offload_irq_queue_err_skb(struct can_rx_offload *offload, struct sk_b return 0; } -EXPORT_SYMBOL_GPL(can_rx_offload_irq_queue_err_skb); +EXPORT_SYMBOL_GPL(can_rx_offload_queue_tail); static int can_rx_offload_init_queue(struct net_device *dev, struct can_rx_offload *offload, unsigned int weight) { diff --git a/include/linux/can/rx-offload.h b/include/linux/can/rx-offload.h index 01a7c9e5d8d8..8268811a697e 100644 --- a/include/linux/can/rx-offload.h +++ b/include/linux/can/rx-offload.h @@ -45,7 +45,8 @@ int can_rx_offload_queue_sorted(struct can_rx_offload *offload, struct sk_buff *skb, u32 timestamp); unsigned int can_rx_offload_get_echo_skb(struct can_rx_offload *offload, unsigned int idx, u32 timestamp); -int can_rx_offload_irq_queue_err_skb(struct can_rx_offload *offload, struct sk_buff *skb); +int can_rx_offload_queue_tail(struct can_rx_offload *offload, + struct sk_buff *skb); void can_rx_offload_reset(struct can_rx_offload *offload); void can_rx_offload_del(struct can_rx_offload *offload); void can_rx_offload_enable(struct can_rx_offload *offload); -- cgit v1.2.3-71-gd317 From d19f9130b814d33c03118493c17454f7d90075d1 Mon Sep 17 00:00:00 2001 From: Elvira Khabirova Date: Sat, 10 Nov 2018 04:22:09 +0100 Subject: x86/ptrace: Fix documentation for tracehook_report_syscall_entry() tracehook_report_syscall_entry() is called not only if %TIF_SYSCALL_TRACE is set, but also if %TIF_SYSCALL_EMU is set, as appears from x86's entry code. Signed-off-by: Elvira Khabirova Cc: Borislav Petkov Cc: H. 
Peter Anvin Cc: Linus Torvalds Cc: Peter Zijlstra Cc: Thomas Gleixner Cc: ldv@altlinux.org Cc: oleg@redhat.com Cc: rostedt@goodmis.org Link: http://lkml.kernel.org/r/20181110042209.26333972@akathisia Signed-off-by: Ingo Molnar --- include/linux/tracehook.h | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) (limited to 'include/linux') diff --git a/include/linux/tracehook.h b/include/linux/tracehook.h index 40b0b4c1bf7b..df20f8bdbfa3 100644 --- a/include/linux/tracehook.h +++ b/include/linux/tracehook.h @@ -83,8 +83,8 @@ static inline int ptrace_report_syscall(struct pt_regs *regs) * tracehook_report_syscall_entry - task is about to attempt a system call * @regs: user register state of current task * - * This will be called if %TIF_SYSCALL_TRACE has been set, when the - * current task has just entered the kernel for a system call. + * This will be called if %TIF_SYSCALL_TRACE or %TIF_SYSCALL_EMU have been set, + * when the current task has just entered the kernel for a system call. * Full user register state is available here. Changing the values * in @regs can affect the system call number and arguments to be tried. * It is safe to block here, preventing the system call from beginning. -- cgit v1.2.3-71-gd317 From eff896288872d687d9662000ec9ae11b6d61766f Mon Sep 17 00:00:00 2001 From: Ard Biesheuvel Date: Wed, 14 Nov 2018 09:55:43 -0800 Subject: efi/arm: Defer persistent reservations until after paging_init() The new memory EFI reservation feature we introduced to allow memory reservations to persist across kexec may trigger an unbounded number of calls to memblock_reserve(). The memblock subsystem can deal with this fine, but not before memblock resizing is enabled, which we can only do after paging_init(), when the memory we reallocate the array into is actually mapped. So break out the memreserve table processing into a separate routine and call it after paging_init() on arm64. On ARM, because of limited reviewing bandwidth of the maintainer, we cannot currently fix this, so instead, disable the EFI persistent memreserve entirely on ARM so we can fix it later. 
Tested-by: Marc Zyngier Signed-off-by: Ard Biesheuvel Cc: Linus Torvalds Cc: Peter Zijlstra Cc: Thomas Gleixner Cc: linux-efi@vger.kernel.org Link: http://lkml.kernel.org/r/20181114175544.12860-5-ard.biesheuvel@linaro.org Signed-off-by: Ingo Molnar --- arch/arm64/kernel/setup.c | 1 + drivers/firmware/efi/efi.c | 4 ++++ drivers/firmware/efi/libstub/arm-stub.c | 3 +++ include/linux/efi.h | 7 +++++++ 4 files changed, 15 insertions(+) (limited to 'include/linux') diff --git a/arch/arm64/kernel/setup.c b/arch/arm64/kernel/setup.c index 953e316521fc..f4fc1e0544b7 100644 --- a/arch/arm64/kernel/setup.c +++ b/arch/arm64/kernel/setup.c @@ -313,6 +313,7 @@ void __init setup_arch(char **cmdline_p) arm64_memblock_init(); paging_init(); + efi_apply_persistent_mem_reservations(); acpi_table_upgrade(); diff --git a/drivers/firmware/efi/efi.c b/drivers/firmware/efi/efi.c index 249eb70691b0..72a4da76d274 100644 --- a/drivers/firmware/efi/efi.c +++ b/drivers/firmware/efi/efi.c @@ -592,7 +592,11 @@ int __init efi_config_parse_tables(void *config_tables, int count, int sz, early_memunmap(tbl, sizeof(*tbl)); } + return 0; +} +int __init efi_apply_persistent_mem_reservations(void) +{ if (efi.mem_reserve != EFI_INVALID_TABLE_ADDR) { unsigned long prsv = efi.mem_reserve; diff --git a/drivers/firmware/efi/libstub/arm-stub.c b/drivers/firmware/efi/libstub/arm-stub.c index 30ac0c975f8a..3d36142cf812 100644 --- a/drivers/firmware/efi/libstub/arm-stub.c +++ b/drivers/firmware/efi/libstub/arm-stub.c @@ -75,6 +75,9 @@ void install_memreserve_table(efi_system_table_t *sys_table_arg) efi_guid_t memreserve_table_guid = LINUX_EFI_MEMRESERVE_TABLE_GUID; efi_status_t status; + if (IS_ENABLED(CONFIG_ARM)) + return; + status = efi_call_early(allocate_pool, EFI_LOADER_DATA, sizeof(*rsv), (void **)&rsv); if (status != EFI_SUCCESS) { diff --git a/include/linux/efi.h b/include/linux/efi.h index 845174e113ce..100ce4a4aff6 100644 --- a/include/linux/efi.h +++ b/include/linux/efi.h @@ -1167,6 +1167,8 @@ static inline bool efi_enabled(int feature) extern void efi_reboot(enum reboot_mode reboot_mode, const char *__unused); extern bool efi_is_table_address(unsigned long phys_addr); + +extern int efi_apply_persistent_mem_reservations(void); #else static inline bool efi_enabled(int feature) { @@ -1185,6 +1187,11 @@ static inline bool efi_is_table_address(unsigned long phys_addr) { return false; } + +static inline int efi_apply_persistent_mem_reservations(void) +{ + return 0; +} #endif extern int efi_status_to_err(efi_status_t status); -- cgit v1.2.3-71-gd317 From 0145b50566e7de5637e80ecba96c7f0e6fff1aad Mon Sep 17 00:00:00 2001 From: Hans de Goede Date: Wed, 31 Oct 2018 15:20:05 +0100 Subject: iio/hid-sensors: Fix IIO_CHAN_INFO_RAW returning wrong values for signed numbers Before this commit sensor_hub_input_attr_get_raw_value() failed to take the signedness of 16 and 8 bit values into account, returning e.g. 65436 instead of -100 for the z-axis reading of an accelerometer. This commit adds a new is_signed parameter to the function and makes all callers pass the appropriate value for this. While at it, this commit also fixes up some neighboring lines where statements were needlessly split over 2 lines to improve readability. 
Signed-off-by: Hans de Goede Acked-by: Srinivas Pandruvada Acked-by: Benjamin Tissoires Cc: Signed-off-by: Jonathan Cameron --- drivers/hid/hid-sensor-custom.c | 2 +- drivers/hid/hid-sensor-hub.c | 13 ++++++++++--- drivers/iio/accel/hid-sensor-accel-3d.c | 5 ++++- drivers/iio/gyro/hid-sensor-gyro-3d.c | 5 ++++- drivers/iio/humidity/hid-sensor-humidity.c | 3 ++- drivers/iio/light/hid-sensor-als.c | 8 +++++--- drivers/iio/light/hid-sensor-prox.c | 8 +++++--- drivers/iio/magnetometer/hid-sensor-magn-3d.c | 8 +++++--- drivers/iio/orientation/hid-sensor-incl-3d.c | 8 +++++--- drivers/iio/pressure/hid-sensor-press.c | 8 +++++--- drivers/iio/temperature/hid-sensor-temperature.c | 3 ++- drivers/rtc/rtc-hid-sensor-time.c | 2 +- include/linux/hid-sensor-hub.h | 4 +++- 13 files changed, 52 insertions(+), 25 deletions(-) (limited to 'include/linux') diff --git a/drivers/hid/hid-sensor-custom.c b/drivers/hid/hid-sensor-custom.c index e8a114157f87..bb012bc032e0 100644 --- a/drivers/hid/hid-sensor-custom.c +++ b/drivers/hid/hid-sensor-custom.c @@ -358,7 +358,7 @@ static ssize_t show_value(struct device *dev, struct device_attribute *attr, sensor_inst->hsdev, sensor_inst->hsdev->usage, usage, report_id, - SENSOR_HUB_SYNC); + SENSOR_HUB_SYNC, false); } else if (!strncmp(name, "units", strlen("units"))) value = sensor_inst->fields[field_index].attribute.units; else if (!strncmp(name, "unit-expo", strlen("unit-expo"))) diff --git a/drivers/hid/hid-sensor-hub.c b/drivers/hid/hid-sensor-hub.c index 2b63487057c2..4256fdc5cd6d 100644 --- a/drivers/hid/hid-sensor-hub.c +++ b/drivers/hid/hid-sensor-hub.c @@ -299,7 +299,8 @@ EXPORT_SYMBOL_GPL(sensor_hub_get_feature); int sensor_hub_input_attr_get_raw_value(struct hid_sensor_hub_device *hsdev, u32 usage_id, u32 attr_usage_id, u32 report_id, - enum sensor_hub_read_flags flag) + enum sensor_hub_read_flags flag, + bool is_signed) { struct sensor_hub_data *data = hid_get_drvdata(hsdev->hdev); unsigned long flags; @@ -331,10 +332,16 @@ int sensor_hub_input_attr_get_raw_value(struct hid_sensor_hub_device *hsdev, &hsdev->pending.ready, HZ*5); switch (hsdev->pending.raw_size) { case 1: - ret_val = *(u8 *)hsdev->pending.raw_data; + if (is_signed) + ret_val = *(s8 *)hsdev->pending.raw_data; + else + ret_val = *(u8 *)hsdev->pending.raw_data; break; case 2: - ret_val = *(u16 *)hsdev->pending.raw_data; + if (is_signed) + ret_val = *(s16 *)hsdev->pending.raw_data; + else + ret_val = *(u16 *)hsdev->pending.raw_data; break; case 4: ret_val = *(u32 *)hsdev->pending.raw_data; diff --git a/drivers/iio/accel/hid-sensor-accel-3d.c b/drivers/iio/accel/hid-sensor-accel-3d.c index 41d97faf5013..38ff374a3ca4 100644 --- a/drivers/iio/accel/hid-sensor-accel-3d.c +++ b/drivers/iio/accel/hid-sensor-accel-3d.c @@ -149,6 +149,7 @@ static int accel_3d_read_raw(struct iio_dev *indio_dev, int report_id = -1; u32 address; int ret_type; + s32 min; struct hid_sensor_hub_device *hsdev = accel_state->common_attributes.hsdev; @@ -158,12 +159,14 @@ static int accel_3d_read_raw(struct iio_dev *indio_dev, case IIO_CHAN_INFO_RAW: hid_sensor_power_state(&accel_state->common_attributes, true); report_id = accel_state->accel[chan->scan_index].report_id; + min = accel_state->accel[chan->scan_index].logical_minimum; address = accel_3d_addresses[chan->scan_index]; if (report_id >= 0) *val = sensor_hub_input_attr_get_raw_value( accel_state->common_attributes.hsdev, hsdev->usage, address, report_id, - SENSOR_HUB_SYNC); + SENSOR_HUB_SYNC, + min < 0); else { *val = 0; 
hid_sensor_power_state(&accel_state->common_attributes, diff --git a/drivers/iio/gyro/hid-sensor-gyro-3d.c b/drivers/iio/gyro/hid-sensor-gyro-3d.c index 36941e69f959..88e857c4baf4 100644 --- a/drivers/iio/gyro/hid-sensor-gyro-3d.c +++ b/drivers/iio/gyro/hid-sensor-gyro-3d.c @@ -111,6 +111,7 @@ static int gyro_3d_read_raw(struct iio_dev *indio_dev, int report_id = -1; u32 address; int ret_type; + s32 min; *val = 0; *val2 = 0; @@ -118,13 +119,15 @@ static int gyro_3d_read_raw(struct iio_dev *indio_dev, case IIO_CHAN_INFO_RAW: hid_sensor_power_state(&gyro_state->common_attributes, true); report_id = gyro_state->gyro[chan->scan_index].report_id; + min = gyro_state->gyro[chan->scan_index].logical_minimum; address = gyro_3d_addresses[chan->scan_index]; if (report_id >= 0) *val = sensor_hub_input_attr_get_raw_value( gyro_state->common_attributes.hsdev, HID_USAGE_SENSOR_GYRO_3D, address, report_id, - SENSOR_HUB_SYNC); + SENSOR_HUB_SYNC, + min < 0); else { *val = 0; hid_sensor_power_state(&gyro_state->common_attributes, diff --git a/drivers/iio/humidity/hid-sensor-humidity.c b/drivers/iio/humidity/hid-sensor-humidity.c index beab6d6fd6e1..4bc95f31c730 100644 --- a/drivers/iio/humidity/hid-sensor-humidity.c +++ b/drivers/iio/humidity/hid-sensor-humidity.c @@ -75,7 +75,8 @@ static int humidity_read_raw(struct iio_dev *indio_dev, HID_USAGE_SENSOR_HUMIDITY, HID_USAGE_SENSOR_ATMOSPHERIC_HUMIDITY, humid_st->humidity_attr.report_id, - SENSOR_HUB_SYNC); + SENSOR_HUB_SYNC, + humid_st->humidity_attr.logical_minimum < 0); hid_sensor_power_state(&humid_st->common_attributes, false); return IIO_VAL_INT; diff --git a/drivers/iio/light/hid-sensor-als.c b/drivers/iio/light/hid-sensor-als.c index 406caaee9a3c..94f33250ba5a 100644 --- a/drivers/iio/light/hid-sensor-als.c +++ b/drivers/iio/light/hid-sensor-als.c @@ -93,6 +93,7 @@ static int als_read_raw(struct iio_dev *indio_dev, int report_id = -1; u32 address; int ret_type; + s32 min; *val = 0; *val2 = 0; @@ -102,8 +103,8 @@ static int als_read_raw(struct iio_dev *indio_dev, case CHANNEL_SCAN_INDEX_INTENSITY: case CHANNEL_SCAN_INDEX_ILLUM: report_id = als_state->als_illum.report_id; - address = - HID_USAGE_SENSOR_LIGHT_ILLUM; + min = als_state->als_illum.logical_minimum; + address = HID_USAGE_SENSOR_LIGHT_ILLUM; break; default: report_id = -1; @@ -116,7 +117,8 @@ static int als_read_raw(struct iio_dev *indio_dev, als_state->common_attributes.hsdev, HID_USAGE_SENSOR_ALS, address, report_id, - SENSOR_HUB_SYNC); + SENSOR_HUB_SYNC, + min < 0); hid_sensor_power_state(&als_state->common_attributes, false); } else { diff --git a/drivers/iio/light/hid-sensor-prox.c b/drivers/iio/light/hid-sensor-prox.c index 45107f7537b5..cf5a0c242609 100644 --- a/drivers/iio/light/hid-sensor-prox.c +++ b/drivers/iio/light/hid-sensor-prox.c @@ -73,6 +73,7 @@ static int prox_read_raw(struct iio_dev *indio_dev, int report_id = -1; u32 address; int ret_type; + s32 min; *val = 0; *val2 = 0; @@ -81,8 +82,8 @@ static int prox_read_raw(struct iio_dev *indio_dev, switch (chan->scan_index) { case CHANNEL_SCAN_INDEX_PRESENCE: report_id = prox_state->prox_attr.report_id; - address = - HID_USAGE_SENSOR_HUMAN_PRESENCE; + min = prox_state->prox_attr.logical_minimum; + address = HID_USAGE_SENSOR_HUMAN_PRESENCE; break; default: report_id = -1; @@ -95,7 +96,8 @@ static int prox_read_raw(struct iio_dev *indio_dev, prox_state->common_attributes.hsdev, HID_USAGE_SENSOR_PROX, address, report_id, - SENSOR_HUB_SYNC); + SENSOR_HUB_SYNC, + min < 0); hid_sensor_power_state(&prox_state->common_attributes, false); } 
else { diff --git a/drivers/iio/magnetometer/hid-sensor-magn-3d.c b/drivers/iio/magnetometer/hid-sensor-magn-3d.c index d55c4885211a..f3c0d41e5a8c 100644 --- a/drivers/iio/magnetometer/hid-sensor-magn-3d.c +++ b/drivers/iio/magnetometer/hid-sensor-magn-3d.c @@ -163,21 +163,23 @@ static int magn_3d_read_raw(struct iio_dev *indio_dev, int report_id = -1; u32 address; int ret_type; + s32 min; *val = 0; *val2 = 0; switch (mask) { case IIO_CHAN_INFO_RAW: hid_sensor_power_state(&magn_state->magn_flux_attributes, true); - report_id = - magn_state->magn[chan->address].report_id; + report_id = magn_state->magn[chan->address].report_id; + min = magn_state->magn[chan->address].logical_minimum; address = magn_3d_addresses[chan->address]; if (report_id >= 0) *val = sensor_hub_input_attr_get_raw_value( magn_state->magn_flux_attributes.hsdev, HID_USAGE_SENSOR_COMPASS_3D, address, report_id, - SENSOR_HUB_SYNC); + SENSOR_HUB_SYNC, + min < 0); else { *val = 0; hid_sensor_power_state( diff --git a/drivers/iio/orientation/hid-sensor-incl-3d.c b/drivers/iio/orientation/hid-sensor-incl-3d.c index 1e5451d1ff88..bdc5e4554ee4 100644 --- a/drivers/iio/orientation/hid-sensor-incl-3d.c +++ b/drivers/iio/orientation/hid-sensor-incl-3d.c @@ -111,21 +111,23 @@ static int incl_3d_read_raw(struct iio_dev *indio_dev, int report_id = -1; u32 address; int ret_type; + s32 min; *val = 0; *val2 = 0; switch (mask) { case IIO_CHAN_INFO_RAW: hid_sensor_power_state(&incl_state->common_attributes, true); - report_id = - incl_state->incl[chan->scan_index].report_id; + report_id = incl_state->incl[chan->scan_index].report_id; + min = incl_state->incl[chan->scan_index].logical_minimum; address = incl_3d_addresses[chan->scan_index]; if (report_id >= 0) *val = sensor_hub_input_attr_get_raw_value( incl_state->common_attributes.hsdev, HID_USAGE_SENSOR_INCLINOMETER_3D, address, report_id, - SENSOR_HUB_SYNC); + SENSOR_HUB_SYNC, + min < 0); else { hid_sensor_power_state(&incl_state->common_attributes, false); diff --git a/drivers/iio/pressure/hid-sensor-press.c b/drivers/iio/pressure/hid-sensor-press.c index 4c437918f1d2..d7b1c00ceb4d 100644 --- a/drivers/iio/pressure/hid-sensor-press.c +++ b/drivers/iio/pressure/hid-sensor-press.c @@ -77,6 +77,7 @@ static int press_read_raw(struct iio_dev *indio_dev, int report_id = -1; u32 address; int ret_type; + s32 min; *val = 0; *val2 = 0; @@ -85,8 +86,8 @@ static int press_read_raw(struct iio_dev *indio_dev, switch (chan->scan_index) { case CHANNEL_SCAN_INDEX_PRESSURE: report_id = press_state->press_attr.report_id; - address = - HID_USAGE_SENSOR_ATMOSPHERIC_PRESSURE; + min = press_state->press_attr.logical_minimum; + address = HID_USAGE_SENSOR_ATMOSPHERIC_PRESSURE; break; default: report_id = -1; @@ -99,7 +100,8 @@ static int press_read_raw(struct iio_dev *indio_dev, press_state->common_attributes.hsdev, HID_USAGE_SENSOR_PRESSURE, address, report_id, - SENSOR_HUB_SYNC); + SENSOR_HUB_SYNC, + min < 0); hid_sensor_power_state(&press_state->common_attributes, false); } else { diff --git a/drivers/iio/temperature/hid-sensor-temperature.c b/drivers/iio/temperature/hid-sensor-temperature.c index beaf6fd3e337..b592fc4f007e 100644 --- a/drivers/iio/temperature/hid-sensor-temperature.c +++ b/drivers/iio/temperature/hid-sensor-temperature.c @@ -76,7 +76,8 @@ static int temperature_read_raw(struct iio_dev *indio_dev, HID_USAGE_SENSOR_TEMPERATURE, HID_USAGE_SENSOR_DATA_ENVIRONMENTAL_TEMPERATURE, temp_st->temperature_attr.report_id, - SENSOR_HUB_SYNC); + SENSOR_HUB_SYNC, + temp_st->temperature_attr.logical_minimum < 
0); hid_sensor_power_state( &temp_st->common_attributes, false); diff --git a/drivers/rtc/rtc-hid-sensor-time.c b/drivers/rtc/rtc-hid-sensor-time.c index 2751dba850c6..3e1abb455472 100644 --- a/drivers/rtc/rtc-hid-sensor-time.c +++ b/drivers/rtc/rtc-hid-sensor-time.c @@ -213,7 +213,7 @@ static int hid_rtc_read_time(struct device *dev, struct rtc_time *tm) /* get a report with all values through requesting one value */ sensor_hub_input_attr_get_raw_value(time_state->common_attributes.hsdev, HID_USAGE_SENSOR_TIME, hid_time_addresses[0], - time_state->info[0].report_id, SENSOR_HUB_SYNC); + time_state->info[0].report_id, SENSOR_HUB_SYNC, false); /* wait for all values (event) */ ret = wait_for_completion_killable_timeout( &time_state->comp_last_time, HZ*6); diff --git a/include/linux/hid-sensor-hub.h b/include/linux/hid-sensor-hub.h index 331dc377c275..dc12f5c4b076 100644 --- a/include/linux/hid-sensor-hub.h +++ b/include/linux/hid-sensor-hub.h @@ -177,6 +177,7 @@ int sensor_hub_input_get_attribute_info(struct hid_sensor_hub_device *hsdev, * @attr_usage_id: Attribute usage id as per spec * @report_id: Report id to look for * @flag: Synchronous or asynchronous read +* @is_signed: If true then fields < 32 bits will be sign-extended * * Issues a synchronous or asynchronous read request for an input attribute. * Returns data upto 32 bits. @@ -190,7 +191,8 @@ enum sensor_hub_read_flags { int sensor_hub_input_attr_get_raw_value(struct hid_sensor_hub_device *hsdev, u32 usage_id, u32 attr_usage_id, u32 report_id, - enum sensor_hub_read_flags flag + enum sensor_hub_read_flags flag, + bool is_signed ); /** -- cgit v1.2.3-71-gd317 From b34087157dd76e8d96e5e52808134a791ac61e57 Mon Sep 17 00:00:00 2001 From: Robin Murphy Date: Wed, 21 Nov 2018 16:00:50 +0000 Subject: dma-direct: Make DIRECT_MAPPING_ERROR viable for SWIOTLB With the overflow buffer removed, we no longer have a unique address which is guaranteed not to be a valid DMA target to use as an error token. The DIRECT_MAPPING_ERROR value of 0 tries to at least represent an unlikely DMA target, but unfortunately there are already SWIOTLB users with DMA-able memory at physical address 0 which now gets falsely treated as a mapping failure and leads to all manner of misbehaviour. The best we can do to mitigate that is flip DIRECT_MAPPING_ERROR to the other commonly-used error value of all-bits-set, since the last single byte of memory is by far the least-likely-valid DMA target. Fixes: dff8d6c1ed58 ("swiotlb: remove the overflow buffer") Reported-by: John Stultz Tested-by: John Stultz Acked-by: Konrad Rzeszutek Wilk Signed-off-by: Robin Murphy Signed-off-by: Christoph Hellwig --- include/linux/dma-direct.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'include/linux') diff --git a/include/linux/dma-direct.h b/include/linux/dma-direct.h index bd73e7a91410..9e66bfe369aa 100644 --- a/include/linux/dma-direct.h +++ b/include/linux/dma-direct.h @@ -5,7 +5,7 @@ #include #include -#define DIRECT_MAPPING_ERROR 0 +#define DIRECT_MAPPING_ERROR (~(dma_addr_t)0) #ifdef CONFIG_ARCH_HAS_PHYS_TO_DMA #include -- cgit v1.2.3-71-gd317 From 86de5921a3d5dd246df661e09bdd0a6131b39ae3 Mon Sep 17 00:00:00 2001 From: Eric Dumazet Date: Tue, 20 Nov 2018 05:53:59 -0800 Subject: tcp: defer SACK compression after DupThresh Jean-Louis reported a TCP regression and bisected to recent SACK compression. After a loss episode (receiver not able to keep up and dropping packets because its backlog is full), linux TCP stack is sending a single SACK (DUPACK). 
Sender waits a full RTO timer before recovering losses. While RFC 6675 says in section 5, "Algorithm Details", (2) If DupAcks < DupThresh but IsLost (HighACK + 1) returns true -- indicating at least three segments have arrived above the current cumulative acknowledgment point, which is taken to indicate loss -- go to step (4). ... (4) Invoke fast retransmit and enter loss recovery as follows: there are old TCP stacks not implementing this strategy, and still counting the dupacks before starting fast retransmit. While these stacks probably perform poorly when receivers implement LRO/GRO, we should be a little more gentle to them. This patch makes sure we do not enable SACK compression unless 3 dupacks have been sent since last rcv_nxt update. Ideally we should even rearm the timer to send one or two more DUPACK if no more packets are coming, but that will be work aiming for linux-4.21. Many thanks to Jean-Louis for bisecting the issue, providing packet captures and testing this patch. Fixes: 5d9f4262b7ea ("tcp: add SACK compression") Reported-by: Jean-Louis Dupond Tested-by: Jean-Louis Dupond Signed-off-by: Eric Dumazet Acked-by: Neal Cardwell Signed-off-by: David S. Miller --- include/linux/tcp.h | 1 + net/ipv4/tcp_input.c | 14 ++++++++++++-- net/ipv4/tcp_output.c | 6 +++--- net/ipv4/tcp_timer.c | 2 +- 4 files changed, 17 insertions(+), 6 deletions(-) (limited to 'include/linux') diff --git a/include/linux/tcp.h b/include/linux/tcp.h index 8ed77bb4ed86..a9b0280687d5 100644 --- a/include/linux/tcp.h +++ b/include/linux/tcp.h @@ -196,6 +196,7 @@ struct tcp_sock { u32 rcv_tstamp; /* timestamp of last received ACK (for keepalives) */ u32 lsndtime; /* timestamp of last sent data packet (for restart window) */ u32 last_oow_ack_time; /* timestamp of last out-of-window ACK */ + u32 compressed_ack_rcv_nxt; u32 tsoffset; /* timestamp offset */ diff --git a/net/ipv4/tcp_input.c b/net/ipv4/tcp_input.c index e695584bb33f..1e37c1388189 100644 --- a/net/ipv4/tcp_input.c +++ b/net/ipv4/tcp_input.c @@ -4268,7 +4268,7 @@ static void tcp_sack_new_ofo_skb(struct sock *sk, u32 seq, u32 end_seq) * If the sack array is full, forget about the last one. 
*/ if (this_sack >= TCP_NUM_SACKS) { - if (tp->compressed_ack) + if (tp->compressed_ack > TCP_FASTRETRANS_THRESH) tcp_send_ack(sk); this_sack--; tp->rx_opt.num_sacks--; @@ -5189,7 +5189,17 @@ send_now: if (!tcp_is_sack(tp) || tp->compressed_ack >= sock_net(sk)->ipv4.sysctl_tcp_comp_sack_nr) goto send_now; - tp->compressed_ack++; + + if (tp->compressed_ack_rcv_nxt != tp->rcv_nxt) { + tp->compressed_ack_rcv_nxt = tp->rcv_nxt; + if (tp->compressed_ack > TCP_FASTRETRANS_THRESH) + NET_ADD_STATS(sock_net(sk), LINUX_MIB_TCPACKCOMPRESSED, + tp->compressed_ack - TCP_FASTRETRANS_THRESH); + tp->compressed_ack = 0; + } + + if (++tp->compressed_ack <= TCP_FASTRETRANS_THRESH) + goto send_now; if (hrtimer_is_queued(&tp->compressed_ack_timer)) return; diff --git a/net/ipv4/tcp_output.c b/net/ipv4/tcp_output.c index 9c34b97d365d..3f510cad0b3e 100644 --- a/net/ipv4/tcp_output.c +++ b/net/ipv4/tcp_output.c @@ -180,10 +180,10 @@ static inline void tcp_event_ack_sent(struct sock *sk, unsigned int pkts, { struct tcp_sock *tp = tcp_sk(sk); - if (unlikely(tp->compressed_ack)) { + if (unlikely(tp->compressed_ack > TCP_FASTRETRANS_THRESH)) { NET_ADD_STATS(sock_net(sk), LINUX_MIB_TCPACKCOMPRESSED, - tp->compressed_ack); - tp->compressed_ack = 0; + tp->compressed_ack - TCP_FASTRETRANS_THRESH); + tp->compressed_ack = TCP_FASTRETRANS_THRESH; if (hrtimer_try_to_cancel(&tp->compressed_ack_timer) == 1) __sock_put(sk); } diff --git a/net/ipv4/tcp_timer.c b/net/ipv4/tcp_timer.c index 676020663ce8..5f8b6d3cd855 100644 --- a/net/ipv4/tcp_timer.c +++ b/net/ipv4/tcp_timer.c @@ -740,7 +740,7 @@ static enum hrtimer_restart tcp_compressed_ack_kick(struct hrtimer *timer) bh_lock_sock(sk); if (!sock_owned_by_user(sk)) { - if (tp->compressed_ack) + if (tp->compressed_ack > TCP_FASTRETRANS_THRESH) tcp_send_ack(sk); } else { if (!test_and_set_bit(TCP_DELACK_TIMER_DEFERRED, -- cgit v1.2.3-71-gd317 From 0211dda68a4f6531923a2f72d8e8959207f59fba Mon Sep 17 00:00:00 2001 From: Tal Gilboa Date: Wed, 21 Nov 2018 16:28:23 +0200 Subject: net/dim: Update DIM start sample after each DIM iteration On every iteration of net_dim, the algorithm may choose to check for the system state by comparing current data sample with previous data sample. After each of these comparison, regardless of the action taken, the sample used as baseline is needed to be updated. This patch fixes a bug that causes DIM to take wrong decisions, due to never updating the baseline sample for comparison between iterations. This way, DIM always compares current sample with zeros. Although this is a functional fix, it also improves and stabilizes performance as the algorithm works properly now. Performance: Tested single UDP TX stream with pktgen: samples/pktgen/pktgen_sample03_burst_single_flow.sh -i p4p2 -d 1.1.1.1 -m 24:8a:07:88:26:8b -f 3 -b 128 ConnectX-5 100GbE packet rate improved from 15-19Mpps to 19-20Mpps. Also, toggling between profiles is less frequent with the fix. Fixes: 8115b750dbcb ("net/dim: use struct net_dim_sample as arg to net_dim") Signed-off-by: Tal Gilboa Reviewed-by: Tariq Toukan Signed-off-by: David S. 
Miller --- include/linux/net_dim.h | 2 ++ 1 file changed, 2 insertions(+) (limited to 'include/linux') diff --git a/include/linux/net_dim.h b/include/linux/net_dim.h index c79e859408e6..fd458389f7d1 100644 --- a/include/linux/net_dim.h +++ b/include/linux/net_dim.h @@ -406,6 +406,8 @@ static inline void net_dim(struct net_dim *dim, } /* fall through */ case NET_DIM_START_MEASURE: + net_dim_sample(end_sample.event_ctr, end_sample.pkt_ctr, end_sample.byte_ctr, + &dim->start_sample); dim->state = NET_DIM_MEASURE_IN_PROGRESS; break; case NET_DIM_APPLY_NEW_PROFILE: -- cgit v1.2.3-71-gd317 From 5cd8d46ea1562be80063f53c7c6a5f40224de623 Mon Sep 17 00:00:00 2001 From: Willem de Bruijn Date: Tue, 20 Nov 2018 13:00:18 -0500 Subject: packet: copy user buffers before orphan or clone tpacket_snd sends packets with user pages linked into skb frags. It notifies that pages can be reused when the skb is released by setting skb->destructor to tpacket_destruct_skb. This can cause data corruption if the skb is orphaned (e.g., on transmit through veth) or cloned (e.g., on mirror to another psock). Create a kernel-private copy of data in these cases, same as tun/tap zerocopy transmission. Reuse that infrastructure: mark the skb as SKBTX_ZEROCOPY_FRAG, which will trigger copy in skb_orphan_frags(_rx). Unlike other zerocopy packets, do not set shinfo destructor_arg to struct ubuf_info. tpacket_destruct_skb already uses that ptr to notify when the original skb is released and a timestamp is recorded. Do not change this timestamp behavior. The ubuf_info->callback is not needed anyway, as no zerocopy notification is expected. Mark destructor_arg as not-a-uarg by setting the lower bit to 1. The resulting value is not a valid ubuf_info pointer, nor a valid tpacket_snd frame address. Add skb_zcopy_.._nouarg helpers for this. The fix relies on features introduced in commit 52267790ef52 ("sock: add MSG_ZEROCOPY"), so can be backported as is only to 4.14. Tested with from `./in_netns.sh ./txring_overwrite` from http://github.com/wdebruij/kerneltools/tests Fixes: 69e3c75f4d54 ("net: TX_RING and packet mmap") Reported-by: Anand H. Krishnan Signed-off-by: Willem de Bruijn Signed-off-by: David S. 
Miller --- include/linux/skbuff.h | 18 +++++++++++++++++- net/packet/af_packet.c | 4 ++-- 2 files changed, 19 insertions(+), 3 deletions(-) (limited to 'include/linux') diff --git a/include/linux/skbuff.h b/include/linux/skbuff.h index 0ba687454267..0d1b2c3f127b 100644 --- a/include/linux/skbuff.h +++ b/include/linux/skbuff.h @@ -1326,6 +1326,22 @@ static inline void skb_zcopy_set(struct sk_buff *skb, struct ubuf_info *uarg) } } +static inline void skb_zcopy_set_nouarg(struct sk_buff *skb, void *val) +{ + skb_shinfo(skb)->destructor_arg = (void *)((uintptr_t) val | 0x1UL); + skb_shinfo(skb)->tx_flags |= SKBTX_ZEROCOPY_FRAG; +} + +static inline bool skb_zcopy_is_nouarg(struct sk_buff *skb) +{ + return (uintptr_t) skb_shinfo(skb)->destructor_arg & 0x1UL; +} + +static inline void *skb_zcopy_get_nouarg(struct sk_buff *skb) +{ + return (void *)((uintptr_t) skb_shinfo(skb)->destructor_arg & ~0x1UL); +} + /* Release a reference on a zerocopy structure */ static inline void skb_zcopy_clear(struct sk_buff *skb, bool zerocopy) { @@ -1335,7 +1351,7 @@ static inline void skb_zcopy_clear(struct sk_buff *skb, bool zerocopy) if (uarg->callback == sock_zerocopy_callback) { uarg->zerocopy = uarg->zerocopy && zerocopy; sock_zerocopy_put(uarg); - } else { + } else if (!skb_zcopy_is_nouarg(skb)) { uarg->callback(uarg, zerocopy); } diff --git a/net/packet/af_packet.c b/net/packet/af_packet.c index ec3095f13aae..a74650e98f42 100644 --- a/net/packet/af_packet.c +++ b/net/packet/af_packet.c @@ -2394,7 +2394,7 @@ static void tpacket_destruct_skb(struct sk_buff *skb) void *ph; __u32 ts; - ph = skb_shinfo(skb)->destructor_arg; + ph = skb_zcopy_get_nouarg(skb); packet_dec_pending(&po->tx_ring); ts = __packet_set_timestamp(po, ph, skb); @@ -2461,7 +2461,7 @@ static int tpacket_fill_skb(struct packet_sock *po, struct sk_buff *skb, skb->mark = po->sk.sk_mark; skb->tstamp = sockc->transmit_time; sock_tx_timestamp(&po->sk, sockc->tsflags, &skb_shinfo(skb)->tx_flags); - skb_shinfo(skb)->destructor_arg = ph.raw; + skb_zcopy_set_nouarg(skb, ph.raw); skb_reserve(skb, hlen); skb_reset_network_header(skb); -- cgit v1.2.3-71-gd317 From 89259088c1b7fecb43e8e245dc931909132a4e03 Mon Sep 17 00:00:00 2001 From: Florian Westphal Date: Sat, 17 Nov 2018 11:32:29 +0100 Subject: netfilter: nfnetlink_cttimeout: fetch timeouts for udplite and gre, too syzbot was able to trigger the WARN in cttimeout_default_get() by passing UDPLITE as l4protocol. Alias UDPLITE to UDP, both use same timeout values. Furthermore, also fetch GRE timeouts. GRE is a bit more complicated, as it still can be a module and its netns_proto_gre struct layout isn't visible outside of the gre module. Can't move timeouts around, it appears conntrack sysctl unregister assumes net_generic() returns nf_proto_net, so we get crash. Expose layout of netns_proto_gre instead. A followup nf-next patch could make gre tracker be built-in as well if needed, its not that large. Last, make the WARN() mention the missing protocol value in case anything else is missing. 
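The pointer-tagging trick behind the skb_zcopy_*_nouarg() helpers above is a general pattern worth a hedged stand-alone sketch:

    /* Pointers to objects aligned to at least 2 bytes never have bit 0
     * set, so that bit can carry a flag as long as every reader strips
     * it before dereferencing.
     */
    static inline void *tag_ptr(void *p)
    {
            return (void *)((uintptr_t)p | 0x1UL);
    }

    static inline bool ptr_is_tagged(const void *p)
    {
            return (uintptr_t)p & 0x1UL;
    }

    static inline void *untag_ptr(void *p)
    {
            return (void *)((uintptr_t)p & ~0x1UL);
    }

It is safe in the af_packet case because neither a struct ubuf_info pointer nor a tpacket frame address can validly have bit 0 set, so tagged and untagged values never collide.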
Reported-by: syzbot+2fae8fa157dd92618cae@syzkaller.appspotmail.com Fixes: 8866df9264a3 ("netfilter: nfnetlink_cttimeout: pass default timeout policy to obj_to_nlattr") Signed-off-by: Florian Westphal Signed-off-by: Pablo Neira Ayuso --- include/linux/netfilter/nf_conntrack_proto_gre.h | 13 +++++++++++++ net/netfilter/nf_conntrack_proto_gre.c | 14 ++------------ net/netfilter/nfnetlink_cttimeout.c | 15 +++++++++++++-- 3 files changed, 28 insertions(+), 14 deletions(-) (limited to 'include/linux') diff --git a/include/linux/netfilter/nf_conntrack_proto_gre.h b/include/linux/netfilter/nf_conntrack_proto_gre.h index b8d95564bd53..14edb795ab43 100644 --- a/include/linux/netfilter/nf_conntrack_proto_gre.h +++ b/include/linux/netfilter/nf_conntrack_proto_gre.h @@ -21,6 +21,19 @@ struct nf_ct_gre_keymap { struct nf_conntrack_tuple tuple; }; +enum grep_conntrack { + GRE_CT_UNREPLIED, + GRE_CT_REPLIED, + GRE_CT_MAX +}; + +struct netns_proto_gre { + struct nf_proto_net nf; + rwlock_t keymap_lock; + struct list_head keymap_list; + unsigned int gre_timeouts[GRE_CT_MAX]; +}; + /* add new tuple->key_reply pair to keymap */ int nf_ct_gre_keymap_add(struct nf_conn *ct, enum ip_conntrack_dir dir, struct nf_conntrack_tuple *t); diff --git a/net/netfilter/nf_conntrack_proto_gre.c b/net/netfilter/nf_conntrack_proto_gre.c index 9b48dc8b4b88..2a5e56c6d8d9 100644 --- a/net/netfilter/nf_conntrack_proto_gre.c +++ b/net/netfilter/nf_conntrack_proto_gre.c @@ -43,24 +43,12 @@ #include #include -enum grep_conntrack { - GRE_CT_UNREPLIED, - GRE_CT_REPLIED, - GRE_CT_MAX -}; - static const unsigned int gre_timeouts[GRE_CT_MAX] = { [GRE_CT_UNREPLIED] = 30*HZ, [GRE_CT_REPLIED] = 180*HZ, }; static unsigned int proto_gre_net_id __read_mostly; -struct netns_proto_gre { - struct nf_proto_net nf; - rwlock_t keymap_lock; - struct list_head keymap_list; - unsigned int gre_timeouts[GRE_CT_MAX]; -}; static inline struct netns_proto_gre *gre_pernet(struct net *net) { @@ -402,6 +390,8 @@ static int __init nf_ct_proto_gre_init(void) { int ret; + BUILD_BUG_ON(offsetof(struct netns_proto_gre, nf) != 0); + ret = register_pernet_subsys(&proto_gre_net_ops); if (ret < 0) goto out_pernet; diff --git a/net/netfilter/nfnetlink_cttimeout.c b/net/netfilter/nfnetlink_cttimeout.c index a518eb162344..109b0d27345a 100644 --- a/net/netfilter/nfnetlink_cttimeout.c +++ b/net/netfilter/nfnetlink_cttimeout.c @@ -455,7 +455,8 @@ static int cttimeout_default_get(struct net *net, struct sock *ctnl, case IPPROTO_TCP: timeouts = nf_tcp_pernet(net)->timeouts; break; - case IPPROTO_UDP: + case IPPROTO_UDP: /* fallthrough */ + case IPPROTO_UDPLITE: timeouts = nf_udp_pernet(net)->timeouts; break; case IPPROTO_DCCP: @@ -469,13 +470,23 @@ static int cttimeout_default_get(struct net *net, struct sock *ctnl, case IPPROTO_SCTP: #ifdef CONFIG_NF_CT_PROTO_SCTP timeouts = nf_sctp_pernet(net)->timeouts; +#endif + break; + case IPPROTO_GRE: +#ifdef CONFIG_NF_CT_PROTO_GRE + if (l4proto->net_id) { + struct netns_proto_gre *net_gre; + + net_gre = net_generic(net, *l4proto->net_id); + timeouts = net_gre->gre_timeouts; + } #endif break; case 255: timeouts = &nf_generic_pernet(net)->timeout; break; default: - WARN_ON_ONCE(1); + WARN_ONCE(1, "Missing timeouts for proto %d", l4proto->l4proto); break; } -- cgit v1.2.3-71-gd317 From 786a9ab1330169f2602238822b4df5d5c4c98f6c Mon Sep 17 00:00:00 2001 From: Bartosz Golaszewski Date: Wed, 21 Nov 2018 10:35:17 +0100 Subject: gpio: davinci: restore a way to manually specify the GPIO base Commit 587f7a694f01 ("gpio: davinci: Use dev name for 
label and automatic base selection") broke the network support in legacy boot mode for da850-evm since we can no longer request the MDIO clock GPIO. Other boards may be broken too, which I haven't tested. The problem is in the fact that most board files still use the legacy GPIO API where lines are requested by numbers rather than descriptors. While this should be fixed eventually, in order to unbreak the board for now - provide a way to manually specify the GPIO base in platform data. Fixes: 587f7a694f01 ("gpio: davinci: Use dev name for label and automatic base selection") Cc: stable@vger.kernel.org Signed-off-by: Bartosz Golaszewski Acked-by: Linus Walleij Signed-off-by: Sekhar Nori --- drivers/gpio/gpio-davinci.c | 2 +- include/linux/platform_data/gpio-davinci.h | 2 ++ 2 files changed, 3 insertions(+), 1 deletion(-) (limited to 'include/linux') diff --git a/drivers/gpio/gpio-davinci.c b/drivers/gpio/gpio-davinci.c index 5c1564fcc24e..bdb29e51b417 100644 --- a/drivers/gpio/gpio-davinci.c +++ b/drivers/gpio/gpio-davinci.c @@ -258,7 +258,7 @@ static int davinci_gpio_probe(struct platform_device *pdev) chips->chip.set = davinci_gpio_set; chips->chip.ngpio = ngpio; - chips->chip.base = -1; + chips->chip.base = pdata->no_auto_base ? pdata->base : -1; #ifdef CONFIG_OF_GPIO chips->chip.of_gpio_n_cells = 2; diff --git a/include/linux/platform_data/gpio-davinci.h b/include/linux/platform_data/gpio-davinci.h index f92a47e18034..a93841bfb9f7 100644 --- a/include/linux/platform_data/gpio-davinci.h +++ b/include/linux/platform_data/gpio-davinci.h @@ -17,6 +17,8 @@ #define __DAVINCI_GPIO_PLATFORM_H struct davinci_gpio_platform_data { + bool no_auto_base; + u32 base; u32 ngpio; u32 gpio_unbanked; }; -- cgit v1.2.3-71-gd317 From 8114865ff82e200b383e46821c25cb0625b842b5 Mon Sep 17 00:00:00 2001 From: "Steven Rostedt (VMware)" Date: Sun, 18 Nov 2018 17:10:15 -0500 Subject: function_graph: Create function_graph_enter() to consolidate architecture code Currently all the architectures do basically the same thing in preparing the function graph tracer on entry to a function. This code can be pulled into a generic location and then this will allow the function graph tracer to be fixed, as well as extended. Create a new function graph helper function_graph_enter() that will call the hook function (ftrace_graph_entry) and the shadow stack operation (ftrace_push_return_trace), and remove the need of the architecture code to manage the shadow stack. This is needed to prepare for a fix of a design bug on how the curr_ret_stack is used. 
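For illustration, a minimal sketch of what an architecture's entry hook reduces to once this helper exists. This is modeled loosely on the x86 prepare_ftrace_return(); the exact signature and pause check vary per architecture and are assumptions here, not the literal patched code:

#include <linux/ftrace.h>
#include <linux/sched.h>

void prepare_ftrace_return(unsigned long self_addr, unsigned long *parent,
			   unsigned long frame_pointer)
{
	unsigned long return_hooker = (unsigned long)&return_to_handler;
	unsigned long old = *parent;

	if (unlikely(atomic_read(&current->tracing_graph_pause)))
		return;

	/*
	 * One call now performs both the ftrace_graph_entry() hook and the
	 * shadow-stack push; the return address is only redirected to
	 * return_to_handler if the core accepted the trace.
	 */
	if (!function_graph_enter(old, self_addr, frame_pointer, parent))
		*parent = return_hooker;
}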
Cc: stable@kernel.org Fixes: 03274a3ffb449 ("tracing/fgraph: Adjust fgraph depth before calling trace return callback") Reviewed-by: Masami Hiramatsu Signed-off-by: Steven Rostedt (VMware) --- include/linux/ftrace.h | 3 +++ kernel/trace/trace_functions_graph.c | 16 ++++++++++++++++ 2 files changed, 19 insertions(+) (limited to 'include/linux') diff --git a/include/linux/ftrace.h b/include/linux/ftrace.h index a397907e8d72..5717e8f81c59 100644 --- a/include/linux/ftrace.h +++ b/include/linux/ftrace.h @@ -779,6 +779,9 @@ extern void return_to_handler(void); extern int ftrace_push_return_trace(unsigned long ret, unsigned long func, int *depth, unsigned long frame_pointer, unsigned long *retp); +extern int +function_graph_enter(unsigned long ret, unsigned long func, + unsigned long frame_pointer, unsigned long *retp); unsigned long ftrace_graph_ret_addr(struct task_struct *task, int *idx, unsigned long ret, unsigned long *retp); diff --git a/kernel/trace/trace_functions_graph.c b/kernel/trace/trace_functions_graph.c index 169b3c44ee97..28f2602435d0 100644 --- a/kernel/trace/trace_functions_graph.c +++ b/kernel/trace/trace_functions_graph.c @@ -182,6 +182,22 @@ ftrace_push_return_trace(unsigned long ret, unsigned long func, int *depth, return 0; } +int function_graph_enter(unsigned long ret, unsigned long func, + unsigned long frame_pointer, unsigned long *retp) +{ + struct ftrace_graph_ent trace; + + trace.func = func; + trace.depth = current->curr_ret_stack + 1; + + /* Only trace if the calling function expects to */ + if (!ftrace_graph_entry(&trace)) + return -EBUSY; + + return ftrace_push_return_trace(ret, func, &trace.depth, + frame_pointer, retp); +} + /* Retrieve a function return address to the trace stack on thread info.*/ static void ftrace_pop_return_trace(struct ftrace_graph_ret *trace, unsigned long *ret, -- cgit v1.2.3-71-gd317 From e2c95a61656d29ceaac97b6a975c8a1f26e26f15 Mon Sep 17 00:00:00 2001 From: Daniel Borkmann Date: Mon, 26 Nov 2018 14:05:38 +0100 Subject: bpf, ppc64: generalize fetching subprog into bpf_jit_get_func_addr Make fetching of the BPF call address from ppc64 JIT generic. ppc64 was using a slightly different variant rather than through the insns' imm field encoding as the target address would not fit into that space. Therefore, the target subprog number was encoded into the insns' offset and fetched through fp->aux->func[off]->bpf_func instead. Given there are other JITs with this issue and the mechanism of fetching the address is JIT-generic, move it into the core as a helper instead. On the JIT side, we get information on whether the retrieved address is a fixed one, that is, not changing through JIT passes, or a dynamic one. For the former, JITs can optimize their imm emission because this doesn't change jump offsets throughout JIT process. 
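As a hedged sketch of how another JIT would consume the new helper (emit_call_fixed() and emit_call_reloc() are hypothetical per-arch emitters, named here only for illustration):

case BPF_JMP | BPF_CALL: {
	bool func_addr_fixed;
	u64 func_addr;
	int err;

	err = bpf_jit_get_func_addr(fp, &insn[i], extra_pass,
				    &func_addr, &func_addr_fixed);
	if (err < 0)
		return err;
	if (func_addr_fixed)
		/* Helper call: address is stable across JIT passes, so the
		 * immediate encoding can be optimized freely. */
		emit_call_fixed(ctx, func_addr);
	else
		/* Subprog call: address settles only in the extra pass, so
		 * emission must stay size-stable to keep jump offsets valid. */
		emit_call_reloc(ctx, func_addr);
	break;
}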
Signed-off-by: Daniel Borkmann Reviewed-by: Sandipan Das Tested-by: Sandipan Das Signed-off-by: Alexei Starovoitov --- arch/powerpc/net/bpf_jit_comp64.c | 57 ++++++++++++++++++++++++++------------- include/linux/filter.h | 4 +++ kernel/bpf/core.c | 34 +++++++++++++++++++++++ 3 files changed, 76 insertions(+), 19 deletions(-) (limited to 'include/linux') diff --git a/arch/powerpc/net/bpf_jit_comp64.c b/arch/powerpc/net/bpf_jit_comp64.c index 50b129785aee..17482f5de3e2 100644 --- a/arch/powerpc/net/bpf_jit_comp64.c +++ b/arch/powerpc/net/bpf_jit_comp64.c @@ -166,7 +166,33 @@ static void bpf_jit_build_epilogue(u32 *image, struct codegen_context *ctx) PPC_BLR(); } -static void bpf_jit_emit_func_call(u32 *image, struct codegen_context *ctx, u64 func) +static void bpf_jit_emit_func_call_hlp(u32 *image, struct codegen_context *ctx, + u64 func) +{ +#ifdef PPC64_ELF_ABI_v1 + /* func points to the function descriptor */ + PPC_LI64(b2p[TMP_REG_2], func); + /* Load actual entry point from function descriptor */ + PPC_BPF_LL(b2p[TMP_REG_1], b2p[TMP_REG_2], 0); + /* ... and move it to LR */ + PPC_MTLR(b2p[TMP_REG_1]); + /* + * Load TOC from function descriptor at offset 8. + * We can clobber r2 since we get called through a + * function pointer (so caller will save/restore r2) + * and since we don't use a TOC ourself. + */ + PPC_BPF_LL(2, b2p[TMP_REG_2], 8); +#else + /* We can clobber r12 */ + PPC_FUNC_ADDR(12, func); + PPC_MTLR(12); +#endif + PPC_BLRL(); +} + +static void bpf_jit_emit_func_call_rel(u32 *image, struct codegen_context *ctx, + u64 func) { unsigned int i, ctx_idx = ctx->idx; @@ -273,7 +299,7 @@ static int bpf_jit_build_body(struct bpf_prog *fp, u32 *image, { const struct bpf_insn *insn = fp->insnsi; int flen = fp->len; - int i; + int i, ret; /* Start of epilogue code - will only be valid 2nd pass onwards */ u32 exit_addr = addrs[flen]; @@ -284,8 +310,9 @@ static int bpf_jit_build_body(struct bpf_prog *fp, u32 *image, u32 src_reg = b2p[insn[i].src_reg]; s16 off = insn[i].off; s32 imm = insn[i].imm; + bool func_addr_fixed; + u64 func_addr; u64 imm64; - u8 *func; u32 true_cond; u32 tmp_idx; @@ -711,23 +738,15 @@ emit_clear: case BPF_JMP | BPF_CALL: ctx->seen |= SEEN_FUNC; - /* bpf function call */ - if (insn[i].src_reg == BPF_PSEUDO_CALL) - if (!extra_pass) - func = NULL; - else if (fp->aux->func && off < fp->aux->func_cnt) - /* use the subprog id from the off - * field to lookup the callee address - */ - func = (u8 *) fp->aux->func[off]->bpf_func; - else - return -EINVAL; - /* kernel helper call */ - else - func = (u8 *) __bpf_call_base + imm; - - bpf_jit_emit_func_call(image, ctx, (u64)func); + ret = bpf_jit_get_func_addr(fp, &insn[i], extra_pass, + &func_addr, &func_addr_fixed); + if (ret < 0) + return ret; + if (func_addr_fixed) + bpf_jit_emit_func_call_hlp(image, ctx, func_addr); + else + bpf_jit_emit_func_call_rel(image, ctx, func_addr); /* move return value from r3 to BPF_REG_0 */ PPC_MR(b2p[BPF_REG_0], 3); break; diff --git a/include/linux/filter.h b/include/linux/filter.h index de629b706d1d..448dcc448f1f 100644 --- a/include/linux/filter.h +++ b/include/linux/filter.h @@ -866,6 +866,10 @@ void bpf_jit_binary_free(struct bpf_binary_header *hdr); void bpf_jit_free(struct bpf_prog *fp); +int bpf_jit_get_func_addr(const struct bpf_prog *prog, + const struct bpf_insn *insn, bool extra_pass, + u64 *func_addr, bool *func_addr_fixed); + struct bpf_prog *bpf_jit_blind_constants(struct bpf_prog *fp); void bpf_jit_prog_release_other(struct bpf_prog *fp, struct bpf_prog *fp_other); diff --git 
a/kernel/bpf/core.c b/kernel/bpf/core.c index 1a796e0799ec..b1a3545d0ec8 100644 --- a/kernel/bpf/core.c +++ b/kernel/bpf/core.c @@ -672,6 +672,40 @@ void __weak bpf_jit_free(struct bpf_prog *fp) bpf_prog_unlock_free(fp); } +int bpf_jit_get_func_addr(const struct bpf_prog *prog, + const struct bpf_insn *insn, bool extra_pass, + u64 *func_addr, bool *func_addr_fixed) +{ + s16 off = insn->off; + s32 imm = insn->imm; + u8 *addr; + + *func_addr_fixed = insn->src_reg != BPF_PSEUDO_CALL; + if (!*func_addr_fixed) { + /* Place-holder address till the last pass has collected + * all addresses for JITed subprograms in which case we + * can pick them up from prog->aux. + */ + if (!extra_pass) + addr = NULL; + else if (prog->aux->func && + off >= 0 && off < prog->aux->func_cnt) + addr = (u8 *)prog->aux->func[off]->bpf_func; + else + return -EINVAL; + } else { + /* Address of a BPF helper call. Since part of the core + * kernel, it's always at a fixed location. __bpf_call_base + * and the helper with imm relative to it are both in core + * kernel. + */ + addr = (u8 *)__bpf_call_base + imm; + } + + *func_addr = (unsigned long)addr; + return 0; +} + static int bpf_jit_blind_insn(const struct bpf_insn *from, const struct bpf_insn *aux, struct bpf_insn *to_buff) -- cgit v1.2.3-71-gd317 From d125f3f866df88da5a85df00291f88f0baa89f7c Mon Sep 17 00:00:00 2001 From: "Steven Rostedt (VMware)" Date: Mon, 19 Nov 2018 07:40:39 -0500 Subject: function_graph: Make ftrace_push_return_trace() static As all architectures now call function_graph_enter() to do the entry work, no architecture should ever call ftrace_push_return_trace(). Make it static. This is needed to prepare for a fix of a design bug on how the curr_ret_stack is used. Cc: stable@kernel.org Fixes: 03274a3ffb449 ("tracing/fgraph: Adjust fgraph depth before calling trace return callback") Reviewed-by: Masami Hiramatsu Signed-off-by: Steven Rostedt (VMware) --- include/linux/ftrace.h | 3 --- kernel/trace/trace_functions_graph.c | 2 +- 2 files changed, 1 insertion(+), 4 deletions(-) (limited to 'include/linux') diff --git a/include/linux/ftrace.h b/include/linux/ftrace.h index 5717e8f81c59..dd16e8218db3 100644 --- a/include/linux/ftrace.h +++ b/include/linux/ftrace.h @@ -776,9 +776,6 @@ struct ftrace_ret_stack { */ extern void return_to_handler(void); -extern int -ftrace_push_return_trace(unsigned long ret, unsigned long func, int *depth, - unsigned long frame_pointer, unsigned long *retp); extern int function_graph_enter(unsigned long ret, unsigned long func, unsigned long frame_pointer, unsigned long *retp); diff --git a/kernel/trace/trace_functions_graph.c b/kernel/trace/trace_functions_graph.c index 28f2602435d0..88ca787a1cdc 100644 --- a/kernel/trace/trace_functions_graph.c +++ b/kernel/trace/trace_functions_graph.c @@ -118,7 +118,7 @@ print_graph_duration(struct trace_array *tr, unsigned long long duration, struct trace_seq *s, u32 flags); /* Add a function return address to the trace stack on thread info.*/ -int +static int ftrace_push_return_trace(unsigned long ret, unsigned long func, int *depth, unsigned long frame_pointer, unsigned long *retp) { -- cgit v1.2.3-71-gd317 From 39eb456dacb543de90d3bc6a8e0ac5cf51ac475e Mon Sep 17 00:00:00 2001 From: "Steven Rostedt (VMware)" Date: Mon, 19 Nov 2018 08:07:12 -0500 Subject: function_graph: Use new curr_ret_depth to manage depth instead of curr_ret_stack Currently, the depth of the ret_stack is determined by curr_ret_stack index. 
The issue is that there's a race between setting of the curr_ret_stack and calling of the callback attached to the return of the function. Commit 03274a3ffb44 ("tracing/fgraph: Adjust fgraph depth before calling trace return callback") moved the calling of the callback to after the setting of the curr_ret_stack, even stating that it was safe to do so, when in fact, it was the reason there was a barrier() there (yes, I should have commented that barrier()). Not only does the curr_ret_stack keep track of the current call graph depth, it also keeps the ret_stack content from being overwritten by new data. The function profiler, uses the "subtime" variable of ret_stack structure and by moving the curr_ret_stack, it allows for interrupts to use the same structure it was using, corrupting the data, and breaking the profiler. To fix this, there needs to be two variables to handle the call stack depth and the pointer to where the ret_stack is being used, as they need to change at two different locations. Cc: stable@kernel.org Fixes: 03274a3ffb449 ("tracing/fgraph: Adjust fgraph depth before calling trace return callback") Reviewed-by: Masami Hiramatsu Signed-off-by: Steven Rostedt (VMware) --- include/linux/sched.h | 1 + kernel/trace/ftrace.c | 3 +++ kernel/trace/trace_functions_graph.c | 21 +++++++++++++-------- 3 files changed, 17 insertions(+), 8 deletions(-) (limited to 'include/linux') diff --git a/include/linux/sched.h b/include/linux/sched.h index a51c13c2b1a0..d6183a55e8eb 100644 --- a/include/linux/sched.h +++ b/include/linux/sched.h @@ -1116,6 +1116,7 @@ struct task_struct { #ifdef CONFIG_FUNCTION_GRAPH_TRACER /* Index of current stored address in ret_stack: */ int curr_ret_stack; + int curr_ret_depth; /* Stack of return addresses for return function tracing: */ struct ftrace_ret_stack *ret_stack; diff --git a/kernel/trace/ftrace.c b/kernel/trace/ftrace.c index f536f601bd46..48513954713c 100644 --- a/kernel/trace/ftrace.c +++ b/kernel/trace/ftrace.c @@ -6814,6 +6814,7 @@ static int alloc_retstack_tasklist(struct ftrace_ret_stack **ret_stack_list) atomic_set(&t->tracing_graph_pause, 0); atomic_set(&t->trace_overrun, 0); t->curr_ret_stack = -1; + t->curr_ret_depth = -1; /* Make sure the tasks see the -1 first: */ smp_wmb(); t->ret_stack = ret_stack_list[start++]; @@ -7038,6 +7039,7 @@ graph_init_task(struct task_struct *t, struct ftrace_ret_stack *ret_stack) void ftrace_graph_init_idle_task(struct task_struct *t, int cpu) { t->curr_ret_stack = -1; + t->curr_ret_depth = -1; /* * The idle task has no parent, it either has its own * stack or no stack at all. 
@@ -7068,6 +7070,7 @@ void ftrace_graph_init_task(struct task_struct *t) /* Make sure we do not use the parent ret_stack */ t->ret_stack = NULL; t->curr_ret_stack = -1; + t->curr_ret_depth = -1; if (ftrace_graph_active) { struct ftrace_ret_stack *ret_stack; diff --git a/kernel/trace/trace_functions_graph.c b/kernel/trace/trace_functions_graph.c index 88ca787a1cdc..02d4081a7f5a 100644 --- a/kernel/trace/trace_functions_graph.c +++ b/kernel/trace/trace_functions_graph.c @@ -119,7 +119,7 @@ print_graph_duration(struct trace_array *tr, unsigned long long duration, /* Add a function return address to the trace stack on thread info.*/ static int -ftrace_push_return_trace(unsigned long ret, unsigned long func, int *depth, +ftrace_push_return_trace(unsigned long ret, unsigned long func, unsigned long frame_pointer, unsigned long *retp) { unsigned long long calltime; @@ -177,8 +177,6 @@ ftrace_push_return_trace(unsigned long ret, unsigned long func, int *depth, #ifdef HAVE_FUNCTION_GRAPH_RET_ADDR_PTR current->ret_stack[index].retp = retp; #endif - *depth = current->curr_ret_stack; - return 0; } @@ -188,14 +186,20 @@ int function_graph_enter(unsigned long ret, unsigned long func, struct ftrace_graph_ent trace; trace.func = func; - trace.depth = current->curr_ret_stack + 1; + trace.depth = ++current->curr_ret_depth; /* Only trace if the calling function expects to */ if (!ftrace_graph_entry(&trace)) - return -EBUSY; + goto out; - return ftrace_push_return_trace(ret, func, &trace.depth, - frame_pointer, retp); + if (ftrace_push_return_trace(ret, func, + frame_pointer, retp)) + goto out; + + return 0; + out: + current->curr_ret_depth--; + return -EBUSY; } /* Retrieve a function return address to the trace stack on thread info.*/ @@ -257,7 +261,7 @@ ftrace_pop_return_trace(struct ftrace_graph_ret *trace, unsigned long *ret, trace->func = current->ret_stack[index].func; trace->calltime = current->ret_stack[index].calltime; trace->overrun = atomic_read(¤t->trace_overrun); - trace->depth = index; + trace->depth = current->curr_ret_depth; } /* @@ -273,6 +277,7 @@ unsigned long ftrace_return_to_handler(unsigned long frame_pointer) trace.rettime = trace_clock_local(); barrier(); current->curr_ret_stack--; + current->curr_ret_depth--; /* * The curr_ret_stack can be less than -1 only if it was * filtered out and it's about to return from the function. -- cgit v1.2.3-71-gd317 From 321a874a7ef85655e93b3206d0f36b4a6097f948 Mon Sep 17 00:00:00 2001 From: Thomas Gleixner Date: Sun, 25 Nov 2018 19:33:38 +0100 Subject: sched/smt: Expose sched_smt_present static key Make the scheduler's 'sched_smt_present' static key globaly available, so it can be used in the x86 speculation control code. Provide a query function and a stub for the CONFIG_SMP=n case. 
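A minimal sketch of a consumer, assuming only the new header; enable_stibp() and disable_stibp() are hypothetical stand-ins (the concrete x86 user follows in the next patch):

#include <linux/sched/smt.h>

/* Gate an SMT-only mitigation on the scheduler's actual SMT state. */
static void toggle_cross_thread_mitigation(void)
{
	if (sched_smt_active())
		enable_stibp();		/* siblings may be running */
	else
		disable_stibp();	/* no sibling can run */
}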
Signed-off-by: Thomas Gleixner Reviewed-by: Ingo Molnar Cc: Peter Zijlstra Cc: Andy Lutomirski Cc: Linus Torvalds Cc: Jiri Kosina Cc: Tom Lendacky Cc: Josh Poimboeuf Cc: Andrea Arcangeli Cc: David Woodhouse Cc: Tim Chen Cc: Andi Kleen Cc: Dave Hansen Cc: Casey Schaufler Cc: Asit Mallick Cc: Arjan van de Ven Cc: Jon Masters Cc: Waiman Long Cc: Greg KH Cc: Dave Stewart Cc: Kees Cook Cc: stable@vger.kernel.org Link: https://lkml.kernel.org/r/20181125185004.430168326@linutronix.de --- include/linux/sched/smt.h | 18 ++++++++++++++++++ kernel/sched/sched.h | 4 +--- 2 files changed, 19 insertions(+), 3 deletions(-) create mode 100644 include/linux/sched/smt.h (limited to 'include/linux') diff --git a/include/linux/sched/smt.h b/include/linux/sched/smt.h new file mode 100644 index 000000000000..c9e0be514110 --- /dev/null +++ b/include/linux/sched/smt.h @@ -0,0 +1,18 @@ +/* SPDX-License-Identifier: GPL-2.0 */ +#ifndef _LINUX_SCHED_SMT_H +#define _LINUX_SCHED_SMT_H + +#include + +#ifdef CONFIG_SCHED_SMT +extern struct static_key_false sched_smt_present; + +static __always_inline bool sched_smt_active(void) +{ + return static_branch_likely(&sched_smt_present); +} +#else +static inline bool sched_smt_active(void) { return false; } +#endif + +#endif diff --git a/kernel/sched/sched.h b/kernel/sched/sched.h index 618577fc9aa8..4e524ab589c9 100644 --- a/kernel/sched/sched.h +++ b/kernel/sched/sched.h @@ -23,6 +23,7 @@ #include #include #include +#include #include #include #include @@ -936,9 +937,6 @@ static inline int cpu_of(struct rq *rq) #ifdef CONFIG_SCHED_SMT - -extern struct static_key_false sched_smt_present; - extern void __update_idle_core(struct rq *rq); static inline void update_idle_core(struct rq *rq) -- cgit v1.2.3-71-gd317 From a74cfffb03b73d41e08f84c2e5c87dec0ce3db9f Mon Sep 17 00:00:00 2001 From: Thomas Gleixner Date: Sun, 25 Nov 2018 19:33:39 +0100 Subject: x86/speculation: Rework SMT state change arch_smt_update() is only called when the sysfs SMT control knob is changed. This means that when SMT is enabled in the sysfs control knob the system is considered to have SMT active even if all siblings are offline. To allow finegrained control of the speculation mitigations, the actual SMT state is more interesting than the fact that siblings could be enabled. Rework the code, so arch_smt_update() is invoked from each individual CPU hotplug function, and simplify the update function while at it. 
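The rework leans on the usual weak-default/arch-override pattern; a hedged sketch follows (the x86 body is simplified from the diff below, not the literal implementation):

/* Core (kernel/cpu.c): invoked from every hotplug up/down path. */
void __weak arch_smt_update(void) { }

/* x86 override: recompute STIBP from the real SMT state. */
void arch_smt_update(void)
{
	u64 mask = x86_spec_ctrl_base & ~SPEC_CTRL_STIBP;

	if (sched_smt_active())
		mask |= SPEC_CTRL_STIBP;
	/* ... write back and propagate to all CPUs if the mask changed. */
}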
Signed-off-by: Thomas Gleixner Reviewed-by: Ingo Molnar Cc: Peter Zijlstra Cc: Andy Lutomirski Cc: Linus Torvalds Cc: Jiri Kosina Cc: Tom Lendacky Cc: Josh Poimboeuf Cc: Andrea Arcangeli Cc: David Woodhouse Cc: Tim Chen Cc: Andi Kleen Cc: Dave Hansen Cc: Casey Schaufler Cc: Asit Mallick Cc: Arjan van de Ven Cc: Jon Masters Cc: Waiman Long Cc: Greg KH Cc: Dave Stewart Cc: Kees Cook Cc: stable@vger.kernel.org Link: https://lkml.kernel.org/r/20181125185004.521974984@linutronix.de --- arch/x86/kernel/cpu/bugs.c | 11 +++++------ include/linux/sched/smt.h | 2 ++ kernel/cpu.c | 15 +++++++++------ 3 files changed, 16 insertions(+), 12 deletions(-) (limited to 'include/linux') diff --git a/arch/x86/kernel/cpu/bugs.c b/arch/x86/kernel/cpu/bugs.c index a723af0c4400..5625b323ff32 100644 --- a/arch/x86/kernel/cpu/bugs.c +++ b/arch/x86/kernel/cpu/bugs.c @@ -14,6 +14,7 @@ #include #include #include +#include #include #include @@ -344,16 +345,14 @@ void arch_smt_update(void) return; mutex_lock(&spec_ctrl_mutex); - mask = x86_spec_ctrl_base; - if (cpu_smt_control == CPU_SMT_ENABLED) + + mask = x86_spec_ctrl_base & ~SPEC_CTRL_STIBP; + if (sched_smt_active()) mask |= SPEC_CTRL_STIBP; - else - mask &= ~SPEC_CTRL_STIBP; if (mask != x86_spec_ctrl_base) { pr_info("Spectre v2 cross-process SMT mitigation: %s STIBP\n", - cpu_smt_control == CPU_SMT_ENABLED ? - "Enabling" : "Disabling"); + mask & SPEC_CTRL_STIBP ? "Enabling" : "Disabling"); x86_spec_ctrl_base = mask; on_each_cpu(update_stibp_msr, NULL, 1); } diff --git a/include/linux/sched/smt.h b/include/linux/sched/smt.h index c9e0be514110..59d3736c454c 100644 --- a/include/linux/sched/smt.h +++ b/include/linux/sched/smt.h @@ -15,4 +15,6 @@ static __always_inline bool sched_smt_active(void) static inline bool sched_smt_active(void) { return false; } #endif +void arch_smt_update(void); + #endif diff --git a/kernel/cpu.c b/kernel/cpu.c index 3c7f3b4c453c..91d5c38eb7e5 100644 --- a/kernel/cpu.c +++ b/kernel/cpu.c @@ -10,6 +10,7 @@ #include #include #include +#include #include #include #include @@ -367,6 +368,12 @@ static void lockdep_release_cpus_lock(void) #endif /* CONFIG_HOTPLUG_CPU */ +/* + * Architectures that need SMT-specific errata handling during SMT hotplug + * should override this. + */ +void __weak arch_smt_update(void) { } + #ifdef CONFIG_HOTPLUG_SMT enum cpuhp_smt_control cpu_smt_control __read_mostly = CPU_SMT_ENABLED; EXPORT_SYMBOL_GPL(cpu_smt_control); @@ -1011,6 +1018,7 @@ out: * concurrent CPU hotplug via cpu_add_remove_lock. */ lockup_detector_cleanup(); + arch_smt_update(); return ret; } @@ -1139,6 +1147,7 @@ static int _cpu_up(unsigned int cpu, int tasks_frozen, enum cpuhp_state target) ret = cpuhp_up_callbacks(cpu, st, target); out: cpus_write_unlock(); + arch_smt_update(); return ret; } @@ -2055,12 +2064,6 @@ static void cpuhp_online_cpu_device(unsigned int cpu) kobject_uevent(&dev->kobj, KOBJ_ONLINE); } -/* - * Architectures that need SMT-specific errata handling during SMT hotplug - * should override this. - */ -void __weak arch_smt_update(void) { }; - static int cpuhp_smt_disable(enum cpuhp_smt_control ctrlval) { int cpu, ret = 0; -- cgit v1.2.3-71-gd317 From 46f7ecb1e7359f183f5bbd1e08b90e10e52164f9 Mon Sep 17 00:00:00 2001 From: Thomas Gleixner Date: Sun, 25 Nov 2018 19:33:50 +0100 Subject: ptrace: Remove unused ptrace_may_access_sched() and MODE_IBRS The IBPB control code in x86 removed the usage. Remove the functionality which was introduced for this. 
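With the _SCHED variant gone, ptrace_may_access() remains the single entry point; a minimal sketch of a caller:

/* Check whether current may attach to @task with full credentials. */
if (!ptrace_may_access(task, PTRACE_MODE_ATTACH_REALCREDS))
	return -EPERM;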
Signed-off-by: Thomas Gleixner Reviewed-by: Ingo Molnar Cc: Peter Zijlstra Cc: Andy Lutomirski Cc: Linus Torvalds Cc: Jiri Kosina Cc: Tom Lendacky Cc: Josh Poimboeuf Cc: Andrea Arcangeli Cc: David Woodhouse Cc: Tim Chen Cc: Andi Kleen Cc: Dave Hansen Cc: Casey Schaufler Cc: Asit Mallick Cc: Arjan van de Ven Cc: Jon Masters Cc: Waiman Long Cc: Greg KH Cc: Dave Stewart Cc: Kees Cook Cc: stable@vger.kernel.org Link: https://lkml.kernel.org/r/20181125185005.559149393@linutronix.de --- include/linux/ptrace.h | 17 ----------------- kernel/ptrace.c | 10 ---------- 2 files changed, 27 deletions(-) (limited to 'include/linux') diff --git a/include/linux/ptrace.h b/include/linux/ptrace.h index 6c2ffed907f5..de20ede2c5c8 100644 --- a/include/linux/ptrace.h +++ b/include/linux/ptrace.h @@ -64,15 +64,12 @@ extern void exit_ptrace(struct task_struct *tracer, struct list_head *dead); #define PTRACE_MODE_NOAUDIT 0x04 #define PTRACE_MODE_FSCREDS 0x08 #define PTRACE_MODE_REALCREDS 0x10 -#define PTRACE_MODE_SCHED 0x20 -#define PTRACE_MODE_IBPB 0x40 /* shorthands for READ/ATTACH and FSCREDS/REALCREDS combinations */ #define PTRACE_MODE_READ_FSCREDS (PTRACE_MODE_READ | PTRACE_MODE_FSCREDS) #define PTRACE_MODE_READ_REALCREDS (PTRACE_MODE_READ | PTRACE_MODE_REALCREDS) #define PTRACE_MODE_ATTACH_FSCREDS (PTRACE_MODE_ATTACH | PTRACE_MODE_FSCREDS) #define PTRACE_MODE_ATTACH_REALCREDS (PTRACE_MODE_ATTACH | PTRACE_MODE_REALCREDS) -#define PTRACE_MODE_SPEC_IBPB (PTRACE_MODE_ATTACH_REALCREDS | PTRACE_MODE_IBPB) /** * ptrace_may_access - check whether the caller is permitted to access @@ -90,20 +87,6 @@ extern void exit_ptrace(struct task_struct *tracer, struct list_head *dead); */ extern bool ptrace_may_access(struct task_struct *task, unsigned int mode); -/** - * ptrace_may_access - check whether the caller is permitted to access - * a target task. - * @task: target task - * @mode: selects type of access and caller credentials - * - * Returns true on success, false on denial. - * - * Similar to ptrace_may_access(). Only to be called from context switch - * code. Does not call into audit and the regular LSM hooks due to locking - * constraints. 
- */ -extern bool ptrace_may_access_sched(struct task_struct *task, unsigned int mode); - static inline int ptrace_reparented(struct task_struct *child) { return !same_thread_group(child->real_parent, child->parent); diff --git a/kernel/ptrace.c b/kernel/ptrace.c index 80b34dffdfb9..c2cee9db5204 100644 --- a/kernel/ptrace.c +++ b/kernel/ptrace.c @@ -261,9 +261,6 @@ static int ptrace_check_attach(struct task_struct *child, bool ignore_state) static int ptrace_has_cap(struct user_namespace *ns, unsigned int mode) { - if (mode & PTRACE_MODE_SCHED) - return false; - if (mode & PTRACE_MODE_NOAUDIT) return has_ns_capability_noaudit(current, ns, CAP_SYS_PTRACE); else @@ -331,16 +328,9 @@ ok: !ptrace_has_cap(mm->user_ns, mode))) return -EPERM; - if (mode & PTRACE_MODE_SCHED) - return 0; return security_ptrace_access_check(task, mode); } -bool ptrace_may_access_sched(struct task_struct *task, unsigned int mode) -{ - return __ptrace_may_access(task, mode | PTRACE_MODE_SCHED); -} - bool ptrace_may_access(struct task_struct *task, unsigned int mode) { int err; -- cgit v1.2.3-71-gd317 From 9137bb27e60e554dab694eafa4cca241fa3a694f Mon Sep 17 00:00:00 2001 From: Thomas Gleixner Date: Sun, 25 Nov 2018 19:33:53 +0100 Subject: x86/speculation: Add prctl() control for indirect branch speculation Add the PR_SPEC_INDIRECT_BRANCH option for the PR_GET_SPECULATION_CTRL and PR_SET_SPECULATION_CTRL prctls to allow fine grained per task control of indirect branch speculation via STIBP and IBPB. Invocations: Check indirect branch speculation status with - prctl(PR_GET_SPECULATION_CTRL, PR_SPEC_INDIRECT_BRANCH, 0, 0, 0); Enable indirect branch speculation with - prctl(PR_SET_SPECULATION_CTRL, PR_SPEC_INDIRECT_BRANCH, PR_SPEC_ENABLE, 0, 0); Disable indirect branch speculation with - prctl(PR_SET_SPECULATION_CTRL, PR_SPEC_INDIRECT_BRANCH, PR_SPEC_DISABLE, 0, 0); Force disable indirect branch speculation with - prctl(PR_SET_SPECULATION_CTRL, PR_SPEC_INDIRECT_BRANCH, PR_SPEC_FORCE_DISABLE, 0, 0); See Documentation/userspace-api/spec_ctrl.rst. 
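A hedged userspace sketch of the new interface; the fallback defines cover pre-patch uapi headers and match the values added by the patch below:

#include <stdio.h>
#include <sys/prctl.h>

#ifndef PR_SPEC_INDIRECT_BRANCH
# define PR_GET_SPECULATION_CTRL  52
# define PR_SET_SPECULATION_CTRL  53
# define PR_SPEC_INDIRECT_BRANCH  1
# define PR_SPEC_ENABLE           (1UL << 1)
# define PR_SPEC_DISABLE          (1UL << 2)
#endif

int main(void)
{
	long state = prctl(PR_GET_SPECULATION_CTRL,
			   PR_SPEC_INDIRECT_BRANCH, 0, 0, 0);

	if (state < 0) {
		perror("PR_GET_SPECULATION_CTRL");	/* e.g. ENODEV on old kernels */
		return 1;
	}
	printf("indirect branch speculation state: 0x%lx\n", state);

	/* Restrict indirect branch speculation for this task and its children. */
	if (prctl(PR_SET_SPECULATION_CTRL, PR_SPEC_INDIRECT_BRANCH,
		  PR_SPEC_DISABLE, 0, 0) < 0)
		perror("PR_SET_SPECULATION_CTRL");
	return 0;
}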
Signed-off-by: Tim Chen Signed-off-by: Thomas Gleixner Reviewed-by: Ingo Molnar Cc: Peter Zijlstra Cc: Andy Lutomirski Cc: Linus Torvalds Cc: Jiri Kosina Cc: Tom Lendacky Cc: Josh Poimboeuf Cc: Andrea Arcangeli Cc: David Woodhouse Cc: Andi Kleen Cc: Dave Hansen Cc: Casey Schaufler Cc: Asit Mallick Cc: Arjan van de Ven Cc: Jon Masters Cc: Waiman Long Cc: Greg KH Cc: Dave Stewart Cc: Kees Cook Cc: stable@vger.kernel.org Link: https://lkml.kernel.org/r/20181125185005.866780996@linutronix.de --- Documentation/userspace-api/spec_ctrl.rst | 9 +++++ arch/x86/include/asm/nospec-branch.h | 1 + arch/x86/kernel/cpu/bugs.c | 67 +++++++++++++++++++++++++++++++ arch/x86/kernel/process.c | 5 +++ include/linux/sched.h | 9 +++++ include/uapi/linux/prctl.h | 1 + tools/include/uapi/linux/prctl.h | 1 + 7 files changed, 93 insertions(+) (limited to 'include/linux') diff --git a/Documentation/userspace-api/spec_ctrl.rst b/Documentation/userspace-api/spec_ctrl.rst index 32f3d55c54b7..c4dbe6f7cdae 100644 --- a/Documentation/userspace-api/spec_ctrl.rst +++ b/Documentation/userspace-api/spec_ctrl.rst @@ -92,3 +92,12 @@ Speculation misfeature controls * prctl(PR_SET_SPECULATION_CTRL, PR_SPEC_STORE_BYPASS, PR_SPEC_ENABLE, 0, 0); * prctl(PR_SET_SPECULATION_CTRL, PR_SPEC_STORE_BYPASS, PR_SPEC_DISABLE, 0, 0); * prctl(PR_SET_SPECULATION_CTRL, PR_SPEC_STORE_BYPASS, PR_SPEC_FORCE_DISABLE, 0, 0); + +- PR_SPEC_INDIR_BRANCH: Indirect Branch Speculation in User Processes + (Mitigate Spectre V2 style attacks against user processes) + + Invocations: + * prctl(PR_GET_SPECULATION_CTRL, PR_SPEC_INDIRECT_BRANCH, 0, 0, 0); + * prctl(PR_SET_SPECULATION_CTRL, PR_SPEC_INDIRECT_BRANCH, PR_SPEC_ENABLE, 0, 0); + * prctl(PR_SET_SPECULATION_CTRL, PR_SPEC_INDIRECT_BRANCH, PR_SPEC_DISABLE, 0, 0); + * prctl(PR_SET_SPECULATION_CTRL, PR_SPEC_INDIRECT_BRANCH, PR_SPEC_FORCE_DISABLE, 0, 0); diff --git a/arch/x86/include/asm/nospec-branch.h b/arch/x86/include/asm/nospec-branch.h index d4d35baf0430..2adbe7b047fa 100644 --- a/arch/x86/include/asm/nospec-branch.h +++ b/arch/x86/include/asm/nospec-branch.h @@ -232,6 +232,7 @@ enum spectre_v2_mitigation { enum spectre_v2_user_mitigation { SPECTRE_V2_USER_NONE, SPECTRE_V2_USER_STRICT, + SPECTRE_V2_USER_PRCTL, }; /* The Speculative Store Bypass disable variants */ diff --git a/arch/x86/kernel/cpu/bugs.c b/arch/x86/kernel/cpu/bugs.c index 9cab538e10f1..74359fff87fd 100644 --- a/arch/x86/kernel/cpu/bugs.c +++ b/arch/x86/kernel/cpu/bugs.c @@ -566,6 +566,8 @@ void arch_smt_update(void) case SPECTRE_V2_USER_STRICT: update_stibp_strict(); break; + case SPECTRE_V2_USER_PRCTL: + break; } mutex_unlock(&spec_ctrl_mutex); @@ -752,12 +754,50 @@ static int ssb_prctl_set(struct task_struct *task, unsigned long ctrl) return 0; } +static int ib_prctl_set(struct task_struct *task, unsigned long ctrl) +{ + switch (ctrl) { + case PR_SPEC_ENABLE: + if (spectre_v2_user == SPECTRE_V2_USER_NONE) + return 0; + /* + * Indirect branch speculation is always disabled in strict + * mode. + */ + if (spectre_v2_user == SPECTRE_V2_USER_STRICT) + return -EPERM; + task_clear_spec_ib_disable(task); + task_update_spec_tif(task); + break; + case PR_SPEC_DISABLE: + case PR_SPEC_FORCE_DISABLE: + /* + * Indirect branch speculation is always allowed when + * mitigation is force disabled. 
+ */ + if (spectre_v2_user == SPECTRE_V2_USER_NONE) + return -EPERM; + if (spectre_v2_user == SPECTRE_V2_USER_STRICT) + return 0; + task_set_spec_ib_disable(task); + if (ctrl == PR_SPEC_FORCE_DISABLE) + task_set_spec_ib_force_disable(task); + task_update_spec_tif(task); + break; + default: + return -ERANGE; + } + return 0; +} + int arch_prctl_spec_ctrl_set(struct task_struct *task, unsigned long which, unsigned long ctrl) { switch (which) { case PR_SPEC_STORE_BYPASS: return ssb_prctl_set(task, ctrl); + case PR_SPEC_INDIRECT_BRANCH: + return ib_prctl_set(task, ctrl); default: return -ENODEV; } @@ -790,11 +830,34 @@ static int ssb_prctl_get(struct task_struct *task) } } +static int ib_prctl_get(struct task_struct *task) +{ + if (!boot_cpu_has_bug(X86_BUG_SPECTRE_V2)) + return PR_SPEC_NOT_AFFECTED; + + switch (spectre_v2_user) { + case SPECTRE_V2_USER_NONE: + return PR_SPEC_ENABLE; + case SPECTRE_V2_USER_PRCTL: + if (task_spec_ib_force_disable(task)) + return PR_SPEC_PRCTL | PR_SPEC_FORCE_DISABLE; + if (task_spec_ib_disable(task)) + return PR_SPEC_PRCTL | PR_SPEC_DISABLE; + return PR_SPEC_PRCTL | PR_SPEC_ENABLE; + case SPECTRE_V2_USER_STRICT: + return PR_SPEC_DISABLE; + default: + return PR_SPEC_NOT_AFFECTED; + } +} + int arch_prctl_spec_ctrl_get(struct task_struct *task, unsigned long which) { switch (which) { case PR_SPEC_STORE_BYPASS: return ssb_prctl_get(task); + case PR_SPEC_INDIRECT_BRANCH: + return ib_prctl_get(task); default: return -ENODEV; } @@ -974,6 +1037,8 @@ static char *stibp_state(void) return ", STIBP: disabled"; case SPECTRE_V2_USER_STRICT: return ", STIBP: forced"; + case SPECTRE_V2_USER_PRCTL: + return ""; } return ""; } @@ -986,6 +1051,8 @@ static char *ibpb_state(void) return ", IBPB: disabled"; case SPECTRE_V2_USER_STRICT: return ", IBPB: always-on"; + case SPECTRE_V2_USER_PRCTL: + return ""; } } return ""; diff --git a/arch/x86/kernel/process.c b/arch/x86/kernel/process.c index afbe2eb4a1c6..7d31192296a8 100644 --- a/arch/x86/kernel/process.c +++ b/arch/x86/kernel/process.c @@ -450,6 +450,11 @@ static unsigned long speculation_ctrl_update_tif(struct task_struct *tsk) set_tsk_thread_flag(tsk, TIF_SSBD); else clear_tsk_thread_flag(tsk, TIF_SSBD); + + if (task_spec_ib_disable(tsk)) + set_tsk_thread_flag(tsk, TIF_SPEC_IB); + else + clear_tsk_thread_flag(tsk, TIF_SPEC_IB); } /* Return the updated threadinfo flags*/ return task_thread_info(tsk)->flags; diff --git a/include/linux/sched.h b/include/linux/sched.h index a51c13c2b1a0..d607db5fcc6a 100644 --- a/include/linux/sched.h +++ b/include/linux/sched.h @@ -1453,6 +1453,8 @@ static inline bool is_percpu_thread(void) #define PFA_SPREAD_SLAB 2 /* Spread some slab caches over cpuset */ #define PFA_SPEC_SSB_DISABLE 3 /* Speculative Store Bypass disabled */ #define PFA_SPEC_SSB_FORCE_DISABLE 4 /* Speculative Store Bypass force disabled*/ +#define PFA_SPEC_IB_DISABLE 5 /* Indirect branch speculation restricted */ +#define PFA_SPEC_IB_FORCE_DISABLE 6 /* Indirect branch speculation permanently restricted */ #define TASK_PFA_TEST(name, func) \ static inline bool task_##func(struct task_struct *p) \ @@ -1484,6 +1486,13 @@ TASK_PFA_CLEAR(SPEC_SSB_DISABLE, spec_ssb_disable) TASK_PFA_TEST(SPEC_SSB_FORCE_DISABLE, spec_ssb_force_disable) TASK_PFA_SET(SPEC_SSB_FORCE_DISABLE, spec_ssb_force_disable) +TASK_PFA_TEST(SPEC_IB_DISABLE, spec_ib_disable) +TASK_PFA_SET(SPEC_IB_DISABLE, spec_ib_disable) +TASK_PFA_CLEAR(SPEC_IB_DISABLE, spec_ib_disable) + +TASK_PFA_TEST(SPEC_IB_FORCE_DISABLE, spec_ib_force_disable) 
+TASK_PFA_SET(SPEC_IB_FORCE_DISABLE, spec_ib_force_disable) + static inline void current_restore_flags(unsigned long orig_flags, unsigned long flags) { diff --git a/include/uapi/linux/prctl.h b/include/uapi/linux/prctl.h index c0d7ea0bf5b6..b17201edfa09 100644 --- a/include/uapi/linux/prctl.h +++ b/include/uapi/linux/prctl.h @@ -212,6 +212,7 @@ struct prctl_mm_map { #define PR_SET_SPECULATION_CTRL 53 /* Speculation control variants */ # define PR_SPEC_STORE_BYPASS 0 +# define PR_SPEC_INDIRECT_BRANCH 1 /* Return and control values for PR_SET/GET_SPECULATION_CTRL */ # define PR_SPEC_NOT_AFFECTED 0 # define PR_SPEC_PRCTL (1UL << 0) diff --git a/tools/include/uapi/linux/prctl.h b/tools/include/uapi/linux/prctl.h index c0d7ea0bf5b6..b17201edfa09 100644 --- a/tools/include/uapi/linux/prctl.h +++ b/tools/include/uapi/linux/prctl.h @@ -212,6 +212,7 @@ struct prctl_mm_map { #define PR_SET_SPECULATION_CTRL 53 /* Speculation control variants */ # define PR_SPEC_STORE_BYPASS 0 +# define PR_SPEC_INDIRECT_BRANCH 1 /* Return and control values for PR_SET/GET_SPECULATION_CTRL */ # define PR_SPEC_NOT_AFFECTED 0 # define PR_SPEC_PRCTL (1UL << 0) -- cgit v1.2.3-71-gd317 From 3f2b7b9035107d6096ea438ea3d97dcf0481b6d2 Mon Sep 17 00:00:00 2001 From: "kiran.modukuri" Date: Mon, 26 Nov 2018 15:41:48 +0000 Subject: fscache: Fix race in fscache_op_complete() due to split atomic_sub & read The code in fscache_retrieval_complete is using atomic_sub followed by an atomic_read: atomic_sub(n_pages, &op->n_pages); if (atomic_read(&op->n_pages) <= 0) fscache_op_complete(&op->op, true); This causes two threads doing a decrement of n_pages to race with each other seeing the op->refcount 0 at same time - and they end up calling fscache_op_complete() in both the threads leading to an assertion failure. Fix this by using atomic_sub_return_relaxed() instead of two calls. Note that I'm using 'relaxed' rather than, say, 'release' as there aren't multiple variables that appear to need ordering across the release. The oops looks something like: FS-Cache: Assertion failed FS-Cache: 0 > 0 is false ... kernel BUG at /usr/src/linux-4.4.0/fs/fscache/operation.c:449! ... Workqueue: fscache_operation fscache_op_work_func [fscache] ... RIP: 0010:[] fscache_op_complete+0x10d/0x180 [fscache] ... Call Trace: [] cachefiles_read_copier+0x3a9/0x410 [cachefiles] [] fscache_op_work_func+0x22/0x50 [fscache] [] process_one_work+0x150/0x3f0 [] worker_thread+0x11a/0x470 [] ? __schedule+0x359/0x980 [] ? rescuer_thread+0x310/0x310 [] kthread+0xd6/0xf0 [] ? kthread_park+0x60/0x60 [] ret_from_fork+0x3f/0x70 [] ? kthread_park+0x60/0x60 This seen this in 4.4.x kernels and the same bug affects fscache in latest upstreams kernels. 
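To make the race concrete, a sketch of the interleaving with n_pages initially 2 and two threads each completing one page:

/* Racy (split sub + read): both threads can observe the final value. */
atomic_sub(1, &op->n_pages);		/* T1: 2 -> 1;  T2: 1 -> 0	*/
if (atomic_read(&op->n_pages) <= 0)	/* T1 and T2 can both read 0	*/
	fscache_op_complete(&op->op, false);	/* double completion	*/

/* Fixed: each thread tests only its own decrement result, so exactly
 * one of them sees the counter cross to zero. */
if (atomic_sub_return_relaxed(1, &op->n_pages) <= 0)
	fscache_op_complete(&op->op, false);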
Fixes: 1bb4b7f98f36 ("FS-Cache: The retrieval remaining-pages counter needs to be atomic_t") Signed-off-by: Kiran Kumar Modukuri Signed-off-by: David Howells --- include/linux/fscache-cache.h | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) (limited to 'include/linux') diff --git a/include/linux/fscache-cache.h b/include/linux/fscache-cache.h index 34cf0fdd7dc7..610815e3f1aa 100644 --- a/include/linux/fscache-cache.h +++ b/include/linux/fscache-cache.h @@ -196,8 +196,7 @@ static inline void fscache_enqueue_retrieval(struct fscache_retrieval *op) static inline void fscache_retrieval_complete(struct fscache_retrieval *op, int n_pages) { - atomic_sub(n_pages, &op->n_pages); - if (atomic_read(&op->n_pages) <= 0) + if (atomic_sub_return_relaxed(n_pages, &op->n_pages) <= 0) fscache_op_complete(&op->op, false); } -- cgit v1.2.3-71-gd317 From 90230968f102acbe103fbf7c03d41addfef5f153 Mon Sep 17 00:00:00 2001 From: Baruch Siach Date: Thu, 29 Nov 2018 12:00:05 +0200 Subject: net: phy: sfp: correct location of SFP standards SFP standards are now available from the SNIA (Storage Networking Industry Association) website. Cc: Andrew Lunn Cc: Florian Fainelli Signed-off-by: Baruch Siach Reviewed-by: Andrew Lunn Signed-off-by: David S. Miller --- include/linux/sfp.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'include/linux') diff --git a/include/linux/sfp.h b/include/linux/sfp.h index d37518e89db2..d9d9de3fcf8e 100644 --- a/include/linux/sfp.h +++ b/include/linux/sfp.h @@ -224,7 +224,7 @@ struct sfp_eeprom_ext { * * See the SFF-8472 specification and related documents for the definition * of these structure members. This can be obtained from - * ftp://ftp.seagate.com/sff + * https://www.snia.org/technology-communities/sff/specifications */ struct sfp_eeprom_id { struct sfp_eeprom_base base; -- cgit v1.2.3-71-gd317 From 89d328f637b9904b6d4c9af73c8a608b8dd4d6f8 Mon Sep 17 00:00:00 2001 From: Kees Cook Date: Thu, 1 Nov 2018 16:17:22 -0700 Subject: pstore/ram: Correctly calculate usable PRZ bytes The actual number of bytes stored in a PRZ is smaller than the bytes requested by platform data, since there is a header on each PRZ. Additionally, if ECC is enabled, there are trailing bytes used as well. Normally this mismatch doesn't matter since PRZs are circular buffers and the leading "overflow" bytes are just thrown away. However, in the case of a compressed record, this rather badly corrupts the results. This corruption was visible with "ramoops.mem_size=204800 ramoops.ecc=1". Any stored crashes would not be uncompressable (producing a pstorefs "dmesg-*.enc.z" file), and triggering errors at boot: [ 2.790759] pstore: crypto_comp_decompress failed, ret = -22! Backporting this depends on commit 70ad35db3321 ("pstore: Convert console write to use ->write_buf") Reported-by: Joel Fernandes Fixes: b0aad7a99c1d ("pstore: Add compression support to pstore") Signed-off-by: Kees Cook Reviewed-by: Joel Fernandes (Google) --- fs/pstore/ram.c | 15 ++++++--------- include/linux/pstore.h | 5 ++++- 2 files changed, 10 insertions(+), 10 deletions(-) (limited to 'include/linux') diff --git a/fs/pstore/ram.c b/fs/pstore/ram.c index 712960e117fe..8646fe6e916f 100644 --- a/fs/pstore/ram.c +++ b/fs/pstore/ram.c @@ -816,17 +816,14 @@ static int ramoops_probe(struct platform_device *pdev) cxt->pstore.data = cxt; /* - * Console can handle any buffer size, so prefer LOG_LINE_MAX. If we - * have to handle dumps, we must have at least record_size buffer. 
And - * for ftrace, bufsize is irrelevant (if bufsize is 0, buf will be - * ZERO_SIZE_PTR). + * Since bufsize is only used for dmesg crash dumps, it + * must match the size of the dprz record (after PRZ header + * and ECC bytes have been accounted for). */ - if (cxt->console_size) - cxt->pstore.bufsize = 1024; /* LOG_LINE_MAX */ - cxt->pstore.bufsize = max(cxt->record_size, cxt->pstore.bufsize); - cxt->pstore.buf = kmalloc(cxt->pstore.bufsize, GFP_KERNEL); + cxt->pstore.bufsize = cxt->dprzs[0]->buffer_size; + cxt->pstore.buf = kzalloc(cxt->pstore.bufsize, GFP_KERNEL); if (!cxt->pstore.buf) { - pr_err("cannot allocate pstore buffer\n"); + pr_err("cannot allocate pstore crash dump buffer\n"); err = -ENOMEM; goto fail_clear; } diff --git a/include/linux/pstore.h b/include/linux/pstore.h index a15bc4d48752..30fcec375a3a 100644 --- a/include/linux/pstore.h +++ b/include/linux/pstore.h @@ -90,7 +90,10 @@ struct pstore_record { * * @buf_lock: spinlock to serialize access to @buf * @buf: preallocated crash dump buffer - * @bufsize: size of @buf available for crash dump writes + * @bufsize: size of @buf available for crash dump bytes (must match + * smallest number of bytes available for writing to a + * backend entry, since compressed bytes don't take kindly + * to being truncated) * * @read_mutex: serializes @open, @read, @close, and @erase callbacks * @flags: bitfield of frontends the backend can accept writes for -- cgit v1.2.3-71-gd317 From 0c7a52e4d4b5c4d35b31f3c3ad32af814f1bf491 Mon Sep 17 00:00:00 2001 From: Zenghui Yu Date: Wed, 28 Nov 2018 03:35:23 +0000 Subject: tracepoint: Use __idx instead of idx in DO_TRACE macro to make it unique After enabling KVM event tracing, almost all of trace_kvm_exit()'s printk shows "kvm_exit: IRQ: ..." even if the actual exception_type is NOT IRQ. More specifically, trace_kvm_exit() is defined in virt/kvm/arm/trace.h by TRACE_EVENT. This slight problem may have existed after commit e6753f23d961 ("tracepoint: Make rcuidle tracepoint callers use SRCU"). There are two variables in trace_kvm_exit() and __DO_TRACE() which have the same name, *idx*. Thus the actual value of *idx* will be overwritten when tracing. Fix it by adding a simple prefix. Cc: Joel Fernandes Cc: Wang Haibin Cc: linux-trace-devel@vger.kernel.org Cc: stable@vger.kernel.org Fixes: e6753f23d961 ("tracepoint: Make rcuidle tracepoint callers use SRCU") Reviewed-by: Joel Fernandes (Google) Signed-off-by: Zenghui Yu Signed-off-by: Steven Rostedt (VMware) --- include/linux/tracepoint.h | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) (limited to 'include/linux') diff --git a/include/linux/tracepoint.h b/include/linux/tracepoint.h index 538ba1a58f5b..e9de8ad0bad7 100644 --- a/include/linux/tracepoint.h +++ b/include/linux/tracepoint.h @@ -166,7 +166,7 @@ static inline struct tracepoint *tracepoint_ptr_deref(tracepoint_ptr_t *p) struct tracepoint_func *it_func_ptr; \ void *it_func; \ void *__data; \ - int __maybe_unused idx = 0; \ + int __maybe_unused __idx = 0; \ \ if (!(cond)) \ return; \ @@ -182,7 +182,7 @@ static inline struct tracepoint *tracepoint_ptr_deref(tracepoint_ptr_t *p) * doesn't work from the idle path. 
\ */ \ if (rcuidle) { \ - idx = srcu_read_lock_notrace(&tracepoint_srcu); \ + __idx = srcu_read_lock_notrace(&tracepoint_srcu);\ rcu_irq_enter_irqson(); \ } \ \ @@ -198,7 +198,7 @@ static inline struct tracepoint *tracepoint_ptr_deref(tracepoint_ptr_t *p) \ if (rcuidle) { \ rcu_irq_exit_irqson(); \ - srcu_read_unlock_notrace(&tracepoint_srcu, idx);\ + srcu_read_unlock_notrace(&tracepoint_srcu, __idx);\ } \ \ preempt_enable_notrace(); \ -- cgit v1.2.3-71-gd317 From e0c274472d5d27f277af722e017525e0b33784cd Mon Sep 17 00:00:00 2001 From: Johannes Weiner Date: Fri, 30 Nov 2018 14:09:58 -0800 Subject: psi: make disabling/enabling easier for vendor kernels Mel Gorman reports a hackbench regression with psi that would prohibit shipping the suse kernel with it default-enabled, but he'd still like users to be able to opt in at little to no cost to others. With the current combination of CONFIG_PSI and the psi_disabled bool set from the commandline, this is a challenge. Do the following things to make it easier: 1. Add a config option CONFIG_PSI_DEFAULT_DISABLED that allows distros to enable CONFIG_PSI in their kernel but leave the feature disabled unless a user requests it at boot-time. To avoid double negatives, rename psi_disabled= to psi=. 2. Make psi_disabled a static branch to eliminate any branch costs when the feature is disabled. In terms of numbers before and after this patch, Mel says: : The following is a comparision using CONFIG_PSI=n as a baseline against : your patch and a vanilla kernel : : 4.20.0-rc4 4.20.0-rc4 4.20.0-rc4 : kconfigdisable-v1r1 vanilla psidisable-v1r1 : Amean 1 1.3100 ( 0.00%) 1.3923 ( -6.28%) 1.3427 ( -2.49%) : Amean 3 3.8860 ( 0.00%) 4.1230 * -6.10%* 3.8860 ( -0.00%) : Amean 5 6.8847 ( 0.00%) 8.0390 * -16.77%* 6.7727 ( 1.63%) : Amean 7 9.9310 ( 0.00%) 10.8367 * -9.12%* 9.9910 ( -0.60%) : Amean 12 16.6577 ( 0.00%) 18.2363 * -9.48%* 17.1083 ( -2.71%) : Amean 18 26.5133 ( 0.00%) 27.8833 * -5.17%* 25.7663 ( 2.82%) : Amean 24 34.3003 ( 0.00%) 34.6830 ( -1.12%) 32.0450 ( 6.58%) : Amean 30 40.0063 ( 0.00%) 40.5800 ( -1.43%) 41.5087 ( -3.76%) : Amean 32 40.1407 ( 0.00%) 41.2273 ( -2.71%) 39.9417 ( 0.50%) : : It's showing that the vanilla kernel takes a hit (as the bisection : indicated it would) and that disabling PSI by default is reasonably : close in terms of performance for this particular workload on this : particular machine so; Link: http://lkml.kernel.org/r/20181127165329.GA29728@cmpxchg.org Signed-off-by: Johannes Weiner Tested-by: Mel Gorman Reported-by: Mel Gorman Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- Documentation/admin-guide/kernel-parameters.txt | 4 ++++ include/linux/psi.h | 3 ++- init/Kconfig | 9 ++++++++ kernel/sched/psi.c | 30 +++++++++++++++++-------- kernel/sched/stats.h | 8 +++---- 5 files changed, 40 insertions(+), 14 deletions(-) (limited to 'include/linux') diff --git a/Documentation/admin-guide/kernel-parameters.txt b/Documentation/admin-guide/kernel-parameters.txt index 675170c36078..5d6ba930d4f4 100644 --- a/Documentation/admin-guide/kernel-parameters.txt +++ b/Documentation/admin-guide/kernel-parameters.txt @@ -3505,6 +3505,10 @@ before loading. See Documentation/blockdev/ramdisk.txt. + psi= [KNL] Enable or disable pressure stall information + tracking. + Format: + psmouse.proto= [HW,MOUSE] Highest PS2 mouse protocol extension to probe for; one of (bare|imps|exps|lifebook|any). 
psmouse.rate= [HW,MOUSE] Set desired mouse report rate, in reports diff --git a/include/linux/psi.h b/include/linux/psi.h index 8e0725aac0aa..7006008d5b72 100644 --- a/include/linux/psi.h +++ b/include/linux/psi.h @@ -1,6 +1,7 @@ #ifndef _LINUX_PSI_H #define _LINUX_PSI_H +#include #include #include @@ -9,7 +10,7 @@ struct css_set; #ifdef CONFIG_PSI -extern bool psi_disabled; +extern struct static_key_false psi_disabled; void psi_init(void); diff --git a/init/Kconfig b/init/Kconfig index a4112e95724a..cf5b5a0dcbc2 100644 --- a/init/Kconfig +++ b/init/Kconfig @@ -509,6 +509,15 @@ config PSI Say N if unsure. +config PSI_DEFAULT_DISABLED + bool "Require boot parameter to enable pressure stall information tracking" + default n + depends on PSI + help + If set, pressure stall information tracking will be disabled + per default but can be enabled through passing psi_enable=1 + on the kernel commandline during boot. + endmenu # "CPU/Task time and stats accounting" config CPU_ISOLATION diff --git a/kernel/sched/psi.c b/kernel/sched/psi.c index 3d7355d7c3e3..fe24de3fbc93 100644 --- a/kernel/sched/psi.c +++ b/kernel/sched/psi.c @@ -136,8 +136,18 @@ static int psi_bug __read_mostly; -bool psi_disabled __read_mostly; -core_param(psi_disabled, psi_disabled, bool, 0644); +DEFINE_STATIC_KEY_FALSE(psi_disabled); + +#ifdef CONFIG_PSI_DEFAULT_DISABLED +bool psi_enable; +#else +bool psi_enable = true; +#endif +static int __init setup_psi(char *str) +{ + return kstrtobool(str, &psi_enable) == 0; +} +__setup("psi=", setup_psi); /* Running averages - we need to be higher-res than loadavg */ #define PSI_FREQ (2*HZ+1) /* 2 sec intervals */ @@ -169,8 +179,10 @@ static void group_init(struct psi_group *group) void __init psi_init(void) { - if (psi_disabled) + if (!psi_enable) { + static_branch_enable(&psi_disabled); return; + } psi_period = jiffies_to_nsecs(PSI_FREQ); group_init(&psi_system); @@ -549,7 +561,7 @@ void psi_memstall_enter(unsigned long *flags) struct rq_flags rf; struct rq *rq; - if (psi_disabled) + if (static_branch_likely(&psi_disabled)) return; *flags = current->flags & PF_MEMSTALL; @@ -579,7 +591,7 @@ void psi_memstall_leave(unsigned long *flags) struct rq_flags rf; struct rq *rq; - if (psi_disabled) + if (static_branch_likely(&psi_disabled)) return; if (*flags) @@ -600,7 +612,7 @@ void psi_memstall_leave(unsigned long *flags) #ifdef CONFIG_CGROUPS int psi_cgroup_alloc(struct cgroup *cgroup) { - if (psi_disabled) + if (static_branch_likely(&psi_disabled)) return 0; cgroup->psi.pcpu = alloc_percpu(struct psi_group_cpu); @@ -612,7 +624,7 @@ int psi_cgroup_alloc(struct cgroup *cgroup) void psi_cgroup_free(struct cgroup *cgroup) { - if (psi_disabled) + if (static_branch_likely(&psi_disabled)) return; cancel_delayed_work_sync(&cgroup->psi.clock_work); @@ -637,7 +649,7 @@ void cgroup_move_task(struct task_struct *task, struct css_set *to) struct rq_flags rf; struct rq *rq; - if (psi_disabled) { + if (static_branch_likely(&psi_disabled)) { /* * Lame to do this here, but the scheduler cannot be locked * from the outside, so we move cgroups from inside sched/. 
@@ -673,7 +685,7 @@ int psi_show(struct seq_file *m, struct psi_group *group, enum psi_res res) { int full; - if (psi_disabled) + if (static_branch_likely(&psi_disabled)) return -EOPNOTSUPP; update_stats(group); diff --git a/kernel/sched/stats.h b/kernel/sched/stats.h index 4904c4677000..aa0de240fb41 100644 --- a/kernel/sched/stats.h +++ b/kernel/sched/stats.h @@ -66,7 +66,7 @@ static inline void psi_enqueue(struct task_struct *p, bool wakeup) { int clear = 0, set = TSK_RUNNING; - if (psi_disabled) + if (static_branch_likely(&psi_disabled)) return; if (!wakeup || p->sched_psi_wake_requeue) { @@ -86,7 +86,7 @@ static inline void psi_dequeue(struct task_struct *p, bool sleep) { int clear = TSK_RUNNING, set = 0; - if (psi_disabled) + if (static_branch_likely(&psi_disabled)) return; if (!sleep) { @@ -102,7 +102,7 @@ static inline void psi_dequeue(struct task_struct *p, bool sleep) static inline void psi_ttwu_dequeue(struct task_struct *p) { - if (psi_disabled) + if (static_branch_likely(&psi_disabled)) return; /* * Is the task being migrated during a wakeup? Make sure to @@ -128,7 +128,7 @@ static inline void psi_ttwu_dequeue(struct task_struct *p) static inline void psi_task_tick(struct rq *rq) { - if (psi_disabled) + if (static_branch_likely(&psi_disabled)) return; if (unlikely(rq->curr->flags & PF_MEMSTALL)) -- cgit v1.2.3-71-gd317 From b7df9ada9a7700dbcca1ba53d217c01e3d48179c Mon Sep 17 00:00:00 2001 From: Daniel Borkmann Date: Sat, 1 Dec 2018 01:18:53 +0100 Subject: bpf: fix pointer offsets in context for 32 bit Currently, pointer offsets in three BPF context structures are broken in two scenarios: i) 32 bit compiled applications running on 64 bit kernels, and ii) LLVM compiled BPF programs running on 32 bit kernels. The latter is due to BPF target machine being strictly 64 bit. So in each of the cases the offsets will mismatch in verifier when checking / rewriting context access. Fix this by providing a helper macro __bpf_md_ptr() that will enforce padding up to 64 bit and proper alignment, and for context access a macro bpf_ctx_range_ptr() which will cover full 64 bit member range on 32 bit archs. For flow_keys, we additionally need to force the size check to sizeof(__u64) as with other pointer types. Fixes: d58e468b1112 ("flow_dissector: implements flow dissector BPF hook") Fixes: 4f738adba30a ("bpf: create tcp_bpf_ulp allowing BPF to monitor socket TX/RX data") Fixes: 2dbb9b9e6df6 ("bpf: Introduce BPF_PROG_TYPE_SK_REUSEPORT") Reported-by: David S. Miller Signed-off-by: Daniel Borkmann Acked-by: David S. Miller Tested-by: David S. Miller Signed-off-by: Alexei Starovoitov --- include/linux/filter.h | 7 +++++++ include/uapi/linux/bpf.h | 17 ++++++++++++----- net/core/filter.c | 16 ++++++++-------- tools/include/uapi/linux/bpf.h | 17 ++++++++++++----- 4 files changed, 39 insertions(+), 18 deletions(-) (limited to 'include/linux') diff --git a/include/linux/filter.h b/include/linux/filter.h index 448dcc448f1f..795ff0b869bb 100644 --- a/include/linux/filter.h +++ b/include/linux/filter.h @@ -449,6 +449,13 @@ struct sock_reuseport; offsetof(TYPE, MEMBER) ... offsetofend(TYPE, MEMBER) - 1 #define bpf_ctx_range_till(TYPE, MEMBER1, MEMBER2) \ offsetof(TYPE, MEMBER1) ... offsetofend(TYPE, MEMBER2) - 1 +#if BITS_PER_LONG == 64 +# define bpf_ctx_range_ptr(TYPE, MEMBER) \ + offsetof(TYPE, MEMBER) ... offsetofend(TYPE, MEMBER) - 1 +#else +# define bpf_ctx_range_ptr(TYPE, MEMBER) \ + offsetof(TYPE, MEMBER) ... 
offsetof(TYPE, MEMBER) + 8 - 1 +#endif /* BITS_PER_LONG == 64 */ #define bpf_target_off(TYPE, MEMBER, SIZE, PTR_SIZE) \ ({ \ diff --git a/include/uapi/linux/bpf.h b/include/uapi/linux/bpf.h index 852dc17ab47a..426b5c8a245b 100644 --- a/include/uapi/linux/bpf.h +++ b/include/uapi/linux/bpf.h @@ -2422,6 +2422,12 @@ enum bpf_lwt_encap_mode { BPF_LWT_ENCAP_SEG6_INLINE }; +#define __bpf_md_ptr(type, name) \ +union { \ + type name; \ + __u64 :64; \ +} __attribute__((aligned(8))) + /* user accessible mirror of in-kernel sk_buff. * new fields can only be added to the end of this structure */ @@ -2456,7 +2462,7 @@ struct __sk_buff { /* ... here. */ __u32 data_meta; - struct bpf_flow_keys *flow_keys; + __bpf_md_ptr(struct bpf_flow_keys *, flow_keys); }; struct bpf_tunnel_key { @@ -2572,8 +2578,8 @@ enum sk_action { * be added to the end of this structure */ struct sk_msg_md { - void *data; - void *data_end; + __bpf_md_ptr(void *, data); + __bpf_md_ptr(void *, data_end); __u32 family; __u32 remote_ip4; /* Stored in network byte order */ @@ -2589,8 +2595,9 @@ struct sk_reuseport_md { * Start of directly accessible data. It begins from * the tcp/udp header. */ - void *data; - void *data_end; /* End of directly accessible data */ + __bpf_md_ptr(void *, data); + /* End of directly accessible data */ + __bpf_md_ptr(void *, data_end); /* * Total length of packet (starting from the tcp/udp header). * Note that the directly accessible bytes (data_end - data) diff --git a/net/core/filter.c b/net/core/filter.c index 9a1327eb25fa..6ee605da990f 100644 --- a/net/core/filter.c +++ b/net/core/filter.c @@ -5435,8 +5435,8 @@ static bool bpf_skb_is_valid_access(int off, int size, enum bpf_access_type type if (size != size_default) return false; break; - case bpf_ctx_range(struct __sk_buff, flow_keys): - if (size != sizeof(struct bpf_flow_keys *)) + case bpf_ctx_range_ptr(struct __sk_buff, flow_keys): + if (size != sizeof(__u64)) return false; break; default: @@ -5464,7 +5464,7 @@ static bool sk_filter_is_valid_access(int off, int size, case bpf_ctx_range(struct __sk_buff, data): case bpf_ctx_range(struct __sk_buff, data_meta): case bpf_ctx_range(struct __sk_buff, data_end): - case bpf_ctx_range(struct __sk_buff, flow_keys): + case bpf_ctx_range_ptr(struct __sk_buff, flow_keys): case bpf_ctx_range_till(struct __sk_buff, family, local_port): return false; } @@ -5489,7 +5489,7 @@ static bool cg_skb_is_valid_access(int off, int size, switch (off) { case bpf_ctx_range(struct __sk_buff, tc_classid): case bpf_ctx_range(struct __sk_buff, data_meta): - case bpf_ctx_range(struct __sk_buff, flow_keys): + case bpf_ctx_range_ptr(struct __sk_buff, flow_keys): return false; case bpf_ctx_range(struct __sk_buff, data): case bpf_ctx_range(struct __sk_buff, data_end): @@ -5530,7 +5530,7 @@ static bool lwt_is_valid_access(int off, int size, case bpf_ctx_range(struct __sk_buff, tc_classid): case bpf_ctx_range_till(struct __sk_buff, family, local_port): case bpf_ctx_range(struct __sk_buff, data_meta): - case bpf_ctx_range(struct __sk_buff, flow_keys): + case bpf_ctx_range_ptr(struct __sk_buff, flow_keys): return false; } @@ -5756,7 +5756,7 @@ static bool tc_cls_act_is_valid_access(int off, int size, case bpf_ctx_range(struct __sk_buff, data_end): info->reg_type = PTR_TO_PACKET_END; break; - case bpf_ctx_range(struct __sk_buff, flow_keys): + case bpf_ctx_range_ptr(struct __sk_buff, flow_keys): case bpf_ctx_range_till(struct __sk_buff, family, local_port): return false; } @@ -5958,7 +5958,7 @@ static bool sk_skb_is_valid_access(int off, int 
size, switch (off) { case bpf_ctx_range(struct __sk_buff, tc_classid): case bpf_ctx_range(struct __sk_buff, data_meta): - case bpf_ctx_range(struct __sk_buff, flow_keys): + case bpf_ctx_range_ptr(struct __sk_buff, flow_keys): return false; } @@ -6039,7 +6039,7 @@ static bool flow_dissector_is_valid_access(int off, int size, case bpf_ctx_range(struct __sk_buff, data_end): info->reg_type = PTR_TO_PACKET_END; break; - case bpf_ctx_range(struct __sk_buff, flow_keys): + case bpf_ctx_range_ptr(struct __sk_buff, flow_keys): info->reg_type = PTR_TO_FLOW_KEYS; break; case bpf_ctx_range(struct __sk_buff, tc_classid): diff --git a/tools/include/uapi/linux/bpf.h b/tools/include/uapi/linux/bpf.h index 852dc17ab47a..426b5c8a245b 100644 --- a/tools/include/uapi/linux/bpf.h +++ b/tools/include/uapi/linux/bpf.h @@ -2422,6 +2422,12 @@ enum bpf_lwt_encap_mode { BPF_LWT_ENCAP_SEG6_INLINE }; +#define __bpf_md_ptr(type, name) \ +union { \ + type name; \ + __u64 :64; \ +} __attribute__((aligned(8))) + /* user accessible mirror of in-kernel sk_buff. * new fields can only be added to the end of this structure */ @@ -2456,7 +2462,7 @@ struct __sk_buff { /* ... here. */ __u32 data_meta; - struct bpf_flow_keys *flow_keys; + __bpf_md_ptr(struct bpf_flow_keys *, flow_keys); }; struct bpf_tunnel_key { @@ -2572,8 +2578,8 @@ enum sk_action { * be added to the end of this structure */ struct sk_msg_md { - void *data; - void *data_end; + __bpf_md_ptr(void *, data); + __bpf_md_ptr(void *, data_end); __u32 family; __u32 remote_ip4; /* Stored in network byte order */ @@ -2589,8 +2595,9 @@ struct sk_reuseport_md { * Start of directly accessible data. It begins from * the tcp/udp header. */ - void *data; - void *data_end; /* End of directly accessible data */ + __bpf_md_ptr(void *, data); + /* End of directly accessible data */ + __bpf_md_ptr(void *, data_end); /* * Total length of packet (starting from the tcp/udp header). * Note that the directly accessible bytes (data_end - data) -- cgit v1.2.3-71-gd317 From 71700bb96047f68a0aae3932466fc7c9ad5ce6c0 Mon Sep 17 00:00:00 2001 From: Trond Myklebust Date: Fri, 30 Nov 2018 16:11:15 -0500 Subject: SUNRPC: Fix a memory leak in call_encode() If we retransmit an RPC request, we currently end up clobbering the value of req->rq_rcv_buf.bvec that was allocated by the initial call to xprt_request_prepare(req). 
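[ The leak pattern fixed here is generic enough to sketch outside the kernel: a retransmit path re-initialises a buffer descriptor that still owns a heap allocation, dropping the only pointer to it. A minimal user-space illustration; the struct and helpers below are hypothetical stand-ins for xdr_buf, xdr_buf_init() and xdr_free_bvec(), not the SUNRPC API:

    #include <stdlib.h>

    /* Hypothetical stand-in for the xdr_buf/bvec ownership problem. */
    struct buf {
            void *bvec;     /* owned allocation, like rq_rcv_buf.bvec */
            size_t len;
    };

    /* Buggy retry path: resetting the descriptor loses the pointer. */
    static void retry_buggy(struct buf *b)
    {
            b->bvec = NULL;         /* previous allocation is now leaked */
            b->len = 0;
    }

    /* Fixed retry path: release the old allocation before resetting. */
    static void retry_fixed(struct buf *b)
    {
            free(b->bvec);          /* mirrors the added xdr_free_bvec() call */
            b->bvec = NULL;
            b->len = 0;
    }

The patch below applies the same ordering: free the bvec before zeroing the receive buffer on retry, and initialise the bvec pointers to NULL at request setup so the first free is safe. ]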
Signed-off-by: Trond Myklebust --- include/linux/sunrpc/xdr.h | 1 - net/sunrpc/clnt.c | 1 + net/sunrpc/xprt.c | 2 ++ 3 files changed, 3 insertions(+), 1 deletion(-) (limited to 'include/linux') diff --git a/include/linux/sunrpc/xdr.h b/include/linux/sunrpc/xdr.h index 43106ffa6788..2ec128060239 100644 --- a/include/linux/sunrpc/xdr.h +++ b/include/linux/sunrpc/xdr.h @@ -72,7 +72,6 @@ xdr_buf_init(struct xdr_buf *buf, void *start, size_t len) buf->head[0].iov_base = start; buf->head[0].iov_len = len; buf->tail[0].iov_len = 0; - buf->bvec = NULL; buf->pages = NULL; buf->page_len = 0; buf->flags = 0; diff --git a/net/sunrpc/clnt.c b/net/sunrpc/clnt.c index e35d642558e7..c6782aa47525 100644 --- a/net/sunrpc/clnt.c +++ b/net/sunrpc/clnt.c @@ -2309,6 +2309,7 @@ out_retry: task->tk_status = 0; /* Note: rpc_verify_header() may have freed the RPC slot */ if (task->tk_rqstp == req) { + xdr_free_bvec(&req->rq_rcv_buf); req->rq_reply_bytes_recvd = req->rq_rcv_buf.len = 0; if (task->tk_client->cl_discrtry) xprt_conditional_disconnect(req->rq_xprt, diff --git a/net/sunrpc/xprt.c b/net/sunrpc/xprt.c index 86bea4520c4d..122c91c28e7c 100644 --- a/net/sunrpc/xprt.c +++ b/net/sunrpc/xprt.c @@ -1623,6 +1623,8 @@ xprt_request_init(struct rpc_task *task) req->rq_snd_buf.buflen = 0; req->rq_rcv_buf.len = 0; req->rq_rcv_buf.buflen = 0; + req->rq_snd_buf.bvec = NULL; + req->rq_rcv_buf.bvec = NULL; req->rq_release_snd_buf = NULL; xprt_reset_majortimeo(req); dprintk("RPC: %5u reserved req %p xid %08x\n", task->tk_pid, -- cgit v1.2.3-71-gd317 From 37c2578c0c40e286bc0d30bdc05290b2058cf66e Mon Sep 17 00:00:00 2001 From: Dexuan Cui Date: Mon, 3 Dec 2018 00:54:35 +0000 Subject: Drivers: hv: vmbus: Offload the handling of channels to two workqueues vmbus_process_offer() mustn't call channel->sc_creation_callback() directly for sub-channels, because sc_creation_callback() -> vmbus_open() may never get the host's response to the OPEN_CHANNEL message (the host may rescind a channel at any time, e.g. in the case of hot removing a NIC), and vmbus_onoffer_rescind() may not wake up the vmbus_open() as it's blocked due to a non-zero vmbus_connection.offer_in_progress, and finally we have a deadlock. The above is also true for primary channels, if the related device drivers use sync probing mode by default. And, usually the handling of primary channels and sub-channels can depend on each other, so we should offload them to different workqueues to avoid possible deadlock, e.g. in sync-probing mode, NIC1's netvsc_subchan_work() can race with NIC2's netvsc_probe() -> rtnl_lock(), and causes deadlock: the former gets the rtnl_lock and waits for all the sub-channels to appear, but the latter can't get the rtnl_lock and this blocks the handling of sub-channels. The patch can fix the multiple-NIC deadlock described above for v3.x kernels (e.g. RHEL 7.x) which don't support async-probing of devices, and v4.4, v4.9, v4.14 and v4.18 which support async-probing but don't enable async-probing for Hyper-V drivers (yet). The patch can also fix the hang issue in sub-channel's handling described above for all versions of kernels, including v4.19 and v4.20-rc4. So actually the patch should be applied to all the existing kernels, not only the kernels that have 8195b1396ec8. Fixes: 8195b1396ec8 ("hv_netvsc: fix deadlock on hotplug") Cc: stable@vger.kernel.org Cc: Stephen Hemminger Cc: K. Y. Srinivasan Cc: Haiyang Zhang Signed-off-by: Dexuan Cui Signed-off-by: K. Y. 
Srinivasan Signed-off-by: Greg Kroah-Hartman --- drivers/hv/channel_mgmt.c | 189 ++++++++++++++++++++++++++++++---------------- drivers/hv/connection.c | 24 +++++- drivers/hv/hyperv_vmbus.h | 7 ++ include/linux/hyperv.h | 7 ++ 4 files changed, 161 insertions(+), 66 deletions(-) (limited to 'include/linux') diff --git a/drivers/hv/channel_mgmt.c b/drivers/hv/channel_mgmt.c index 6277597d3d58..edd34c167a9b 100644 --- a/drivers/hv/channel_mgmt.c +++ b/drivers/hv/channel_mgmt.c @@ -435,61 +435,16 @@ void vmbus_free_channels(void) } } -/* - * vmbus_process_offer - Process the offer by creating a channel/device - * associated with this offer - */ -static void vmbus_process_offer(struct vmbus_channel *newchannel) +/* Note: the function can run concurrently for primary/sub channels. */ +static void vmbus_add_channel_work(struct work_struct *work) { - struct vmbus_channel *channel; - bool fnew = true; + struct vmbus_channel *newchannel = + container_of(work, struct vmbus_channel, add_channel_work); + struct vmbus_channel *primary_channel = newchannel->primary_channel; unsigned long flags; u16 dev_type; int ret; - /* Make sure this is a new offer */ - mutex_lock(&vmbus_connection.channel_mutex); - - /* - * Now that we have acquired the channel_mutex, - * we can release the potentially racing rescind thread. - */ - atomic_dec(&vmbus_connection.offer_in_progress); - - list_for_each_entry(channel, &vmbus_connection.chn_list, listentry) { - if (!uuid_le_cmp(channel->offermsg.offer.if_type, - newchannel->offermsg.offer.if_type) && - !uuid_le_cmp(channel->offermsg.offer.if_instance, - newchannel->offermsg.offer.if_instance)) { - fnew = false; - break; - } - } - - if (fnew) - list_add_tail(&newchannel->listentry, - &vmbus_connection.chn_list); - - mutex_unlock(&vmbus_connection.channel_mutex); - - if (!fnew) { - /* - * Check to see if this is a sub-channel. - */ - if (newchannel->offermsg.offer.sub_channel_index != 0) { - /* - * Process the sub-channel. - */ - newchannel->primary_channel = channel; - spin_lock_irqsave(&channel->lock, flags); - list_add_tail(&newchannel->sc_list, &channel->sc_list); - channel->num_sc++; - spin_unlock_irqrestore(&channel->lock, flags); - } else { - goto err_free_chan; - } - } - dev_type = hv_get_dev_type(newchannel); init_vp_index(newchannel, dev_type); @@ -507,27 +462,26 @@ static void vmbus_process_offer(struct vmbus_channel *newchannel) /* * This state is used to indicate a successful open * so that when we do close the channel normally, we - * can cleanup properly + * can cleanup properly. */ newchannel->state = CHANNEL_OPEN_STATE; - if (!fnew) { - struct hv_device *dev - = newchannel->primary_channel->device_obj; + if (primary_channel != NULL) { + /* newchannel is a sub-channel. 
*/ + struct hv_device *dev = primary_channel->device_obj; if (vmbus_add_channel_kobj(dev, newchannel)) - goto err_free_chan; + goto err_deq_chan; + + if (primary_channel->sc_creation_callback != NULL) + primary_channel->sc_creation_callback(newchannel); - if (channel->sc_creation_callback != NULL) - channel->sc_creation_callback(newchannel); newchannel->probe_done = true; return; } /* - * Start the process of binding this offer to the driver - * We need to set the DeviceObject field before calling - * vmbus_child_dev_add() + * Start the process of binding the primary channel to the driver */ newchannel->device_obj = vmbus_device_create( &newchannel->offermsg.offer.if_type, @@ -556,13 +510,28 @@ static void vmbus_process_offer(struct vmbus_channel *newchannel) err_deq_chan: mutex_lock(&vmbus_connection.channel_mutex); - list_del(&newchannel->listentry); + + /* + * We need to set the flag, otherwise + * vmbus_onoffer_rescind() can be blocked. + */ + newchannel->probe_done = true; + + if (primary_channel == NULL) { + list_del(&newchannel->listentry); + } else { + spin_lock_irqsave(&primary_channel->lock, flags); + list_del(&newchannel->sc_list); + spin_unlock_irqrestore(&primary_channel->lock, flags); + } + mutex_unlock(&vmbus_connection.channel_mutex); if (newchannel->target_cpu != get_cpu()) { put_cpu(); smp_call_function_single(newchannel->target_cpu, - percpu_channel_deq, newchannel, true); + percpu_channel_deq, + newchannel, true); } else { percpu_channel_deq(newchannel); put_cpu(); @@ -570,14 +539,104 @@ err_deq_chan: vmbus_release_relid(newchannel->offermsg.child_relid); -err_free_chan: free_channel(newchannel); } +/* + * vmbus_process_offer - Process the offer by creating a channel/device + * associated with this offer + */ +static void vmbus_process_offer(struct vmbus_channel *newchannel) +{ + struct vmbus_channel *channel; + struct workqueue_struct *wq; + unsigned long flags; + bool fnew = true; + + mutex_lock(&vmbus_connection.channel_mutex); + + /* + * Now that we have acquired the channel_mutex, + * we can release the potentially racing rescind thread. + */ + atomic_dec(&vmbus_connection.offer_in_progress); + + list_for_each_entry(channel, &vmbus_connection.chn_list, listentry) { + if (!uuid_le_cmp(channel->offermsg.offer.if_type, + newchannel->offermsg.offer.if_type) && + !uuid_le_cmp(channel->offermsg.offer.if_instance, + newchannel->offermsg.offer.if_instance)) { + fnew = false; + break; + } + } + + if (fnew) + list_add_tail(&newchannel->listentry, + &vmbus_connection.chn_list); + else { + /* + * Check to see if this is a valid sub-channel. + */ + if (newchannel->offermsg.offer.sub_channel_index == 0) { + mutex_unlock(&vmbus_connection.channel_mutex); + /* + * Don't call free_channel(), because newchannel->kobj + * is not initialized yet. + */ + kfree(newchannel); + WARN_ON_ONCE(1); + return; + } + /* + * Process the sub-channel. + */ + newchannel->primary_channel = channel; + spin_lock_irqsave(&channel->lock, flags); + list_add_tail(&newchannel->sc_list, &channel->sc_list); + spin_unlock_irqrestore(&channel->lock, flags); + } + + mutex_unlock(&vmbus_connection.channel_mutex); + + /* + * vmbus_process_offer() mustn't call channel->sc_creation_callback() + * directly for sub-channels, because sc_creation_callback() -> + * vmbus_open() may never get the host's response to the + * OPEN_CHANNEL message (the host may rescind a channel at any time, + * e.g. 
in the case of hot removing a NIC), and vmbus_onoffer_rescind() + * may not wake up the vmbus_open() as it's blocked due to a non-zero + * vmbus_connection.offer_in_progress, and finally we have a deadlock. + * + * The above is also true for primary channels, if the related device + * drivers use sync probing mode by default. + * + * And, usually the handling of primary channels and sub-channels can + * depend on each other, so we should offload them to different + * workqueues to avoid possible deadlock, e.g. in sync-probing mode, + * NIC1's netvsc_subchan_work() can race with NIC2's netvsc_probe() -> + * rtnl_lock(), and causes deadlock: the former gets the rtnl_lock + * and waits for all the sub-channels to appear, but the latter + * can't get the rtnl_lock and this blocks the handling of + * sub-channels. + */ + INIT_WORK(&newchannel->add_channel_work, vmbus_add_channel_work); + wq = fnew ? vmbus_connection.handle_primary_chan_wq : + vmbus_connection.handle_sub_chan_wq; + queue_work(wq, &newchannel->add_channel_work); +} + /* * We use this state to statically distribute the channel interrupt load. */ static int next_numa_node_id; +/* + * init_vp_index() accesses global variables like next_numa_node_id, and + * it can run concurrently for primary channels and sub-channels: see + * vmbus_process_offer(), so we need the lock to protect the global + * variables. + */ +static DEFINE_SPINLOCK(bind_channel_to_cpu_lock); /* * Starting with Win8, we can statically distribute the incoming @@ -613,6 +672,8 @@ static void init_vp_index(struct vmbus_channel *channel, u16 dev_type) return; } + spin_lock(&bind_channel_to_cpu_lock); + /* * Based on the channel affinity policy, we will assign the NUMA * nodes. @@ -695,6 +756,8 @@ static void init_vp_index(struct vmbus_channel *channel, u16 dev_type) channel->target_cpu = cur_cpu; channel->target_vp = hv_cpu_number_to_vp_number(cur_cpu); + spin_unlock(&bind_channel_to_cpu_lock); + free_cpumask_var(available_mask); } diff --git a/drivers/hv/connection.c b/drivers/hv/connection.c index f4d08c8ac7f8..4fe117b761ce 100644 --- a/drivers/hv/connection.c +++ b/drivers/hv/connection.c @@ -190,6 +190,20 @@ int vmbus_connect(void) goto cleanup; } + vmbus_connection.handle_primary_chan_wq = + create_workqueue("hv_pri_chan"); + if (!vmbus_connection.handle_primary_chan_wq) { + ret = -ENOMEM; + goto cleanup; + } + + vmbus_connection.handle_sub_chan_wq = + create_workqueue("hv_sub_chan"); + if (!vmbus_connection.handle_sub_chan_wq) { + ret = -ENOMEM; + goto cleanup; + } + INIT_LIST_HEAD(&vmbus_connection.chn_msg_list); spin_lock_init(&vmbus_connection.channelmsg_lock); @@ -280,10 +294,14 @@ void vmbus_disconnect(void) */ vmbus_initiate_unload(false); - if (vmbus_connection.work_queue) { - drain_workqueue(vmbus_connection.work_queue); + if (vmbus_connection.handle_sub_chan_wq) + destroy_workqueue(vmbus_connection.handle_sub_chan_wq); + + if (vmbus_connection.handle_primary_chan_wq) + destroy_workqueue(vmbus_connection.handle_primary_chan_wq); + + if (vmbus_connection.work_queue) destroy_workqueue(vmbus_connection.work_queue); - } if (vmbus_connection.int_page) { free_pages((unsigned long)vmbus_connection.int_page, 0); diff --git a/drivers/hv/hyperv_vmbus.h b/drivers/hv/hyperv_vmbus.h index 72eaba3d50fc..87d3d7da78f8 100644 --- a/drivers/hv/hyperv_vmbus.h +++ b/drivers/hv/hyperv_vmbus.h @@ -335,7 +335,14 @@ struct vmbus_connection { struct list_head chn_list; struct mutex channel_mutex; + /* + * An offer message is handled first on the work_queue, and then + * is 
further handled on handle_primary_chan_wq or + * handle_sub_chan_wq. + */ struct workqueue_struct *work_queue; + struct workqueue_struct *handle_primary_chan_wq; + struct workqueue_struct *handle_sub_chan_wq; }; diff --git a/include/linux/hyperv.h b/include/linux/hyperv.h index b3e24368930a..14131b6fae68 100644 --- a/include/linux/hyperv.h +++ b/include/linux/hyperv.h @@ -905,6 +905,13 @@ struct vmbus_channel { bool probe_done; + /* + * We must offload the handling of the primary/sub channels + * from the single-threaded vmbus_connection.work_queue to + * two different workqueue, otherwise we can block + * vmbus_connection.work_queue and hang: see vmbus_process_offer(). + */ + struct work_struct add_channel_work; }; static inline bool is_hvsock_channel(const struct vmbus_channel *c) -- cgit v1.2.3-71-gd317 From 27359fd6e5f3c5db8fe544b63238b6170e8806d8 Mon Sep 17 00:00:00 2001 From: Matthew Wilcox Date: Fri, 30 Nov 2018 11:05:06 -0500 Subject: dax: Fix unlock mismatch with updated API Internal to dax_unlock_mapping_entry(), dax_unlock_entry() is used to store a replacement entry in the Xarray at the given xas-index with the DAX_LOCKED bit clear. When called, dax_unlock_entry() expects the unlocked value of the entry relative to the current Xarray state to be specified. In most contexts dax_unlock_entry() is operating in the same scope as the matched dax_lock_entry(). However, in the dax_unlock_mapping_entry() case the implementation needs to recall the original entry. In the case where the original entry is a 'pmd' entry it is possible that the pfn performed to do the lookup is misaligned to the value retrieved in the Xarray. Change the api to return the unlock cookie from dax_lock_page() and pass it to dax_unlock_page(). This fixes a bug where dax_unlock_page() was assuming that the page was PMD-aligned if the entry was a PMD entry with signatures like: WARNING: CPU: 38 PID: 1396 at fs/dax.c:340 dax_insert_entry+0x2b2/0x2d0 RIP: 0010:dax_insert_entry+0x2b2/0x2d0 [..] Call Trace: dax_iomap_pte_fault.isra.41+0x791/0xde0 ext4_dax_huge_fault+0x16f/0x1f0 ? up_read+0x1c/0xa0 __do_fault+0x1f/0x160 __handle_mm_fault+0x1033/0x1490 handle_mm_fault+0x18b/0x3d0 Link: https://lkml.kernel.org/r/20181130154902.GL10377@bombadil.infradead.org Fixes: 9f32d221301c ("dax: Convert dax_lock_mapping_entry to XArray") Reported-by: Dan Williams Signed-off-by: Matthew Wilcox Tested-by: Dan Williams Reviewed-by: Jan Kara Signed-off-by: Dan Williams --- fs/dax.c | 21 ++++++++------------- include/linux/dax.h | 14 ++++++++------ mm/memory-failure.c | 6 ++++-- 3 files changed, 20 insertions(+), 21 deletions(-) (limited to 'include/linux') diff --git a/fs/dax.c b/fs/dax.c index 3f592dc18d67..48132eca3761 100644 --- a/fs/dax.c +++ b/fs/dax.c @@ -379,20 +379,20 @@ static struct page *dax_busy_page(void *entry) * @page: The page whose entry we want to lock * * Context: Process context. - * Return: %true if the entry was locked or does not need to be locked. + * Return: A cookie to pass to dax_unlock_page() or 0 if the entry could + * not be locked. 
*/ -bool dax_lock_mapping_entry(struct page *page) +dax_entry_t dax_lock_page(struct page *page) { XA_STATE(xas, NULL, 0); void *entry; - bool locked; /* Ensure page->mapping isn't freed while we look at it */ rcu_read_lock(); for (;;) { struct address_space *mapping = READ_ONCE(page->mapping); - locked = false; + entry = NULL; if (!mapping || !dax_mapping(mapping)) break; @@ -403,7 +403,7 @@ bool dax_lock_mapping_entry(struct page *page) * otherwise we would not have a valid pfn_to_page() * translation. */ - locked = true; + entry = (void *)~0UL; if (S_ISCHR(mapping->host->i_mode)) break; @@ -426,23 +426,18 @@ bool dax_lock_mapping_entry(struct page *page) break; } rcu_read_unlock(); - return locked; + return (dax_entry_t)entry; } -void dax_unlock_mapping_entry(struct page *page) +void dax_unlock_page(struct page *page, dax_entry_t cookie) { struct address_space *mapping = page->mapping; XA_STATE(xas, &mapping->i_pages, page->index); - void *entry; if (S_ISCHR(mapping->host->i_mode)) return; - rcu_read_lock(); - entry = xas_load(&xas); - rcu_read_unlock(); - entry = dax_make_entry(page_to_pfn_t(page), dax_is_pmd_entry(entry)); - dax_unlock_entry(&xas, entry); + dax_unlock_entry(&xas, (void *)cookie); } /* diff --git a/include/linux/dax.h b/include/linux/dax.h index 450b28db9533..0dd316a74a29 100644 --- a/include/linux/dax.h +++ b/include/linux/dax.h @@ -7,6 +7,8 @@ #include #include +typedef unsigned long dax_entry_t; + struct iomap_ops; struct dax_device; struct dax_operations { @@ -88,8 +90,8 @@ int dax_writeback_mapping_range(struct address_space *mapping, struct block_device *bdev, struct writeback_control *wbc); struct page *dax_layout_busy_page(struct address_space *mapping); -bool dax_lock_mapping_entry(struct page *page); -void dax_unlock_mapping_entry(struct page *page); +dax_entry_t dax_lock_page(struct page *page); +void dax_unlock_page(struct page *page, dax_entry_t cookie); #else static inline bool bdev_dax_supported(struct block_device *bdev, int blocksize) @@ -122,14 +124,14 @@ static inline int dax_writeback_mapping_range(struct address_space *mapping, return -EOPNOTSUPP; } -static inline bool dax_lock_mapping_entry(struct page *page) +static inline dax_entry_t dax_lock_page(struct page *page) { if (IS_DAX(page->mapping->host)) - return true; - return false; + return ~0UL; + return 0; } -static inline void dax_unlock_mapping_entry(struct page *page) +static inline void dax_unlock_page(struct page *page, dax_entry_t cookie) { } #endif diff --git a/mm/memory-failure.c b/mm/memory-failure.c index 0cd3de3550f0..7c72f2a95785 100644 --- a/mm/memory-failure.c +++ b/mm/memory-failure.c @@ -1161,6 +1161,7 @@ static int memory_failure_dev_pagemap(unsigned long pfn, int flags, LIST_HEAD(tokill); int rc = -EBUSY; loff_t start; + dax_entry_t cookie; /* * Prevent the inode from being freed while we are interrogating @@ -1169,7 +1170,8 @@ static int memory_failure_dev_pagemap(unsigned long pfn, int flags, * also prevents changes to the mapping of this pfn until * poison signaling is complete. 
*/ - if (!dax_lock_mapping_entry(page)) + cookie = dax_lock_page(page); + if (!cookie) goto out; if (hwpoison_filter(page)) { @@ -1220,7 +1222,7 @@ static int memory_failure_dev_pagemap(unsigned long pfn, int flags, kill_procs(&tokill, flags & MF_MUST_KILL, !unmap_success, pfn, flags); rc = 0; unlock: - dax_unlock_mapping_entry(page); + dax_unlock_page(page, cookie); out: /* drop pgmap ref acquired in caller */ put_dev_pagemap(pgmap); -- cgit v1.2.3-71-gd317 From f51ccf46217c28758b1f3b5bc0ccfc00eca658b2 Mon Sep 17 00:00:00 2001 From: Johan Hovold Date: Tue, 4 Dec 2018 17:00:36 +0100 Subject: USB: serial: console: fix reported terminal settings The USB-serial console implementation has never reported the actual terminal settings used. Despite storing the corresponding cflags in its struct console, these were never honoured on later tty open() where the tty termios would be left initialised to the driver defaults. Unlike the serial console implementation, the USB-serial code calls subdriver open() already at console setup. While calling set_termios() and write() before open() looks like it could work for some USB-serial drivers, others definitely do not expect this, so modelling this after serial core is going to be intrusive, if at all possible. Instead, use a (renamed) tty helper to save the termios data used at console setup so that the tty termios reflects the actual terminal settings after a subsequent tty open(). Note that the calls to tty_init_termios() (tty_driver_install()) and tty_save_termios() are serialised using the disconnect mutex. This specifically fixes a regression that was triggered by a recent change adding software flow control to the pl2303 driver: a getty trying to disable flow control while leaving the baud rate unchanged would now also set the baud rate to the driver default (prior to the flow-control change this had been a noop). Fixes: 7041d9c3f01b ("USB: serial: pl2303: add support for tx xon/xoff flow control") Cc: stable # 4.18 Cc: Florian Zumbiehl Reported-by: Jarkko Nikula Tested-by: Jarkko Nikula Acked-by: Greg Kroah-Hartman Signed-off-by: Johan Hovold --- drivers/tty/tty_io.c | 11 +++++++++-- drivers/usb/serial/console.c | 2 +- include/linux/tty.h | 1 + 3 files changed, 11 insertions(+), 3 deletions(-) (limited to 'include/linux') diff --git a/drivers/tty/tty_io.c b/drivers/tty/tty_io.c index ee80dfbd5442..687250ec8032 100644 --- a/drivers/tty/tty_io.c +++ b/drivers/tty/tty_io.c @@ -1373,7 +1373,13 @@ err_release_lock: return ERR_PTR(retval); } -static void tty_free_termios(struct tty_struct *tty) +/** + * tty_save_termios() - save tty termios data in driver table + * @tty: tty whose termios data to save + * + * Locking: Caller guarantees serialisation with tty_init_termios(). 
+ */ +void tty_save_termios(struct tty_struct *tty) { struct ktermios *tp; int idx = tty->index; @@ -1392,6 +1398,7 @@ static void tty_free_termios(struct tty_struct *tty) } *tp = tty->termios; } +EXPORT_SYMBOL_GPL(tty_save_termios); /** * tty_flush_works - flush all works of a tty/pty pair @@ -1491,7 +1498,7 @@ static void release_tty(struct tty_struct *tty, int idx) WARN_ON(!mutex_is_locked(&tty_mutex)); if (tty->ops->shutdown) tty->ops->shutdown(tty); - tty_free_termios(tty); + tty_save_termios(tty); tty_driver_remove_tty(tty->driver, tty); tty->port->itty = NULL; if (tty->link) diff --git a/drivers/usb/serial/console.c b/drivers/usb/serial/console.c index 17940589c647..7d289302ff6c 100644 --- a/drivers/usb/serial/console.c +++ b/drivers/usb/serial/console.c @@ -101,7 +101,6 @@ static int usb_console_setup(struct console *co, char *options) cflag |= PARENB; break; } - co->cflag = cflag; /* * no need to check the index here: if the index is wrong, console @@ -164,6 +163,7 @@ static int usb_console_setup(struct console *co, char *options) serial->type->set_termios(tty, port, &dummy); tty_port_tty_set(&port->port, NULL); + tty_save_termios(tty); tty_kref_put(tty); } tty_port_set_initialized(&port->port, 1); diff --git a/include/linux/tty.h b/include/linux/tty.h index 414db2bce715..392138fe59b6 100644 --- a/include/linux/tty.h +++ b/include/linux/tty.h @@ -556,6 +556,7 @@ extern struct tty_struct *tty_init_dev(struct tty_driver *driver, int idx); extern void tty_release_struct(struct tty_struct *tty, int idx); extern int tty_release(struct inode *inode, struct file *filp); extern void tty_init_termios(struct tty_struct *tty); +extern void tty_save_termios(struct tty_struct *tty); extern int tty_standard_install(struct tty_driver *driver, struct tty_struct *tty); -- cgit v1.2.3-71-gd317 From 2fc00c1e0f9d2abe0df74c33cf9f40d12b9b892f Mon Sep 17 00:00:00 2001 From: Chris Chiu Date: Mon, 3 Dec 2018 14:46:20 +0800 Subject: HID: use macros in IS_INPUT_APPLICATION Add the missing definition for HID_DG_WHITEBOARD, then replace the HID usage hex values with macros for better readability.
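[ For readers decoding the constants this commit replaces: a 32-bit HID usage value packs the usage page in the upper 16 bits and the usage ID in the lower 16, so 0x000d0006 is page 0x000d (Digitizers), usage 0x0006 (Whiteboard). A small illustrative sketch of that layout; the helper macros are ours, not kernel API, since the kernel simply spells the composed values out as literals:

    #include <stdint.h>

    /* Compose/decompose a HID usage the way HID_DG_* constants are laid out. */
    #define HID_USAGE(page, id)     (((uint32_t)(page) << 16) | ((id) & 0xffff))
    #define HID_USAGE_PAGE(u)       ((uint32_t)(u) >> 16)
    #define HID_USAGE_ID(u)         ((u) & 0xffff)

    /* HID_USAGE(0x000d, 0x0006) == 0x000d0006, i.e. HID_DG_WHITEBOARD */

This is also why IS_INPUT_APPLICATION() can use range comparisons: usages on the same page sort contiguously. ]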
Signed-off-by: Chris Chiu Signed-off-by: Benjamin Tissoires --- include/linux/hid.h | 6 +++++- 1 file changed, 5 insertions(+), 1 deletion(-) (limited to 'include/linux') diff --git a/include/linux/hid.h b/include/linux/hid.h index a355d61940f2..ce5f996c8d3d 100644 --- a/include/linux/hid.h +++ b/include/linux/hid.h @@ -238,6 +238,7 @@ struct hid_item { #define HID_DG_LIGHTPEN 0x000d0003 #define HID_DG_TOUCHSCREEN 0x000d0004 #define HID_DG_TOUCHPAD 0x000d0005 +#define HID_DG_WHITEBOARD 0x000d0006 #define HID_DG_STYLUS 0x000d0020 #define HID_DG_PUCK 0x000d0021 #define HID_DG_FINGER 0x000d0022 @@ -836,7 +837,10 @@ static inline bool hid_is_using_ll_driver(struct hid_device *hdev, /* Applications from HID Usage Tables 4/8/99 Version 1.1 */ /* We ignore a few input applications that are not widely used */ -#define IS_INPUT_APPLICATION(a) (((a >= 0x00010000) && (a <= 0x00010008)) || (a == 0x00010080) || (a == 0x000c0001) || ((a >= 0x000d0002) && (a <= 0x000d0006))) +#define IS_INPUT_APPLICATION(a) \ + (((a >= HID_UP_GENDESK) && (a <= HID_GD_MULTIAXIS)) \ + || ((a >= HID_DG_PEN) && (a <= HID_DG_WHITEBOARD)) \ + || (a == HID_GD_SYSTEM_CONTROL) || (a == HID_CP_CONSUMER_CONTROL)) /* HID core API */ -- cgit v1.2.3-71-gd317 From 7f5592742a429b4de770fc5b796d18de43a15fdc Mon Sep 17 00:00:00 2001 From: Chris Chiu Date: Mon, 3 Dec 2018 14:46:21 +0800 Subject: HID: input: support Microsoft wireless radio control hotkey ASUS laptops have started to support airplane mode radio management, replacing the original airplane mode toggle hotkey mechanism. On the ASUS P5440FF, it presents as a HID device connecting via I2C, named i2c-AMPD0001. When it is pressed, the Embedded Controller sends a HID report via I2C and switches the airplane mode indicator LED based on the status. However, it's not working because it fails to be identified as a hidinput device. It fails in hidinput_connect() because the macro IS_INPUT_APPLICATION doesn't include HID_GD_WIRELESS_RADIO_CTLS as a legitimate application code. It would be easy to add the HID I2C vendor and product id to the quirk list and apply HID_QUIRK_HIDINPUT_FORCE to make it work, but it makes more sense to support it as a generic input application. Signed-off-by: Chris Chiu Signed-off-by: Benjamin Tissoires --- include/linux/hid.h | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) (limited to 'include/linux') diff --git a/include/linux/hid.h b/include/linux/hid.h index ce5f996c8d3d..42079116fb61 100644 --- a/include/linux/hid.h +++ b/include/linux/hid.h @@ -840,7 +840,8 @@ static inline bool hid_is_using_ll_driver(struct hid_device *hdev, #define IS_INPUT_APPLICATION(a) \ (((a >= HID_UP_GENDESK) && (a <= HID_GD_MULTIAXIS)) \ || ((a >= HID_DG_PEN) && (a <= HID_DG_WHITEBOARD)) \ - || (a == HID_GD_SYSTEM_CONTROL) || (a == HID_CP_CONSUMER_CONTROL)) + || (a == HID_GD_SYSTEM_CONTROL) || (a == HID_CP_CONSUMER_CONTROL) \ + || (a == HID_GD_WIRELESS_RADIO_CTLS)) /* HID core API */ -- cgit v1.2.3-71-gd317 From 704620afc70cf47abb9d6a1a57f3825d2bca49cf Mon Sep 17 00:00:00 2001 From: Mathias Payer Date: Wed, 5 Dec 2018 21:19:59 +0100 Subject: USB: check usb_get_extra_descriptor for proper size When reading an extra descriptor, we need to properly check the minimum and maximum size allowed, to prevent invalid data from being sent by a device.
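[ The pattern behind this fix is a bounds-checked walk over device-supplied variable-length records: a length byte must never be allowed to run past the buffer, and a matching record must be at least as large as the structure the caller will cast it to. A hedged user-space rendition of the fixed loop, simplified from the __usb_get_extra_descriptor() hunk below:

    #include <stddef.h>

    struct desc_header {
            unsigned char bLength;
            unsigned char bDescriptorType;
    };

    /* Walk a buffer of descriptors; reject bogus lengths and matches
     * smaller than what the caller expects (minsize). */
    static void *find_descriptor(char *buffer, size_t size,
                                 unsigned char type, size_t minsize)
    {
            while (size >= sizeof(struct desc_header)) {
                    struct desc_header *h = (struct desc_header *)buffer;

                    if (h->bLength < 2 || h->bLength > size)
                            return NULL;    /* malformed device data */
                    if (h->bDescriptorType == type && h->bLength >= minsize)
                            return h;       /* large enough to cast safely */
                    buffer += h->bLength;
                    size -= h->bLength;
            }
            return NULL;
    }

Without the minsize check, a short but well-formed descriptor of the right type would be returned and the caller would read past its end when dereferencing the larger expected struct. ]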
Reported-by: Hui Peng Reported-by: Mathias Payer Co-developed-by: Linus Torvalds Signed-off-by: Hui Peng Signed-off-by: Mathias Payer Signed-off-by: Linus Torvalds Cc: stable Signed-off-by: Greg Kroah-Hartman --- drivers/usb/core/hub.c | 2 +- drivers/usb/core/usb.c | 6 +++--- drivers/usb/host/hwa-hc.c | 2 +- include/linux/usb.h | 4 ++-- 4 files changed, 7 insertions(+), 7 deletions(-) (limited to 'include/linux') diff --git a/drivers/usb/core/hub.c b/drivers/usb/core/hub.c index 528664059a12..f76b2e0aba9d 100644 --- a/drivers/usb/core/hub.c +++ b/drivers/usb/core/hub.c @@ -2251,7 +2251,7 @@ static int usb_enumerate_device_otg(struct usb_device *udev) /* descriptor may appear anywhere in config */ err = __usb_get_extra_descriptor(udev->rawdescriptors[0], le16_to_cpu(udev->config[0].desc.wTotalLength), - USB_DT_OTG, (void **) &desc); + USB_DT_OTG, (void **) &desc, sizeof(*desc)); if (err || !(desc->bmAttributes & USB_OTG_HNP)) return 0; diff --git a/drivers/usb/core/usb.c b/drivers/usb/core/usb.c index 79d8bd7a612e..4ebfbd737905 100644 --- a/drivers/usb/core/usb.c +++ b/drivers/usb/core/usb.c @@ -832,14 +832,14 @@ EXPORT_SYMBOL_GPL(usb_get_current_frame_number); */ int __usb_get_extra_descriptor(char *buffer, unsigned size, - unsigned char type, void **ptr) + unsigned char type, void **ptr, size_t minsize) { struct usb_descriptor_header *header; while (size >= sizeof(struct usb_descriptor_header)) { header = (struct usb_descriptor_header *)buffer; - if (header->bLength < 2) { + if (header->bLength < 2 || header->bLength > size) { printk(KERN_ERR "%s: bogus descriptor, type %d length %d\n", usbcore_name, @@ -848,7 +848,7 @@ int __usb_get_extra_descriptor(char *buffer, unsigned size, return -1; } - if (header->bDescriptorType == type) { + if (header->bDescriptorType == type && header->bLength >= minsize) { *ptr = header; return 0; } diff --git a/drivers/usb/host/hwa-hc.c b/drivers/usb/host/hwa-hc.c index 684d6f074c3a..09a8ebd95588 100644 --- a/drivers/usb/host/hwa-hc.c +++ b/drivers/usb/host/hwa-hc.c @@ -640,7 +640,7 @@ static int hwahc_security_create(struct hwahc *hwahc) top = itr + itr_size; result = __usb_get_extra_descriptor(usb_dev->rawdescriptors[index], le16_to_cpu(usb_dev->actconfig->desc.wTotalLength), - USB_DT_SECURITY, (void **) &secd); + USB_DT_SECURITY, (void **) &secd, sizeof(*secd)); if (result == -1) { dev_warn(dev, "BUG? WUSB host has no security descriptors\n"); return 0; diff --git a/include/linux/usb.h b/include/linux/usb.h index 4cdd515a4385..5e49e82c4368 100644 --- a/include/linux/usb.h +++ b/include/linux/usb.h @@ -407,11 +407,11 @@ struct usb_host_bos { }; int __usb_get_extra_descriptor(char *buffer, unsigned size, - unsigned char type, void **ptr); + unsigned char type, void **ptr, size_t min); #define usb_get_extra_descriptor(ifpoint, type, ptr) \ __usb_get_extra_descriptor((ifpoint)->extra, \ (ifpoint)->extralen, \ - type, (void **)ptr) + type, (void **)ptr, sizeof(**(ptr))) /* ----------------------------------------------------------------------- */ -- cgit v1.2.3-71-gd317 From 2f0799a0ffc033bf3cc82d5032acc3ec633464c2 Mon Sep 17 00:00:00 2001 From: David Rientjes Date: Wed, 5 Dec 2018 15:45:54 -0800 Subject: mm, thp: restore node-local hugepage allocations This is a full revert of ac5b2c18911f ("mm: thp: relax __GFP_THISNODE for MADV_HUGEPAGE mappings") and a partial revert of 89c83fb539f9 ("mm, thp: consolidate THP gfp handling into alloc_hugepage_direct_gfpmask"). 
By not setting __GFP_THISNODE, applications can allocate remote hugepages when the local node is fragmented or low on memory when either the thp defrag setting is "always" or the vma has been madvised with MADV_HUGEPAGE. Remote access to hugepages often has much higher latency than local pages of the native page size. On Haswell, ac5b2c18911f was shown to have a 13.9% access regression after this commit for binaries that remap their text segment to be backed by transparent hugepages. The intent of ac5b2c18911f is to address an issue where a local node is low on memory or fragmented such that a hugepage cannot be allocated. In every scenario where this was described as a fix, there is abundant and unfragmented remote memory available to allocate from, even with a greater access latency. If remote memory is also low or fragmented, not setting __GFP_THISNODE was also measured on Haswell to have a 40% regression in allocation latency. Restore __GFP_THISNODE for thp allocations. Fixes: ac5b2c18911f ("mm: thp: relax __GFP_THISNODE for MADV_HUGEPAGE mappings") Fixes: 89c83fb539f9 ("mm, thp: consolidate THP gfp handling into alloc_hugepage_direct_gfpmask") Cc: Andrea Arcangeli Cc: Mel Gorman Cc: Vlastimil Babka Cc: Michal Hocko Cc: Andrew Morton Signed-off-by: David Rientjes Signed-off-by: Linus Torvalds --- include/linux/mempolicy.h | 2 -- mm/huge_memory.c | 42 ++++++++++++++++-------------------------- mm/mempolicy.c | 2 +- 3 files changed, 17 insertions(+), 29 deletions(-) (limited to 'include/linux') diff --git a/include/linux/mempolicy.h b/include/linux/mempolicy.h index bac395f1d00a..5228c62af416 100644 --- a/include/linux/mempolicy.h +++ b/include/linux/mempolicy.h @@ -139,8 +139,6 @@ struct mempolicy *mpol_shared_policy_lookup(struct shared_policy *sp, struct mempolicy *get_task_policy(struct task_struct *p); struct mempolicy *__get_vma_policy(struct vm_area_struct *vma, unsigned long addr); -struct mempolicy *get_vma_policy(struct vm_area_struct *vma, - unsigned long addr); bool vma_policy_mof(struct vm_area_struct *vma); extern void numa_default_policy(void); diff --git a/mm/huge_memory.c b/mm/huge_memory.c index 622cced74fd9..f2d19e4fe854 100644 --- a/mm/huge_memory.c +++ b/mm/huge_memory.c @@ -632,37 +632,27 @@ release: static inline gfp_t alloc_hugepage_direct_gfpmask(struct vm_area_struct *vma, unsigned long addr) { const bool vma_madvised = !!(vma->vm_flags & VM_HUGEPAGE); - gfp_t this_node = 0; - -#ifdef CONFIG_NUMA - struct mempolicy *pol; - /* - * __GFP_THISNODE is used only when __GFP_DIRECT_RECLAIM is not - * specified, to express a general desire to stay on the current - * node for optimistic allocation attempts. If the defrag mode - * and/or madvise hint requires the direct reclaim then we prefer - * to fallback to other node rather than node reclaim because that - * can lead to excessive reclaim even though there is free memory - * on other nodes. We expect that NUMA preferences are specified - * by memory policies. - */ - pol = get_vma_policy(vma, addr); - if (pol->mode != MPOL_BIND) - this_node = __GFP_THISNODE; - mpol_cond_put(pol); -#endif + const gfp_t gfp_mask = GFP_TRANSHUGE_LIGHT | __GFP_THISNODE; + /* Always do synchronous compaction */ if (test_bit(TRANSPARENT_HUGEPAGE_DEFRAG_DIRECT_FLAG, &transparent_hugepage_flags)) - return GFP_TRANSHUGE | (vma_madvised ? 0 : __GFP_NORETRY); + return GFP_TRANSHUGE | __GFP_THISNODE | + (vma_madvised ? 
0 : __GFP_NORETRY); + + /* Kick kcompactd and fail quickly */ if (test_bit(TRANSPARENT_HUGEPAGE_DEFRAG_KSWAPD_FLAG, &transparent_hugepage_flags)) - return GFP_TRANSHUGE_LIGHT | __GFP_KSWAPD_RECLAIM | this_node; + return gfp_mask | __GFP_KSWAPD_RECLAIM; + + /* Synchronous compaction if madvised, otherwise kick kcompactd */ if (test_bit(TRANSPARENT_HUGEPAGE_DEFRAG_KSWAPD_OR_MADV_FLAG, &transparent_hugepage_flags)) - return GFP_TRANSHUGE_LIGHT | (vma_madvised ? __GFP_DIRECT_RECLAIM : - __GFP_KSWAPD_RECLAIM | this_node); + return gfp_mask | (vma_madvised ? __GFP_DIRECT_RECLAIM : + __GFP_KSWAPD_RECLAIM); + + /* Only do synchronous compaction if madvised */ if (test_bit(TRANSPARENT_HUGEPAGE_DEFRAG_REQ_MADV_FLAG, &transparent_hugepage_flags)) - return GFP_TRANSHUGE_LIGHT | (vma_madvised ? __GFP_DIRECT_RECLAIM : - this_node); - return GFP_TRANSHUGE_LIGHT | this_node; + return gfp_mask | (vma_madvised ? __GFP_DIRECT_RECLAIM : 0); + + return gfp_mask; } /* Caller must hold page table lock. */ diff --git a/mm/mempolicy.c b/mm/mempolicy.c index 5837a067124d..69e278b469ef 100644 --- a/mm/mempolicy.c +++ b/mm/mempolicy.c @@ -1662,7 +1662,7 @@ struct mempolicy *__get_vma_policy(struct vm_area_struct *vma, * freeing by another task. It is the caller's responsibility to free the * extra reference for shared policies. */ -struct mempolicy *get_vma_policy(struct vm_area_struct *vma, +static struct mempolicy *get_vma_policy(struct vm_area_struct *vma, unsigned long addr) { struct mempolicy *pol = __get_vma_policy(vma, addr); -- cgit v1.2.3-71-gd317 From c53431eb696f3c64c12c00afb81048af54b61532 Mon Sep 17 00:00:00 2001 From: Peter Hutterer Date: Wed, 5 Dec 2018 10:42:22 +1000 Subject: HID: core: store the collections as a basic tree For each collection parsed, store a pointer to the parent collection (if any). 
This makes it a lot easier to look up which collection(s) any given item is part of. Signed-off-by: Peter Hutterer Verified-by: Harry Cutts Signed-off-by: Benjamin Tissoires --- drivers/hid/hid-core.c | 4 ++++ include/linux/hid.h | 2 ++ 2 files changed, 6 insertions(+) (limited to 'include/linux') diff --git a/drivers/hid/hid-core.c b/drivers/hid/hid-core.c index 5bec9244c45b..43d488a45120 100644 --- a/drivers/hid/hid-core.c +++ b/drivers/hid/hid-core.c @@ -172,6 +172,8 @@ static int open_collection(struct hid_parser *parser, unsigned type) collection->type = type; collection->usage = usage; collection->level = parser->collection_stack_ptr - 1; + collection->parent = parser->active_collection; + parser->active_collection = collection; if (type == HID_COLLECTION_APPLICATION) parser->device->maxapplication++; @@ -190,6 +192,8 @@ static int close_collection(struct hid_parser *parser) return -EINVAL; } parser->collection_stack_ptr--; + if (parser->active_collection) + parser->active_collection = parser->active_collection->parent; return 0; } diff --git a/include/linux/hid.h b/include/linux/hid.h index a355d61940f2..fdfda898656c 100644 --- a/include/linux/hid.h +++ b/include/linux/hid.h @@ -427,6 +427,7 @@ struct hid_local { */ struct hid_collection { + struct hid_collection *parent; unsigned type; unsigned usage; unsigned level; @@ -650,6 +651,7 @@ struct hid_parser { unsigned int *collection_stack; unsigned int collection_stack_ptr; unsigned int collection_stack_size; + struct hid_collection *active_collection; struct hid_device *device; unsigned int scan_flags; }; -- cgit v1.2.3-71-gd317 From 5a4abb36f312cf83206b1b7d1308ba47cba0b3cc Mon Sep 17 00:00:00 2001 From: Peter Hutterer Date: Wed, 5 Dec 2018 10:42:23 +1000 Subject: HID: core: process the Resolution Multiplier The Resolution Multiplier is a feature report that modifies the value of Usages within the same Logical Collection. If the multiplier is set to anything but 1, the hardware reports (value * multiplier) for the same amount of physical movement, i.e. the value we receive in the kernel is pre-multiplied. The hardware may send a single (value * multiplier), send the same value in 'multiplier' times as many reports, or use a combination of these two options. For example, when the Microsoft Sculpt Ergonomic mouse Resolution Multiplier is set to 12, the Wheel sends out 12 for every detent but AC Pan sends out a value of 3 at 4 times the frequency. The effective multiplier is based on the physical min/max of the multiplier field: a logical min/max of [0,1] with a physical min/max of [1,8] means the multiplier is either 1 or 8. The Resolution Multiplier was introduced for high-resolution scrolling in Windows Vista and is commonly used on Microsoft mice. The recommendation for the Resolution Multiplier is to default to 1 for backwards compatibility. This patch adds an arbitrary upper limit of 255. The only known use case for the Resolution Multiplier is for scroll wheels, where the multiplier has to be a fraction of 120 to work with Windows.
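[ The effective multiplier described above is a linear mapping from the logical range onto the physical range. A hedged sketch of that arithmetic, multiplying before dividing so the integer math stays exact; for the common logical [0,1] / physical [1,8] case it yields 1 or 8, matching hid_calculate_multiplier() in the hunk below:

    /* Map logical value v in [lmin, lmax] onto physical [pmin, pmax];
     * fall back to the recommended default of 1 on nonsense values. */
    static int effective_multiplier(int v, int lmin, int lmax,
                                    int pmin, int pmax)
    {
            int m;

            if (lmax - lmin == 0)   /* avoid a zero divisor, default to 1 */
                    return 1;

            m = (v - lmin) * (pmax - pmin) / (lmax - lmin) + pmin;
            if (m == 0 || m > 255 || m < -255)  /* the arbitrary 255 cap */
                    m = 1;
            return m;
    }

For example, effective_multiplier(1, 0, 1, 1, 8) == 8, i.e. the device is in high-resolution mode and each report carries one eighth of a detent. ]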
Signed-off-by: Peter Hutterer Verified-by: Harry Cutts Signed-off-by: Benjamin Tissoires --- drivers/hid/hid-core.c | 170 +++++++++++++++++++++++++++++++++++++++++++++++++ include/linux/hid.h | 5 ++ 2 files changed, 175 insertions(+) (limited to 'include/linux') diff --git a/drivers/hid/hid-core.c b/drivers/hid/hid-core.c index 43d488a45120..f41d5fe51abe 100644 --- a/drivers/hid/hid-core.c +++ b/drivers/hid/hid-core.c @@ -294,6 +294,7 @@ static int hid_add_field(struct hid_parser *parser, unsigned report_type, unsign field->usage[i].collection_index = parser->local.collection_index[j]; field->usage[i].usage_index = i; + field->usage[i].resolution_multiplier = 1; } field->maxusage = usages; @@ -947,6 +948,167 @@ struct hid_report *hid_validate_values(struct hid_device *hid, } EXPORT_SYMBOL_GPL(hid_validate_values); +static int hid_calculate_multiplier(struct hid_device *hid, + struct hid_field *multiplier) +{ + int m; + __s32 v = *multiplier->value; + __s32 lmin = multiplier->logical_minimum; + __s32 lmax = multiplier->logical_maximum; + __s32 pmin = multiplier->physical_minimum; + __s32 pmax = multiplier->physical_maximum; + + /* + * "Because OS implementations will generally divide the control's + * reported count by the Effective Resolution Multiplier, designers + * should take care not to establish a potential Effective + * Resolution Multiplier of zero." + * HID Usage Table, v1.12, Section 4.3.1, p31 + */ + if (lmax - lmin == 0) + return 1; + /* + * Handling the unit exponent is left as an exercise to whoever + * finds a device where that exponent is not 0. + */ + m = ((v - lmin)/(lmax - lmin) * (pmax - pmin) + pmin); + if (unlikely(multiplier->unit_exponent != 0)) { + hid_warn(hid, + "unsupported Resolution Multiplier unit exponent %d\n", + multiplier->unit_exponent); + } + + /* There are no devices with an effective multiplier > 255 */ + if (unlikely(m == 0 || m > 255 || m < -255)) { + hid_warn(hid, "unsupported Resolution Multiplier %d\n", m); + m = 1; + } + + return m; +} + +static void hid_apply_multiplier_to_field(struct hid_device *hid, + struct hid_field *field, + struct hid_collection *multiplier_collection, + int effective_multiplier) +{ + struct hid_collection *collection; + struct hid_usage *usage; + int i; + + /* + * If multiplier_collection is NULL, the multiplier applies + * to all fields in the report. + * Otherwise, it is the Logical Collection the multiplier applies to + * but our field may be in a subcollection of that collection. + */ + for (i = 0; i < field->maxusage; i++) { + usage = &field->usage[i]; + + collection = &hid->collection[usage->collection_index]; + while (collection && collection != multiplier_collection) + collection = collection->parent; + + if (collection || multiplier_collection == NULL) + usage->resolution_multiplier = effective_multiplier; + + } +} + +static void hid_apply_multiplier(struct hid_device *hid, + struct hid_field *multiplier) +{ + struct hid_report_enum *rep_enum; + struct hid_report *rep; + struct hid_field *field; + struct hid_collection *multiplier_collection; + int effective_multiplier; + int i; + + /* + * "The Resolution Multiplier control must be contained in the same + * Logical Collection as the control(s) to which it is to be applied. + * If no Resolution Multiplier is defined, then the Resolution + * Multiplier defaults to 1. If more than one control exists in a + * Logical Collection, the Resolution Multiplier is associated with + * all controls in the collection. 
If no Logical Collection is + * defined, the Resolution Multiplier is associated with all + * controls in the report." + * HID Usage Table, v1.12, Section 4.3.1, p30 + * + * Thus, search from the current collection upwards until we find a + * logical collection. Then search all fields for that same parent + * collection. Those are the fields the multiplier applies to. + * + * If we have more than one multiplier, it will overwrite the + * applicable fields later. + */ + multiplier_collection = &hid->collection[multiplier->usage->collection_index]; + while (multiplier_collection && + multiplier_collection->type != HID_COLLECTION_LOGICAL) + multiplier_collection = multiplier_collection->parent; + + effective_multiplier = hid_calculate_multiplier(hid, multiplier); + + rep_enum = &hid->report_enum[HID_INPUT_REPORT]; + list_for_each_entry(rep, &rep_enum->report_list, list) { + for (i = 0; i < rep->maxfield; i++) { + field = rep->field[i]; + hid_apply_multiplier_to_field(hid, field, + multiplier_collection, + effective_multiplier); + } + } +} + +/* + * hid_setup_resolution_multiplier - set up all resolution multipliers + * + * @device: hid device + * + * Search for all Resolution Multiplier Feature Reports and apply their + * value to all matching Input items. This only updates the internal struct + * fields. + * + * The Resolution Multiplier is applied by the hardware. If the multiplier + * is anything other than 1, the hardware will send pre-multiplied events + * so that the same physical interaction generates an accumulated + * accumulated_value = value * * multiplier + * This may be achieved by sending + * - "value * multiplier" for each event, or + * - "value" but "multiplier" times as frequently, or + * - a combination of the above + * The only guarantee is that the same physical interaction always generates + * an accumulated 'value * multiplier'. + * + * This function must be called before any event processing and after + * any SetRequest to the Resolution Multiplier. + */ +void hid_setup_resolution_multiplier(struct hid_device *hid) +{ + struct hid_report_enum *rep_enum; + struct hid_report *rep; + struct hid_usage *usage; + int i, j; + + rep_enum = &hid->report_enum[HID_FEATURE_REPORT]; + list_for_each_entry(rep, &rep_enum->report_list, list) { + for (i = 0; i < rep->maxfield; i++) { + /* Ignore if report count is out of bounds. 
*/ + if (rep->field[i]->report_count < 1) + continue; + + for (j = 0; j < rep->field[i]->maxusage; j++) { + usage = &rep->field[i]->usage[j]; + if (usage->hid == HID_GD_RESOLUTION_MULTIPLIER) + hid_apply_multiplier(hid, + rep->field[i]); + } + } + } +} +EXPORT_SYMBOL_GPL(hid_setup_resolution_multiplier); + /** * hid_open_report - open a driver-specific device report * @@ -1043,9 +1205,17 @@ int hid_open_report(struct hid_device *device) hid_err(device, "unbalanced delimiter at end of report description\n"); goto err; } + + /* + * fetch initial values in case the device's + * default multiplier isn't the recommended 1 + */ + hid_setup_resolution_multiplier(device); + kfree(parser->collection_stack); vfree(parser); device->status |= HID_STAT_PARSED; + return 0; } } diff --git a/include/linux/hid.h b/include/linux/hid.h index fdfda898656c..fd8d860365a4 100644 --- a/include/linux/hid.h +++ b/include/linux/hid.h @@ -219,6 +219,7 @@ struct hid_item { #define HID_GD_VBRZ 0x00010045 #define HID_GD_VNO 0x00010046 #define HID_GD_FEATURE 0x00010047 +#define HID_GD_RESOLUTION_MULTIPLIER 0x00010048 #define HID_GD_SYSTEM_CONTROL 0x00010080 #define HID_GD_UP 0x00010090 #define HID_GD_DOWN 0x00010091 @@ -437,6 +438,8 @@ struct hid_usage { unsigned hid; /* hid usage code */ unsigned collection_index; /* index into collection array */ unsigned usage_index; /* index into usage array */ + __s8 resolution_multiplier;/* Effective Resolution Multiplier + (HUT v1.12, 4.3.1), default: 1 */ /* hidinput data */ __u16 code; /* input driver code */ __u8 type; /* input driver type */ @@ -894,6 +897,8 @@ struct hid_report *hid_validate_values(struct hid_device *hid, unsigned int type, unsigned int id, unsigned int field_index, unsigned int report_counts); + +void hid_setup_resolution_multiplier(struct hid_device *hid); int hid_open_report(struct hid_device *device); int hid_check_keys_pressed(struct hid_device *hid); int hid_connect(struct hid_device *hid, unsigned int connect_mask); -- cgit v1.2.3-71-gd317 From 2dc702c991e3774af9d7ce410eef410ca9e2357e Mon Sep 17 00:00:00 2001 From: Peter Hutterer Date: Wed, 5 Dec 2018 10:42:24 +1000 Subject: HID: input: use the Resolution Multiplier for high-resolution scrolling Windows uses a magic number of 120 for a wheel click. High-resolution scroll wheels are supposed to use a fraction of 120 to signal smaller scroll steps. This is implemented by the Resolution Multiplier in the device itself. If the multiplier is present in the report descriptor, set it to the logical max and then use the resolution multiplier to calculate the high-resolution events. This is the recommendation by Microsoft, see http://msdn.microsoft.com/en-us/windows/hardware/gg487477.aspx Note that all mice encountered so far have a logical min/max of 0/1, so it's a binary "yes or no" to high-res scrolling anyway. To make userspace simpler, always enable the REL_WHEEL_HI_RES bit. Where the device doesn't support high-resolution scrolling, the value for the high-res data will simply be a multiple of 120 every time. For userspace, if REL_WHEEL_HI_RES is available that is the one to be used. Potential side-effect: a device with a Resolution Multiplier applying to other Input items will have those items set to the logical max as well. This cannot easily be worked around but it is doubtful such devices exist. 
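[ The 120-unit convention above reduces to simple accumulator arithmetic. A self-contained hedged sketch of the calculation in hidinput_handle_scroll() below; printf() stands in for the input events, with REL_WHEEL carrying lo_res and REL_WHEEL_HI_RES carrying hi_res. It uses the Sculpt AC Pan numbers from the previous commit: multiplier 12, value 3 per event, so one legacy click fires every fourth event:

    #include <stdio.h>

    static int wheel_accumulated;   /* like usage->wheel_accumulated */

    static void handle_scroll(int value, int multiplier)
    {
            int hi_res = value * 120 / multiplier;  /* fraction of a notch */
            int lo_res;

            wheel_accumulated += hi_res;
            lo_res = wheel_accumulated / 120;       /* whole notches so far */
            if (lo_res)
                    wheel_accumulated -= lo_res * 120;

            printf("REL_WHEEL=%d REL_WHEEL_HI_RES=%d\n", lo_res, hi_res);
    }

    int main(void)
    {
            int i;

            /* AC Pan on the Sculpt: each event is 3 * 120/12 = 30 hi-res
             * units; the fourth event completes 120 and emits one click. */
            for (i = 0; i < 4; i++)
                    handle_scroll(3, 12);
            return 0;
    }

The accumulator is why low-resolution consumers keep working unchanged: they see exactly one REL_WHEEL click per physical detent regardless of how finely the hardware slices it. ]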
Signed-off-by: Peter Hutterer Verified-by: Harry Cutts Signed-off-by: Benjamin Tissoires --- drivers/hid/hid-input.c | 108 ++++++++++++++++++++++++++++++++++++++++++++++-- include/linux/hid.h | 3 ++ 2 files changed, 108 insertions(+), 3 deletions(-) (limited to 'include/linux') diff --git a/drivers/hid/hid-input.c b/drivers/hid/hid-input.c index d6fab5798487..59a5608b8dc0 100644 --- a/drivers/hid/hid-input.c +++ b/drivers/hid/hid-input.c @@ -712,7 +712,15 @@ static void hidinput_configure_usage(struct hid_input *hidinput, struct hid_fiel map_abs_clear(usage->hid & 0xf); break; - case HID_GD_SLIDER: case HID_GD_DIAL: case HID_GD_WHEEL: + case HID_GD_WHEEL: + if (field->flags & HID_MAIN_ITEM_RELATIVE) { + set_bit(REL_WHEEL, input->relbit); + map_rel(REL_WHEEL_HI_RES); + } else { + map_abs(usage->hid & 0xf); + } + break; + case HID_GD_SLIDER: case HID_GD_DIAL: if (field->flags & HID_MAIN_ITEM_RELATIVE) map_rel(usage->hid & 0xf); else @@ -1012,7 +1020,10 @@ static void hidinput_configure_usage(struct hid_input *hidinput, struct hid_fiel case 0x22f: map_key_clear(KEY_ZOOMRESET); break; case 0x233: map_key_clear(KEY_SCROLLUP); break; case 0x234: map_key_clear(KEY_SCROLLDOWN); break; - case 0x238: map_rel(REL_HWHEEL); break; + case 0x238: /* AC Pan */ + set_bit(REL_HWHEEL, input->relbit); + map_rel(REL_HWHEEL_HI_RES); + break; case 0x23d: map_key_clear(KEY_EDIT); break; case 0x25f: map_key_clear(KEY_CANCEL); break; case 0x269: map_key_clear(KEY_INSERT); break; @@ -1200,6 +1211,38 @@ ignore: } +static void hidinput_handle_scroll(struct hid_usage *usage, + struct input_dev *input, + __s32 value) +{ + int code; + int hi_res, lo_res; + + if (value == 0) + return; + + if (usage->code == REL_WHEEL_HI_RES) + code = REL_WHEEL; + else + code = REL_HWHEEL; + + /* + * Windows reports one wheel click as value 120. Where a high-res + * scroll wheel is present, a fraction of 120 is reported instead. + * Our REL_WHEEL_HI_RES axis does the same because all HW must + * adhere to the 120 expectation. 
+ */ + hi_res = value * 120/usage->resolution_multiplier; + + usage->wheel_accumulated += hi_res; + lo_res = usage->wheel_accumulated/120; + if (lo_res) + usage->wheel_accumulated -= lo_res * 120; + + input_event(input, EV_REL, code, lo_res); + input_event(input, EV_REL, usage->code, hi_res); +} + void hidinput_hid_event(struct hid_device *hid, struct hid_field *field, struct hid_usage *usage, __s32 value) { struct input_dev *input; @@ -1262,6 +1305,12 @@ void hidinput_hid_event(struct hid_device *hid, struct hid_field *field, struct if ((usage->type == EV_KEY) && (usage->code == 0)) /* Key 0 is "unassigned", not KEY_UNKNOWN */ return; + if ((usage->type == EV_REL) && (usage->code == REL_WHEEL_HI_RES || + usage->code == REL_HWHEEL_HI_RES)) { + hidinput_handle_scroll(usage, input, value); + return; + } + if ((usage->type == EV_ABS) && (field->flags & HID_MAIN_ITEM_RELATIVE) && (usage->code == ABS_VOLUME)) { int count = abs(value); @@ -1489,6 +1538,58 @@ static void hidinput_close(struct input_dev *dev) hid_hw_close(hid); } +static void hidinput_change_resolution_multipliers(struct hid_device *hid) +{ + struct hid_report_enum *rep_enum; + struct hid_report *rep; + struct hid_usage *usage; + int i, j; + + rep_enum = &hid->report_enum[HID_FEATURE_REPORT]; + list_for_each_entry(rep, &rep_enum->report_list, list) { + bool update_needed = false; + + if (rep->maxfield == 0) + continue; + + /* + * If we have more than one feature within this report we + * need to fill in the bits from the others before we can + * overwrite the ones for the Resolution Multiplier. + */ + if (rep->maxfield > 1) { + hid_hw_request(hid, rep, HID_REQ_GET_REPORT); + hid_hw_wait(hid); + } + + for (i = 0; i < rep->maxfield; i++) { + __s32 logical_max = rep->field[i]->logical_maximum; + + /* There is no good reason for a Resolution + * Multiplier to have a count other than 1. + * Ignore that case. 
+ */ + if (rep->field[i]->report_count != 1) + continue; + + for (j = 0; j < rep->field[i]->maxusage; j++) { + usage = &rep->field[i]->usage[j]; + + if (usage->hid != HID_GD_RESOLUTION_MULTIPLIER) + continue; + + *rep->field[i]->value = logical_max; + update_needed = true; + } + } + if (update_needed) + hid_hw_request(hid, rep, HID_REQ_SET_REPORT); + } + + /* refresh our structs */ + hid_setup_resolution_multiplier(hid); +} + static void report_features(struct hid_device *hid) { struct hid_driver *drv = hid->driver; @@ -1782,6 +1883,8 @@ int hidinput_connect(struct hid_device *hid, unsigned int force) } } + hidinput_change_resolution_multipliers(hid); + list_for_each_entry_safe(hidinput, next, &hid->inputs, list) { if (drv->input_configured && drv->input_configured(hid, hidinput)) @@ -1840,4 +1943,3 @@ void hidinput_disconnect(struct hid_device *hid) cancel_work_sync(&hid->led_work); } EXPORT_SYMBOL_GPL(hidinput_disconnect); - diff --git a/include/linux/hid.h b/include/linux/hid.h index fd8d860365a4..93db548f8761 100644 --- a/include/linux/hid.h +++ b/include/linux/hid.h @@ -233,6 +233,7 @@ struct hid_item { #define HID_DC_BATTERYSTRENGTH 0x00060020 #define HID_CP_CONSUMER_CONTROL 0x000c0001 +#define HID_CP_AC_PAN 0x000c0238 #define HID_DG_DIGITIZER 0x000d0001 #define HID_DG_PEN 0x000d0002 @@ -441,11 +442,13 @@ struct hid_usage { __s8 resolution_multiplier;/* Effective Resolution Multiplier (HUT v1.12, 4.3.1), default: 1 */ /* hidinput data */ + __s8 wheel_factor; /* 120/resolution_multiplier */ __u16 code; /* input driver code */ __u8 type; /* input driver type */ __s8 hat_min; /* hat switch fun */ __s8 hat_max; /* ditto */ __s8 hat_dir; /* ditto */ + __s16 wheel_accumulated; /* hi-res wheel */ }; struct hid_input; -- cgit v1.2.3-71-gd317 From 356ff8a9a78fb35d6482584d260c3754dcbdf669 Mon Sep 17 00:00:00 2001 From: David Rientjes Date: Fri, 7 Dec 2018 14:50:16 -0800 Subject: Revert "mm, thp: consolidate THP gfp handling into alloc_hugepage_direct_gfpmask" This reverts commit 89c83fb539f95491be80cdd5158e6f0ce329e317. This should have been done as part of 2f0799a0ffc0 ("mm, thp: restore node-local hugepage allocations"). The movement of the thp allocation policy from alloc_pages_vma() to alloc_hugepage_direct_gfpmask() was intended to only set __GFP_THISNODE for mempolicies that are not MPOL_BIND whereas the revert could set this regardless of mempolicy. While the check for MPOL_BIND between alloc_hugepage_direct_gfpmask() and alloc_pages_vma() was racy, that has since been removed since the revert. What is left is the possibility to use __GFP_THISNODE in policy_node() when it is unexpected because the special handling for hugepages in alloc_pages_vma() was removed as part of the consolidation. Secondly, prior to 89c83fb539f9, alloc_pages_vma() implemented a somewhat different policy for hugepage allocations, which were allocated through alloc_hugepage_vma(). For hugepage allocations, if the allocating process's node is in the set of allowed nodes, allocate with __GFP_THISNODE for that node (for MPOL_PREFERRED, use that node with __GFP_THISNODE instead). This was changed for shmem_alloc_hugepage() to allow fallback to other nodes in 89c83fb539f9 as it did for new_page() in mm/mempolicy.c which is functionally different behavior and removes the requirement to only allocate hugepages locally. So this commit does a full revert of 89c83fb539f9 instead of the partial revert that was done in 2f0799a0ffc0. The result is the same thp allocation policy for 4.20 that was in 4.19. 
Fixes: 89c83fb539f9 ("mm, thp: consolidate THP gfp handling into alloc_hugepage_direct_gfpmask")
Fixes: 2f0799a0ffc0 ("mm, thp: restore node-local hugepage allocations")
Signed-off-by: David Rientjes
Acked-by: Vlastimil Babka
Cc: Andrea Arcangeli
Cc: Mel Gorman
Cc: Michal Hocko
Cc: Andrew Morton
Signed-off-by: Linus Torvalds
---
 include/linux/gfp.h | 12 ++++++++----
 mm/huge_memory.c    | 27 +++++++++++++--------------
 mm/mempolicy.c      | 32 +++++++++++++++++++++++++++++---
 mm/shmem.c          |  2 +-
 4 files changed, 51 insertions(+), 22 deletions(-)
(limited to 'include/linux')

diff --git a/include/linux/gfp.h b/include/linux/gfp.h
index 76f8db0b0e71..0705164f928c 100644
--- a/include/linux/gfp.h
+++ b/include/linux/gfp.h
@@ -510,18 +510,22 @@ alloc_pages(gfp_t gfp_mask, unsigned int order)
 }
 extern struct page *alloc_pages_vma(gfp_t gfp_mask, int order,
 			struct vm_area_struct *vma, unsigned long addr,
-			int node);
+			int node, bool hugepage);
+#define alloc_hugepage_vma(gfp_mask, vma, addr, order) \
+	alloc_pages_vma(gfp_mask, order, vma, addr, numa_node_id(), true)
 #else
 #define alloc_pages(gfp_mask, order) \
 		alloc_pages_node(numa_node_id(), gfp_mask, order)
-#define alloc_pages_vma(gfp_mask, order, vma, addr, node)\
+#define alloc_pages_vma(gfp_mask, order, vma, addr, node, false)\
+	alloc_pages(gfp_mask, order)
+#define alloc_hugepage_vma(gfp_mask, vma, addr, order) \
 	alloc_pages(gfp_mask, order)
 #endif
 #define alloc_page(gfp_mask) alloc_pages(gfp_mask, 0)
 #define alloc_page_vma(gfp_mask, vma, addr)			\
-	alloc_pages_vma(gfp_mask, 0, vma, addr, numa_node_id())
+	alloc_pages_vma(gfp_mask, 0, vma, addr, numa_node_id(), false)
 #define alloc_page_vma_node(gfp_mask, vma, addr, node)		\
-	alloc_pages_vma(gfp_mask, 0, vma, addr, node)
+	alloc_pages_vma(gfp_mask, 0, vma, addr, node, false)
 
 extern unsigned long __get_free_pages(gfp_t gfp_mask, unsigned int order);
 extern unsigned long get_zeroed_page(gfp_t gfp_mask);
diff --git a/mm/huge_memory.c b/mm/huge_memory.c
index f2d19e4fe854..5da55b38b1b7 100644
--- a/mm/huge_memory.c
+++ b/mm/huge_memory.c
@@ -629,30 +629,30 @@ release:
  *	    available
  * never: never stall for any thp allocation
  */
-static inline gfp_t alloc_hugepage_direct_gfpmask(struct vm_area_struct *vma, unsigned long addr)
+static inline gfp_t alloc_hugepage_direct_gfpmask(struct vm_area_struct *vma)
 {
 	const bool vma_madvised = !!(vma->vm_flags & VM_HUGEPAGE);
-	const gfp_t gfp_mask = GFP_TRANSHUGE_LIGHT | __GFP_THISNODE;
 
 	/* Always do synchronous compaction */
 	if (test_bit(TRANSPARENT_HUGEPAGE_DEFRAG_DIRECT_FLAG, &transparent_hugepage_flags))
-		return GFP_TRANSHUGE | __GFP_THISNODE |
-			(vma_madvised ? 0 : __GFP_NORETRY);
+		return GFP_TRANSHUGE | (vma_madvised ? 0 : __GFP_NORETRY);
 
 	/* Kick kcompactd and fail quickly */
 	if (test_bit(TRANSPARENT_HUGEPAGE_DEFRAG_KSWAPD_FLAG, &transparent_hugepage_flags))
-		return gfp_mask | __GFP_KSWAPD_RECLAIM;
+		return GFP_TRANSHUGE_LIGHT | __GFP_KSWAPD_RECLAIM;
 
 	/* Synchronous compaction if madvised, otherwise kick kcompactd */
 	if (test_bit(TRANSPARENT_HUGEPAGE_DEFRAG_KSWAPD_OR_MADV_FLAG, &transparent_hugepage_flags))
-		return gfp_mask | (vma_madvised ? __GFP_DIRECT_RECLAIM :
-						  __GFP_KSWAPD_RECLAIM);
+		return GFP_TRANSHUGE_LIGHT |
+			(vma_madvised ? __GFP_DIRECT_RECLAIM :
+					__GFP_KSWAPD_RECLAIM);
 
 	/* Only do synchronous compaction if madvised */
 	if (test_bit(TRANSPARENT_HUGEPAGE_DEFRAG_REQ_MADV_FLAG, &transparent_hugepage_flags))
-		return gfp_mask | (vma_madvised ? __GFP_DIRECT_RECLAIM : 0);
+		return GFP_TRANSHUGE_LIGHT |
+			(vma_madvised ? __GFP_DIRECT_RECLAIM : 0);
 
-	return gfp_mask;
+	return GFP_TRANSHUGE_LIGHT;
 }
 
 /* Caller must hold page table lock. */
@@ -724,8 +724,8 @@ vm_fault_t do_huge_pmd_anonymous_page(struct vm_fault *vmf)
 			pte_free(vma->vm_mm, pgtable);
 		return ret;
 	}
-	gfp = alloc_hugepage_direct_gfpmask(vma, haddr);
-	page = alloc_pages_vma(gfp, HPAGE_PMD_ORDER, vma, haddr, numa_node_id());
+	gfp = alloc_hugepage_direct_gfpmask(vma);
+	page = alloc_hugepage_vma(gfp, vma, haddr, HPAGE_PMD_ORDER);
 	if (unlikely(!page)) {
 		count_vm_event(THP_FAULT_FALLBACK);
 		return VM_FAULT_FALLBACK;
@@ -1295,9 +1295,8 @@ vm_fault_t do_huge_pmd_wp_page(struct vm_fault *vmf, pmd_t orig_pmd)
 alloc:
 	if (transparent_hugepage_enabled(vma) &&
 	    !transparent_hugepage_debug_cow()) {
-		huge_gfp = alloc_hugepage_direct_gfpmask(vma, haddr);
-		new_page = alloc_pages_vma(huge_gfp, HPAGE_PMD_ORDER, vma,
-				haddr, numa_node_id());
+		huge_gfp = alloc_hugepage_direct_gfpmask(vma);
+		new_page = alloc_hugepage_vma(huge_gfp, vma, haddr, HPAGE_PMD_ORDER);
 	} else
 		new_page = NULL;
 
diff --git a/mm/mempolicy.c b/mm/mempolicy.c
index 69e278b469ef..d4496d9d34f5 100644
--- a/mm/mempolicy.c
+++ b/mm/mempolicy.c
@@ -1116,8 +1116,8 @@ static struct page *new_page(struct page *page, unsigned long start)
 	} else if (PageTransHuge(page)) {
 		struct page *thp;
 
-		thp = alloc_pages_vma(GFP_TRANSHUGE, HPAGE_PMD_ORDER, vma,
-				address, numa_node_id());
+		thp = alloc_hugepage_vma(GFP_TRANSHUGE, vma, address,
+					 HPAGE_PMD_ORDER);
 		if (!thp)
 			return NULL;
 		prep_transhuge_page(thp);
@@ -2011,6 +2011,7 @@ static struct page *alloc_page_interleave(gfp_t gfp, unsigned order,
 * @vma: Pointer to VMA or NULL if not available.
 * @addr: Virtual Address of the allocation. Must be inside the VMA.
 * @node: Which node to prefer for allocation (modulo policy).
+ * @hugepage: for hugepages try only the preferred node if possible
 *
 * This function allocates a page from the kernel page pool and applies
 * a NUMA policy associated with the VMA or the current process.
@@ -2021,7 +2022,7 @@ static struct page *alloc_page_interleave(gfp_t gfp, unsigned order,
 */
 struct page *
 alloc_pages_vma(gfp_t gfp, int order, struct vm_area_struct *vma,
-		unsigned long addr, int node)
+		unsigned long addr, int node, bool hugepage)
 {
 	struct mempolicy *pol;
 	struct page *page;
@@ -2039,6 +2040,31 @@ alloc_pages_vma(gfp_t gfp, int order, struct vm_area_struct *vma,
 		goto out;
 	}
 
+	if (unlikely(IS_ENABLED(CONFIG_TRANSPARENT_HUGEPAGE) && hugepage)) {
+		int hpage_node = node;
+
+		/*
+		 * For hugepage allocation and non-interleave policy which
+		 * allows the current node (or other explicitly preferred
+		 * node) we only try to allocate from the current/preferred
+		 * node and don't fall back to other nodes, as the cost of
+		 * remote accesses would likely offset THP benefits.
+		 *
+		 * If the policy is interleave, or does not allow the current
+		 * node in its nodemask, we allocate the standard way.
+		 */
+		if (pol->mode == MPOL_PREFERRED && !(pol->flags & MPOL_F_LOCAL))
+			hpage_node = pol->v.preferred_node;
+
+		nmask = policy_nodemask(gfp, pol);
+		if (!nmask || node_isset(hpage_node, *nmask)) {
+			mpol_cond_put(pol);
+			page = __alloc_pages_node(hpage_node,
+						gfp | __GFP_THISNODE, order);
+			goto out;
+		}
+	}
+
 	nmask = policy_nodemask(gfp, pol);
 	preferred_nid = policy_node(gfp, pol, node);
 	page = __alloc_pages_nodemask(gfp, order, preferred_nid, nmask);
diff --git a/mm/shmem.c b/mm/shmem.c
index cddc72ac44d8..921f80488bb3 100644
--- a/mm/shmem.c
+++ b/mm/shmem.c
@@ -1439,7 +1439,7 @@ static struct page *shmem_alloc_hugepage(gfp_t gfp,
 	shmem_pseudo_vma_init(&pvma, info, hindex);
 	page = alloc_pages_vma(gfp | __GFP_COMP | __GFP_NORETRY | __GFP_NOWARN,
-			HPAGE_PMD_ORDER, &pvma, 0, numa_node_id());
+			HPAGE_PMD_ORDER, &pvma, 0, numa_node_id(), true);
 	shmem_pseudo_vma_destroy(&pvma);
 	if (page)
 		prep_transhuge_page(page);
-- cgit v1.2.3-71-gd317
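For quick reference, the caller-side pattern restored by this revert (a
condensed sketch based on the do_huge_pmd_anonymous_page() hunk in
mm/huge_memory.c above, not a verbatim quote) looks like this:

	gfp_t gfp;
	struct page *page;

	/* defrag policy only; node placement is no longer decided here */
	gfp = alloc_hugepage_direct_gfpmask(vma);

	/*
	 * alloc_hugepage_vma() expands to alloc_pages_vma(gfp,
	 * HPAGE_PMD_ORDER, vma, haddr, numa_node_id(), true), so
	 * __GFP_THISNODE is applied inside alloc_pages_vma(), not by
	 * the callers.
	 */
	page = alloc_hugepage_vma(gfp, vma, haddr, HPAGE_PMD_ORDER);
	if (unlikely(!page)) {
		count_vm_event(THP_FAULT_FALLBACK);
		return VM_FAULT_FALLBACK;
	}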