From a7c9c25a99bbdaff51da26b874d2faaa8fdd72b5 Mon Sep 17 00:00:00 2001
From: Hangbin Liu <liuhangbin@gmail.com>
Date: Tue, 23 Feb 2021 21:14:57 +0800
Subject: bpf: Remove blank line in bpf helper description comment

Commit 34b2021cc616 ("bpf: Add BPF-helper for MTU checking") added an extra
blank line in bpf helper description. This will make bpf_helpers_doc.py stop
building bpf_helper_defs.h immediately after bpf_check_mtu(), which will
affect future added functions.

Fixes: 34b2021cc616 ("bpf: Add BPF-helper for MTU checking")
Signed-off-by: Hangbin Liu <liuhangbin@gmail.com>
Signed-off-by: Daniel Borkmann <daniel@iogearbox.net>
Acked-by: Jesper Dangaard Brouer <brouer@redhat.com>
Link: https://lore.kernel.org/bpf/20210223131457.1378978-1-liuhangbin@gmail.com
---
 include/uapi/linux/bpf.h | 1 -
 1 file changed, 1 deletion(-)

(limited to 'include/uapi/linux')

diff --git a/include/uapi/linux/bpf.h b/include/uapi/linux/bpf.h
index 4c24daa43bac..79c893310492 100644
--- a/include/uapi/linux/bpf.h
+++ b/include/uapi/linux/bpf.h
@@ -3850,7 +3850,6 @@ union bpf_attr {
  *
  * long bpf_check_mtu(void *ctx, u32 ifindex, u32 *mtu_len, s32 len_diff, u64 flags)
  *	Description
-
  *		Check ctx packet size against exceeding MTU of net device (based
  *		on *ifindex*).  This helper will likely be used in combination
  *		with helpers that adjust/change the packet size.
-- 
cgit v1.2.3-71-gd317


From a83586a7ddba25065ec37323c05deb9019ce4fa9 Mon Sep 17 00:00:00 2001
From: Hangbin Liu <liuhangbin@gmail.com>
Date: Tue, 23 Feb 2021 21:14:57 +0800
Subject: bpf: Remove blank line in bpf helper description comment

Commit 34b2021cc616 ("bpf: Add BPF-helper for MTU checking") added an extra
blank line in bpf helper description. This will make bpf_helpers_doc.py stop
building bpf_helper_defs.h immediately after bpf_check_mtu(), which will
affect future added functions.

Fixes: 34b2021cc616 ("bpf: Add BPF-helper for MTU checking")
Signed-off-by: Hangbin Liu <liuhangbin@gmail.com>
Signed-off-by: Daniel Borkmann <daniel@iogearbox.net>
Acked-by: Jesper Dangaard Brouer <brouer@redhat.com>
Link: https://lore.kernel.org/bpf/20210223131457.1378978-1-liuhangbin@gmail.com
Signed-off-by: Alexei Starovoitov <ast@kernel.org>
---
 include/uapi/linux/bpf.h       | 1 -
 tools/include/uapi/linux/bpf.h | 1 -
 2 files changed, 2 deletions(-)

(limited to 'include/uapi/linux')

diff --git a/include/uapi/linux/bpf.h b/include/uapi/linux/bpf.h
index 4c24daa43bac..79c893310492 100644
--- a/include/uapi/linux/bpf.h
+++ b/include/uapi/linux/bpf.h
@@ -3850,7 +3850,6 @@ union bpf_attr {
  *
  * long bpf_check_mtu(void *ctx, u32 ifindex, u32 *mtu_len, s32 len_diff, u64 flags)
  *	Description
-
  *		Check ctx packet size against exceeding MTU of net device (based
  *		on *ifindex*).  This helper will likely be used in combination
  *		with helpers that adjust/change the packet size.
diff --git a/tools/include/uapi/linux/bpf.h b/tools/include/uapi/linux/bpf.h
index 4c24daa43bac..79c893310492 100644
--- a/tools/include/uapi/linux/bpf.h
+++ b/tools/include/uapi/linux/bpf.h
@@ -3850,7 +3850,6 @@ union bpf_attr {
  *
  * long bpf_check_mtu(void *ctx, u32 ifindex, u32 *mtu_len, s32 len_diff, u64 flags)
  *	Description
-
  *		Check ctx packet size against exceeding MTU of net device (based
  *		on *ifindex*).  This helper will likely be used in combination
  *		with helpers that adjust/change the packet size.
-- 
cgit v1.2.3-71-gd317


From 69c087ba6225b574afb6e505b72cb75242a3d844 Mon Sep 17 00:00:00 2001
From: Yonghong Song <yhs@fb.com>
Date: Fri, 26 Feb 2021 12:49:25 -0800
Subject: bpf: Add bpf_for_each_map_elem() helper

The bpf_for_each_map_elem() helper is introduced which
iterates all map elements with a callback function. The
helper signature looks like
  long bpf_for_each_map_elem(map, callback_fn, callback_ctx, flags)
and for each map element, the callback_fn will be called. For example,
like hashmap, the callback signature may look like
  long callback_fn(map, key, val, callback_ctx)

There are two known use cases for this. One is from upstream ([1]) where
a for_each_map_elem helper may help implement a timeout mechanism
in a more generic way. Another is from our internal discussion
for a firewall use case where a map contains all the rules. The packet
data can be compared to all these rules to decide allow or deny
the packet.

For array maps, users can already use a bounded loop to traverse
elements. Using this helper can avoid using bounded loop. For other
type of maps (e.g., hash maps) where bounded loop is hard or
impossible to use, this helper provides a convenient way to
operate on all elements.

For callback_fn, besides map and map element, a callback_ctx,
allocated on caller stack, is also passed to the callback
function. This callback_ctx argument can provide additional
input and allow to write to caller stack for output.

If the callback_fn returns 0, the helper will iterate through next
element if available. If the callback_fn returns 1, the helper
will stop iterating and returns to the bpf program. Other return
values are not used for now.

Currently, this helper is only available with jit. It is possible
to make it work with interpreter with so effort but I leave it
as the future work.

[1]: https://lore.kernel.org/bpf/20210122205415.113822-1-xiyou.wangcong@gmail.com/

Signed-off-by: Yonghong Song <yhs@fb.com>
Signed-off-by: Alexei Starovoitov <ast@kernel.org>
Acked-by: Andrii Nakryiko <andrii@kernel.org>
Link: https://lore.kernel.org/bpf/20210226204925.3884923-1-yhs@fb.com
---
 include/linux/bpf.h            |  13 +++
 include/linux/bpf_verifier.h   |   3 +
 include/uapi/linux/bpf.h       |  38 ++++++++
 kernel/bpf/bpf_iter.c          |  16 ++++
 kernel/bpf/helpers.c           |   2 +
 kernel/bpf/verifier.c          | 208 ++++++++++++++++++++++++++++++++++++++---
 kernel/trace/bpf_trace.c       |   2 +
 tools/include/uapi/linux/bpf.h |  38 ++++++++
 8 files changed, 307 insertions(+), 13 deletions(-)

(limited to 'include/uapi/linux')

diff --git a/include/linux/bpf.h b/include/linux/bpf.h
index e1e4d2f60527..aeb1b93a4d75 100644
--- a/include/linux/bpf.h
+++ b/include/linux/bpf.h
@@ -39,6 +39,7 @@ struct bpf_local_storage;
 struct bpf_local_storage_map;
 struct kobject;
 struct mem_cgroup;
+struct bpf_func_state;
 
 extern struct idr btf_idr;
 extern spinlock_t btf_idr_lock;
@@ -129,6 +130,13 @@ struct bpf_map_ops {
 	bool (*map_meta_equal)(const struct bpf_map *meta0,
 			       const struct bpf_map *meta1);
 
+
+	int (*map_set_for_each_callback_args)(struct bpf_verifier_env *env,
+					      struct bpf_func_state *caller,
+					      struct bpf_func_state *callee);
+	int (*map_for_each_callback)(struct bpf_map *map, void *callback_fn,
+				     void *callback_ctx, u64 flags);
+
 	/* BTF name and id of struct allocated by map_alloc */
 	const char * const map_btf_name;
 	int *map_btf_id;
@@ -295,6 +303,8 @@ enum bpf_arg_type {
 	ARG_CONST_ALLOC_SIZE_OR_ZERO,	/* number of allocated bytes requested */
 	ARG_PTR_TO_BTF_ID_SOCK_COMMON,	/* pointer to in-kernel sock_common or bpf-mirrored bpf_sock */
 	ARG_PTR_TO_PERCPU_BTF_ID,	/* pointer to in-kernel percpu type */
+	ARG_PTR_TO_FUNC,	/* pointer to a bpf program function */
+	ARG_PTR_TO_STACK_OR_NULL,	/* pointer to stack or NULL */
 	__BPF_ARG_TYPE_MAX,
 };
 
@@ -411,6 +421,8 @@ enum bpf_reg_type {
 	PTR_TO_RDWR_BUF,	 /* reg points to a read/write buffer */
 	PTR_TO_RDWR_BUF_OR_NULL, /* reg points to a read/write buffer or NULL */
 	PTR_TO_PERCPU_BTF_ID,	 /* reg points to a percpu kernel variable */
+	PTR_TO_FUNC,		 /* reg points to a bpf program function */
+	PTR_TO_MAP_KEY,		 /* reg points to a map element key */
 };
 
 /* The information passed from prog-specific *_is_valid_access
@@ -1887,6 +1899,7 @@ extern const struct bpf_func_proto bpf_sock_from_file_proto;
 extern const struct bpf_func_proto bpf_get_socket_ptr_cookie_proto;
 extern const struct bpf_func_proto bpf_task_storage_get_proto;
 extern const struct bpf_func_proto bpf_task_storage_delete_proto;
+extern const struct bpf_func_proto bpf_for_each_map_elem_proto;
 
 const struct bpf_func_proto *bpf_tracing_func_proto(
 	enum bpf_func_id func_id, const struct bpf_prog *prog);
diff --git a/include/linux/bpf_verifier.h b/include/linux/bpf_verifier.h
index 971b33aca13d..51c2ffa3d901 100644
--- a/include/linux/bpf_verifier.h
+++ b/include/linux/bpf_verifier.h
@@ -68,6 +68,8 @@ struct bpf_reg_state {
 			unsigned long raw1;
 			unsigned long raw2;
 		} raw;
+
+		u32 subprogno; /* for PTR_TO_FUNC */
 	};
 	/* For PTR_TO_PACKET, used to find other pointers with the same variable
 	 * offset, so they can share range knowledge.
@@ -204,6 +206,7 @@ struct bpf_func_state {
 	int acquired_refs;
 	struct bpf_reference_state *refs;
 	int allocated_stack;
+	bool in_callback_fn;
 	struct bpf_stack_state *stack;
 };
 
diff --git a/include/uapi/linux/bpf.h b/include/uapi/linux/bpf.h
index 79c893310492..b89af20cfa19 100644
--- a/include/uapi/linux/bpf.h
+++ b/include/uapi/linux/bpf.h
@@ -393,6 +393,15 @@ enum bpf_link_type {
  *                   is struct/union.
  */
 #define BPF_PSEUDO_BTF_ID	3
+/* insn[0].src_reg:  BPF_PSEUDO_FUNC
+ * insn[0].imm:      insn offset to the func
+ * insn[1].imm:      0
+ * insn[0].off:      0
+ * insn[1].off:      0
+ * ldimm64 rewrite:  address of the function
+ * verifier type:    PTR_TO_FUNC.
+ */
+#define BPF_PSEUDO_FUNC		4
 
 /* when bpf_call->src_reg == BPF_PSEUDO_CALL, bpf_call->imm == pc-relative
  * offset to another bpf function
@@ -3909,6 +3918,34 @@ union bpf_attr {
  *		* **BPF_MTU_CHK_RET_FRAG_NEEDED**
  *		* **BPF_MTU_CHK_RET_SEGS_TOOBIG**
  *
+ * long bpf_for_each_map_elem(struct bpf_map *map, void *callback_fn, void *callback_ctx, u64 flags)
+ *	Description
+ *		For each element in **map**, call **callback_fn** function with
+ *		**map**, **callback_ctx** and other map-specific parameters.
+ *		The **callback_fn** should be a static function and
+ *		the **callback_ctx** should be a pointer to the stack.
+ *		The **flags** is used to control certain aspects of the helper.
+ *		Currently, the **flags** must be 0.
+ *
+ *		The following are a list of supported map types and their
+ *		respective expected callback signatures:
+ *
+ *		BPF_MAP_TYPE_HASH, BPF_MAP_TYPE_PERCPU_HASH,
+ *		BPF_MAP_TYPE_LRU_HASH, BPF_MAP_TYPE_LRU_PERCPU_HASH,
+ *		BPF_MAP_TYPE_ARRAY, BPF_MAP_TYPE_PERCPU_ARRAY
+ *
+ *		long (\*callback_fn)(struct bpf_map \*map, const void \*key, void \*value, void \*ctx);
+ *
+ *		For per_cpu maps, the map_value is the value on the cpu where the
+ *		bpf_prog is running.
+ *
+ *		If **callback_fn** return 0, the helper will continue to the next
+ *		element. If return value is 1, the helper will skip the rest of
+ *		elements and return. Other return values are not used now.
+ *
+ *	Return
+ *		The number of traversed map elements for success, **-EINVAL** for
+ *		invalid **flags**.
  */
 #define __BPF_FUNC_MAPPER(FN)		\
 	FN(unspec),			\
@@ -4075,6 +4112,7 @@ union bpf_attr {
 	FN(ima_inode_hash),		\
 	FN(sock_from_file),		\
 	FN(check_mtu),			\
+	FN(for_each_map_elem),		\
 	/* */
 
 /* integer value in 'imm' field of BPF_CALL instruction selects which helper
diff --git a/kernel/bpf/bpf_iter.c b/kernel/bpf/bpf_iter.c
index a0d9eade9c80..931870f9cf56 100644
--- a/kernel/bpf/bpf_iter.c
+++ b/kernel/bpf/bpf_iter.c
@@ -675,3 +675,19 @@ int bpf_iter_run_prog(struct bpf_prog *prog, void *ctx)
 	 */
 	return ret == 0 ? 0 : -EAGAIN;
 }
+
+BPF_CALL_4(bpf_for_each_map_elem, struct bpf_map *, map, void *, callback_fn,
+	   void *, callback_ctx, u64, flags)
+{
+	return map->ops->map_for_each_callback(map, callback_fn, callback_ctx, flags);
+}
+
+const struct bpf_func_proto bpf_for_each_map_elem_proto = {
+	.func		= bpf_for_each_map_elem,
+	.gpl_only	= false,
+	.ret_type	= RET_INTEGER,
+	.arg1_type	= ARG_CONST_MAP_PTR,
+	.arg2_type	= ARG_PTR_TO_FUNC,
+	.arg3_type	= ARG_PTR_TO_STACK_OR_NULL,
+	.arg4_type	= ARG_ANYTHING,
+};
diff --git a/kernel/bpf/helpers.c b/kernel/bpf/helpers.c
index 308427fe03a3..074800226327 100644
--- a/kernel/bpf/helpers.c
+++ b/kernel/bpf/helpers.c
@@ -708,6 +708,8 @@ bpf_base_func_proto(enum bpf_func_id func_id)
 		return &bpf_ringbuf_discard_proto;
 	case BPF_FUNC_ringbuf_query:
 		return &bpf_ringbuf_query_proto;
+	case BPF_FUNC_for_each_map_elem:
+		return &bpf_for_each_map_elem_proto;
 	default:
 		break;
 	}
diff --git a/kernel/bpf/verifier.c b/kernel/bpf/verifier.c
index dbdca49ac6cc..53afe9461b03 100644
--- a/kernel/bpf/verifier.c
+++ b/kernel/bpf/verifier.c
@@ -234,6 +234,12 @@ static bool bpf_pseudo_call(const struct bpf_insn *insn)
 	       insn->src_reg == BPF_PSEUDO_CALL;
 }
 
+static bool bpf_pseudo_func(const struct bpf_insn *insn)
+{
+	return insn->code == (BPF_LD | BPF_IMM | BPF_DW) &&
+	       insn->src_reg == BPF_PSEUDO_FUNC;
+}
+
 struct bpf_call_arg_meta {
 	struct bpf_map *map_ptr;
 	bool raw_mode;
@@ -248,6 +254,7 @@ struct bpf_call_arg_meta {
 	u32 btf_id;
 	struct btf *ret_btf;
 	u32 ret_btf_id;
+	u32 subprogno;
 };
 
 struct btf *btf_vmlinux;
@@ -427,6 +434,7 @@ static bool reg_type_not_null(enum bpf_reg_type type)
 	return type == PTR_TO_SOCKET ||
 		type == PTR_TO_TCP_SOCK ||
 		type == PTR_TO_MAP_VALUE ||
+		type == PTR_TO_MAP_KEY ||
 		type == PTR_TO_SOCK_COMMON;
 }
 
@@ -469,7 +477,8 @@ static bool arg_type_may_be_null(enum bpf_arg_type type)
 	       type == ARG_PTR_TO_MEM_OR_NULL ||
 	       type == ARG_PTR_TO_CTX_OR_NULL ||
 	       type == ARG_PTR_TO_SOCKET_OR_NULL ||
-	       type == ARG_PTR_TO_ALLOC_MEM_OR_NULL;
+	       type == ARG_PTR_TO_ALLOC_MEM_OR_NULL ||
+	       type == ARG_PTR_TO_STACK_OR_NULL;
 }
 
 /* Determine whether the function releases some resources allocated by another
@@ -552,6 +561,8 @@ static const char * const reg_type_str[] = {
 	[PTR_TO_RDONLY_BUF_OR_NULL] = "rdonly_buf_or_null",
 	[PTR_TO_RDWR_BUF]	= "rdwr_buf",
 	[PTR_TO_RDWR_BUF_OR_NULL] = "rdwr_buf_or_null",
+	[PTR_TO_FUNC]		= "func",
+	[PTR_TO_MAP_KEY]	= "map_key",
 };
 
 static char slot_type_char[] = {
@@ -623,6 +634,7 @@ static void print_verifier_state(struct bpf_verifier_env *env,
 			if (type_is_pkt_pointer(t))
 				verbose(env, ",r=%d", reg->range);
 			else if (t == CONST_PTR_TO_MAP ||
+				 t == PTR_TO_MAP_KEY ||
 				 t == PTR_TO_MAP_VALUE ||
 				 t == PTR_TO_MAP_VALUE_OR_NULL)
 				verbose(env, ",ks=%d,vs=%d",
@@ -1555,6 +1567,19 @@ static int check_subprogs(struct bpf_verifier_env *env)
 
 	/* determine subprog starts. The end is one before the next starts */
 	for (i = 0; i < insn_cnt; i++) {
+		if (bpf_pseudo_func(insn + i)) {
+			if (!env->bpf_capable) {
+				verbose(env,
+					"function pointers are allowed for CAP_BPF and CAP_SYS_ADMIN\n");
+				return -EPERM;
+			}
+			ret = add_subprog(env, i + insn[i].imm + 1);
+			if (ret < 0)
+				return ret;
+			/* remember subprog */
+			insn[i + 1].imm = ret;
+			continue;
+		}
 		if (!bpf_pseudo_call(insn + i))
 			continue;
 		if (!env->bpf_capable) {
@@ -2286,6 +2311,8 @@ static bool is_spillable_regtype(enum bpf_reg_type type)
 	case PTR_TO_PERCPU_BTF_ID:
 	case PTR_TO_MEM:
 	case PTR_TO_MEM_OR_NULL:
+	case PTR_TO_FUNC:
+	case PTR_TO_MAP_KEY:
 		return true;
 	default:
 		return false;
@@ -2890,6 +2917,10 @@ static int __check_mem_access(struct bpf_verifier_env *env, int regno,
 
 	reg = &cur_regs(env)[regno];
 	switch (reg->type) {
+	case PTR_TO_MAP_KEY:
+		verbose(env, "invalid access to map key, key_size=%d off=%d size=%d\n",
+			mem_size, off, size);
+		break;
 	case PTR_TO_MAP_VALUE:
 		verbose(env, "invalid access to map value, value_size=%d off=%d size=%d\n",
 			mem_size, off, size);
@@ -3295,6 +3326,9 @@ static int check_ptr_alignment(struct bpf_verifier_env *env,
 	case PTR_TO_FLOW_KEYS:
 		pointer_desc = "flow keys ";
 		break;
+	case PTR_TO_MAP_KEY:
+		pointer_desc = "key ";
+		break;
 	case PTR_TO_MAP_VALUE:
 		pointer_desc = "value ";
 		break;
@@ -3396,7 +3430,7 @@ process_func:
 continue_func:
 	subprog_end = subprog[idx + 1].start;
 	for (; i < subprog_end; i++) {
-		if (!bpf_pseudo_call(insn + i))
+		if (!bpf_pseudo_call(insn + i) && !bpf_pseudo_func(insn + i))
 			continue;
 		/* remember insn and function to return to */
 		ret_insn[frame] = i + 1;
@@ -3833,7 +3867,19 @@ static int check_mem_access(struct bpf_verifier_env *env, int insn_idx, u32 regn
 	/* for access checks, reg->off is just part of off */
 	off += reg->off;
 
-	if (reg->type == PTR_TO_MAP_VALUE) {
+	if (reg->type == PTR_TO_MAP_KEY) {
+		if (t == BPF_WRITE) {
+			verbose(env, "write to change key R%d not allowed\n", regno);
+			return -EACCES;
+		}
+
+		err = check_mem_region_access(env, regno, off, size,
+					      reg->map_ptr->key_size, false);
+		if (err)
+			return err;
+		if (value_regno >= 0)
+			mark_reg_unknown(env, regs, value_regno);
+	} else if (reg->type == PTR_TO_MAP_VALUE) {
 		if (t == BPF_WRITE && value_regno >= 0 &&
 		    is_pointer_value(env, value_regno)) {
 			verbose(env, "R%d leaks addr into map\n", value_regno);
@@ -4249,6 +4295,9 @@ static int check_helper_mem_access(struct bpf_verifier_env *env, int regno,
 	case PTR_TO_PACKET_META:
 		return check_packet_access(env, regno, reg->off, access_size,
 					   zero_size_allowed);
+	case PTR_TO_MAP_KEY:
+		return check_mem_region_access(env, regno, reg->off, access_size,
+					       reg->map_ptr->key_size, false);
 	case PTR_TO_MAP_VALUE:
 		if (check_map_access_type(env, regno, reg->off, access_size,
 					  meta && meta->raw_mode ? BPF_WRITE :
@@ -4465,6 +4514,7 @@ static const struct bpf_reg_types map_key_value_types = {
 		PTR_TO_STACK,
 		PTR_TO_PACKET,
 		PTR_TO_PACKET_META,
+		PTR_TO_MAP_KEY,
 		PTR_TO_MAP_VALUE,
 	},
 };
@@ -4496,6 +4546,7 @@ static const struct bpf_reg_types mem_types = {
 		PTR_TO_STACK,
 		PTR_TO_PACKET,
 		PTR_TO_PACKET_META,
+		PTR_TO_MAP_KEY,
 		PTR_TO_MAP_VALUE,
 		PTR_TO_MEM,
 		PTR_TO_RDONLY_BUF,
@@ -4508,6 +4559,7 @@ static const struct bpf_reg_types int_ptr_types = {
 		PTR_TO_STACK,
 		PTR_TO_PACKET,
 		PTR_TO_PACKET_META,
+		PTR_TO_MAP_KEY,
 		PTR_TO_MAP_VALUE,
 	},
 };
@@ -4520,6 +4572,8 @@ static const struct bpf_reg_types const_map_ptr_types = { .types = { CONST_PTR_T
 static const struct bpf_reg_types btf_ptr_types = { .types = { PTR_TO_BTF_ID } };
 static const struct bpf_reg_types spin_lock_types = { .types = { PTR_TO_MAP_VALUE } };
 static const struct bpf_reg_types percpu_btf_ptr_types = { .types = { PTR_TO_PERCPU_BTF_ID } };
+static const struct bpf_reg_types func_ptr_types = { .types = { PTR_TO_FUNC } };
+static const struct bpf_reg_types stack_ptr_types = { .types = { PTR_TO_STACK } };
 
 static const struct bpf_reg_types *compatible_reg_types[__BPF_ARG_TYPE_MAX] = {
 	[ARG_PTR_TO_MAP_KEY]		= &map_key_value_types,
@@ -4548,6 +4602,8 @@ static const struct bpf_reg_types *compatible_reg_types[__BPF_ARG_TYPE_MAX] = {
 	[ARG_PTR_TO_INT]		= &int_ptr_types,
 	[ARG_PTR_TO_LONG]		= &int_ptr_types,
 	[ARG_PTR_TO_PERCPU_BTF_ID]	= &percpu_btf_ptr_types,
+	[ARG_PTR_TO_FUNC]		= &func_ptr_types,
+	[ARG_PTR_TO_STACK_OR_NULL]	= &stack_ptr_types,
 };
 
 static int check_reg_type(struct bpf_verifier_env *env, u32 regno,
@@ -4729,6 +4785,8 @@ skip_type_check:
 			verbose(env, "verifier internal error\n");
 			return -EFAULT;
 		}
+	} else if (arg_type == ARG_PTR_TO_FUNC) {
+		meta->subprogno = reg->subprogno;
 	} else if (arg_type_is_mem_ptr(arg_type)) {
 		/* The access to this pointer is only checked when we hit the
 		 * next is_mem_size argument below.
@@ -5375,6 +5433,35 @@ static int check_func_call(struct bpf_verifier_env *env, struct bpf_insn *insn,
 	return __check_func_call(env, insn, insn_idx, subprog, set_callee_state);
 }
 
+static int set_map_elem_callback_state(struct bpf_verifier_env *env,
+				       struct bpf_func_state *caller,
+				       struct bpf_func_state *callee,
+				       int insn_idx)
+{
+	struct bpf_insn_aux_data *insn_aux = &env->insn_aux_data[insn_idx];
+	struct bpf_map *map;
+	int err;
+
+	if (bpf_map_ptr_poisoned(insn_aux)) {
+		verbose(env, "tail_call abusing map_ptr\n");
+		return -EINVAL;
+	}
+
+	map = BPF_MAP_PTR(insn_aux->map_ptr_state);
+	if (!map->ops->map_set_for_each_callback_args ||
+	    !map->ops->map_for_each_callback) {
+		verbose(env, "callback function not allowed for map\n");
+		return -ENOTSUPP;
+	}
+
+	err = map->ops->map_set_for_each_callback_args(env, caller, callee);
+	if (err)
+		return err;
+
+	callee->in_callback_fn = true;
+	return 0;
+}
+
 static int prepare_func_exit(struct bpf_verifier_env *env, int *insn_idx)
 {
 	struct bpf_verifier_state *state = env->cur_state;
@@ -5397,8 +5484,22 @@ static int prepare_func_exit(struct bpf_verifier_env *env, int *insn_idx)
 
 	state->curframe--;
 	caller = state->frame[state->curframe];
-	/* return to the caller whatever r0 had in the callee */
-	caller->regs[BPF_REG_0] = *r0;
+	if (callee->in_callback_fn) {
+		/* enforce R0 return value range [0, 1]. */
+		struct tnum range = tnum_range(0, 1);
+
+		if (r0->type != SCALAR_VALUE) {
+			verbose(env, "R0 not a scalar value\n");
+			return -EACCES;
+		}
+		if (!tnum_in(range, r0->var_off)) {
+			verbose_invalid_scalar(env, r0, &range, "callback return", "R0");
+			return -EINVAL;
+		}
+	} else {
+		/* return to the caller whatever r0 had in the callee */
+		caller->regs[BPF_REG_0] = *r0;
+	}
 
 	/* Transfer references to the caller */
 	err = transfer_reference_state(caller, callee);
@@ -5453,7 +5554,8 @@ record_func_map(struct bpf_verifier_env *env, struct bpf_call_arg_meta *meta,
 	    func_id != BPF_FUNC_map_delete_elem &&
 	    func_id != BPF_FUNC_map_push_elem &&
 	    func_id != BPF_FUNC_map_pop_elem &&
-	    func_id != BPF_FUNC_map_peek_elem)
+	    func_id != BPF_FUNC_map_peek_elem &&
+	    func_id != BPF_FUNC_for_each_map_elem)
 		return 0;
 
 	if (map == NULL) {
@@ -5534,15 +5636,18 @@ static int check_reference_leak(struct bpf_verifier_env *env)
 	return state->acquired_refs ? -EINVAL : 0;
 }
 
-static int check_helper_call(struct bpf_verifier_env *env, int func_id, int insn_idx)
+static int check_helper_call(struct bpf_verifier_env *env, struct bpf_insn *insn,
+			     int *insn_idx_p)
 {
 	const struct bpf_func_proto *fn = NULL;
 	struct bpf_reg_state *regs;
 	struct bpf_call_arg_meta meta;
+	int insn_idx = *insn_idx_p;
 	bool changes_data;
-	int i, err;
+	int i, err, func_id;
 
 	/* find function prototype */
+	func_id = insn->imm;
 	if (func_id < 0 || func_id >= __BPF_FUNC_MAX_ID) {
 		verbose(env, "invalid func %s#%d\n", func_id_name(func_id),
 			func_id);
@@ -5638,6 +5743,13 @@ static int check_helper_call(struct bpf_verifier_env *env, int func_id, int insn
 		return -EINVAL;
 	}
 
+	if (func_id == BPF_FUNC_for_each_map_elem) {
+		err = __check_func_call(env, insn, insn_idx_p, meta.subprogno,
+					set_map_elem_callback_state);
+		if (err < 0)
+			return -EINVAL;
+	}
+
 	/* reset caller saved regs */
 	for (i = 0; i < CALLER_SAVED_REGS; i++) {
 		mark_reg_not_init(env, regs, caller_saved[i]);
@@ -5891,6 +6003,19 @@ static int retrieve_ptr_limit(const struct bpf_reg_state *ptr_reg,
 		else
 			*ptr_limit = -off;
 		return 0;
+	case PTR_TO_MAP_KEY:
+		/* Currently, this code is not exercised as the only use
+		 * is bpf_for_each_map_elem() helper which requires
+		 * bpf_capble. The code has been tested manually for
+		 * future use.
+		 */
+		if (mask_to_left) {
+			*ptr_limit = ptr_reg->umax_value + ptr_reg->off;
+		} else {
+			off = ptr_reg->smin_value + ptr_reg->off;
+			*ptr_limit = ptr_reg->map_ptr->key_size - off;
+		}
+		return 0;
 	case PTR_TO_MAP_VALUE:
 		if (mask_to_left) {
 			*ptr_limit = ptr_reg->umax_value + ptr_reg->off;
@@ -6092,6 +6217,7 @@ static int adjust_ptr_min_max_vals(struct bpf_verifier_env *env,
 		verbose(env, "R%d pointer arithmetic on %s prohibited\n",
 			dst, reg_type_str[ptr_reg->type]);
 		return -EACCES;
+	case PTR_TO_MAP_KEY:
 	case PTR_TO_MAP_VALUE:
 		if (!env->allow_ptr_leaks && !known && (smin_val < 0) != (smax_val < 0)) {
 			verbose(env, "R%d has unknown scalar with mixed signed bounds, pointer arithmetic with it prohibited for !root\n",
@@ -8271,6 +8397,24 @@ static int check_ld_imm(struct bpf_verifier_env *env, struct bpf_insn *insn)
 		return 0;
 	}
 
+	if (insn->src_reg == BPF_PSEUDO_FUNC) {
+		struct bpf_prog_aux *aux = env->prog->aux;
+		u32 subprogno = insn[1].imm;
+
+		if (!aux->func_info) {
+			verbose(env, "missing btf func_info\n");
+			return -EINVAL;
+		}
+		if (aux->func_info_aux[subprogno].linkage != BTF_FUNC_STATIC) {
+			verbose(env, "callback function not static\n");
+			return -EINVAL;
+		}
+
+		dst_reg->type = PTR_TO_FUNC;
+		dst_reg->subprogno = subprogno;
+		return 0;
+	}
+
 	map = env->used_maps[aux->map_index];
 	mark_reg_known_zero(env, regs, insn->dst_reg);
 	dst_reg->map_ptr = map;
@@ -8657,6 +8801,9 @@ static int visit_insn(int t, int insn_cnt, struct bpf_verifier_env *env)
 	struct bpf_insn *insns = env->prog->insnsi;
 	int ret;
 
+	if (bpf_pseudo_func(insns + t))
+		return visit_func_call_insn(t, insn_cnt, insns, env, true);
+
 	/* All non-branch instructions have a single fall-through edge. */
 	if (BPF_CLASS(insns[t].code) != BPF_JMP &&
 	    BPF_CLASS(insns[t].code) != BPF_JMP32)
@@ -9277,6 +9424,7 @@ static bool regsafe(struct bpf_reg_state *rold, struct bpf_reg_state *rcur,
 			 */
 			return false;
 		}
+	case PTR_TO_MAP_KEY:
 	case PTR_TO_MAP_VALUE:
 		/* If the new min/max/var_off satisfy the old ones and
 		 * everything else matches, we are OK.
@@ -10123,10 +10271,9 @@ static int do_check(struct bpf_verifier_env *env)
 				if (insn->src_reg == BPF_PSEUDO_CALL)
 					err = check_func_call(env, insn, &env->insn_idx);
 				else
-					err = check_helper_call(env, insn->imm, env->insn_idx);
+					err = check_helper_call(env, insn, &env->insn_idx);
 				if (err)
 					return err;
-
 			} else if (opcode == BPF_JA) {
 				if (BPF_SRC(insn->code) != BPF_K ||
 				    insn->imm != 0 ||
@@ -10555,6 +10702,12 @@ static int resolve_pseudo_ldimm64(struct bpf_verifier_env *env)
 				goto next_insn;
 			}
 
+			if (insn[0].src_reg == BPF_PSEUDO_FUNC) {
+				aux = &env->insn_aux_data[i];
+				aux->ptr_type = PTR_TO_FUNC;
+				goto next_insn;
+			}
+
 			/* In final convert_pseudo_ld_imm64() step, this is
 			 * converted into regular 64-bit imm load insn.
 			 */
@@ -10687,9 +10840,13 @@ static void convert_pseudo_ld_imm64(struct bpf_verifier_env *env)
 	int insn_cnt = env->prog->len;
 	int i;
 
-	for (i = 0; i < insn_cnt; i++, insn++)
-		if (insn->code == (BPF_LD | BPF_IMM | BPF_DW))
-			insn->src_reg = 0;
+	for (i = 0; i < insn_cnt; i++, insn++) {
+		if (insn->code != (BPF_LD | BPF_IMM | BPF_DW))
+			continue;
+		if (insn->src_reg == BPF_PSEUDO_FUNC)
+			continue;
+		insn->src_reg = 0;
+	}
 }
 
 /* single env->prog->insni[off] instruction was replaced with the range
@@ -11330,6 +11487,12 @@ static int jit_subprogs(struct bpf_verifier_env *env)
 		return 0;
 
 	for (i = 0, insn = prog->insnsi; i < prog->len; i++, insn++) {
+		if (bpf_pseudo_func(insn)) {
+			env->insn_aux_data[i].call_imm = insn->imm;
+			/* subprog is encoded in insn[1].imm */
+			continue;
+		}
+
 		if (!bpf_pseudo_call(insn))
 			continue;
 		/* Upon error here we cannot fall back to interpreter but
@@ -11459,6 +11622,12 @@ static int jit_subprogs(struct bpf_verifier_env *env)
 	for (i = 0; i < env->subprog_cnt; i++) {
 		insn = func[i]->insnsi;
 		for (j = 0; j < func[i]->len; j++, insn++) {
+			if (bpf_pseudo_func(insn)) {
+				subprog = insn[1].imm;
+				insn[0].imm = (u32)(long)func[subprog]->bpf_func;
+				insn[1].imm = ((u64)(long)func[subprog]->bpf_func) >> 32;
+				continue;
+			}
 			if (!bpf_pseudo_call(insn))
 				continue;
 			subprog = insn->off;
@@ -11504,6 +11673,11 @@ static int jit_subprogs(struct bpf_verifier_env *env)
 	 * later look the same as if they were interpreted only.
 	 */
 	for (i = 0, insn = prog->insnsi; i < prog->len; i++, insn++) {
+		if (bpf_pseudo_func(insn)) {
+			insn[0].imm = env->insn_aux_data[i].call_imm;
+			insn[1].imm = find_subprog(env, i + insn[0].imm + 1);
+			continue;
+		}
 		if (!bpf_pseudo_call(insn))
 			continue;
 		insn->off = env->insn_aux_data[i].call_imm;
@@ -11568,6 +11742,14 @@ static int fixup_call_args(struct bpf_verifier_env *env)
 		return -EINVAL;
 	}
 	for (i = 0; i < prog->len; i++, insn++) {
+		if (bpf_pseudo_func(insn)) {
+			/* When JIT fails the progs with callback calls
+			 * have to be rejected, since interpreter doesn't support them yet.
+			 */
+			verbose(env, "callbacks are not allowed in non-JITed programs\n");
+			return -EINVAL;
+		}
+
 		if (!bpf_pseudo_call(insn))
 			continue;
 		depth = get_callee_stack_depth(env, insn, i);
diff --git a/kernel/trace/bpf_trace.c b/kernel/trace/bpf_trace.c
index e9701744d8e4..0d23755c2747 100644
--- a/kernel/trace/bpf_trace.c
+++ b/kernel/trace/bpf_trace.c
@@ -1371,6 +1371,8 @@ bpf_tracing_func_proto(enum bpf_func_id func_id, const struct bpf_prog *prog)
 		return &bpf_task_storage_get_proto;
 	case BPF_FUNC_task_storage_delete:
 		return &bpf_task_storage_delete_proto;
+	case BPF_FUNC_for_each_map_elem:
+		return &bpf_for_each_map_elem_proto;
 	default:
 		return NULL;
 	}
diff --git a/tools/include/uapi/linux/bpf.h b/tools/include/uapi/linux/bpf.h
index 79c893310492..b89af20cfa19 100644
--- a/tools/include/uapi/linux/bpf.h
+++ b/tools/include/uapi/linux/bpf.h
@@ -393,6 +393,15 @@ enum bpf_link_type {
  *                   is struct/union.
  */
 #define BPF_PSEUDO_BTF_ID	3
+/* insn[0].src_reg:  BPF_PSEUDO_FUNC
+ * insn[0].imm:      insn offset to the func
+ * insn[1].imm:      0
+ * insn[0].off:      0
+ * insn[1].off:      0
+ * ldimm64 rewrite:  address of the function
+ * verifier type:    PTR_TO_FUNC.
+ */
+#define BPF_PSEUDO_FUNC		4
 
 /* when bpf_call->src_reg == BPF_PSEUDO_CALL, bpf_call->imm == pc-relative
  * offset to another bpf function
@@ -3909,6 +3918,34 @@ union bpf_attr {
  *		* **BPF_MTU_CHK_RET_FRAG_NEEDED**
  *		* **BPF_MTU_CHK_RET_SEGS_TOOBIG**
  *
+ * long bpf_for_each_map_elem(struct bpf_map *map, void *callback_fn, void *callback_ctx, u64 flags)
+ *	Description
+ *		For each element in **map**, call **callback_fn** function with
+ *		**map**, **callback_ctx** and other map-specific parameters.
+ *		The **callback_fn** should be a static function and
+ *		the **callback_ctx** should be a pointer to the stack.
+ *		The **flags** is used to control certain aspects of the helper.
+ *		Currently, the **flags** must be 0.
+ *
+ *		The following are a list of supported map types and their
+ *		respective expected callback signatures:
+ *
+ *		BPF_MAP_TYPE_HASH, BPF_MAP_TYPE_PERCPU_HASH,
+ *		BPF_MAP_TYPE_LRU_HASH, BPF_MAP_TYPE_LRU_PERCPU_HASH,
+ *		BPF_MAP_TYPE_ARRAY, BPF_MAP_TYPE_PERCPU_ARRAY
+ *
+ *		long (\*callback_fn)(struct bpf_map \*map, const void \*key, void \*value, void \*ctx);
+ *
+ *		For per_cpu maps, the map_value is the value on the cpu where the
+ *		bpf_prog is running.
+ *
+ *		If **callback_fn** return 0, the helper will continue to the next
+ *		element. If return value is 1, the helper will skip the rest of
+ *		elements and return. Other return values are not used now.
+ *
+ *	Return
+ *		The number of traversed map elements for success, **-EINVAL** for
+ *		invalid **flags**.
  */
 #define __BPF_FUNC_MAPPER(FN)		\
 	FN(unspec),			\
@@ -4075,6 +4112,7 @@ union bpf_attr {
 	FN(ima_inode_hash),		\
 	FN(sock_from_file),		\
 	FN(check_mtu),			\
+	FN(for_each_map_elem),		\
 	/* */
 
 /* integer value in 'imm' field of BPF_CALL instruction selects which helper
-- 
cgit v1.2.3-71-gd317


From c33cb0020ee6dd96cc9976d6085a7d8422f6dbed Mon Sep 17 00:00:00 2001
From: "Dmitry V. Levin" <ldv@altlinux.org>
Date: Mon, 22 Feb 2021 08:00:00 +0000
Subject: uapi: nfnetlink_cthelper.h: fix userspace compilation error

Apparently, <linux/netfilter/nfnetlink_cthelper.h> and
<linux/netfilter/nfnetlink_acct.h> could not be included into the same
compilation unit because of a cut-and-paste typo in the former header.

Fixes: 12f7a505331e6 ("netfilter: add user-space connection tracking helper infrastructure")
Cc: <stable@vger.kernel.org> # v3.6
Signed-off-by: Dmitry V. Levin <ldv@altlinux.org>
Signed-off-by: Pablo Neira Ayuso <pablo@netfilter.org>
---
 include/uapi/linux/netfilter/nfnetlink_cthelper.h | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

(limited to 'include/uapi/linux')

diff --git a/include/uapi/linux/netfilter/nfnetlink_cthelper.h b/include/uapi/linux/netfilter/nfnetlink_cthelper.h
index a13137afc429..70af02092d16 100644
--- a/include/uapi/linux/netfilter/nfnetlink_cthelper.h
+++ b/include/uapi/linux/netfilter/nfnetlink_cthelper.h
@@ -5,7 +5,7 @@
 #define NFCT_HELPER_STATUS_DISABLED	0
 #define NFCT_HELPER_STATUS_ENABLED	1
 
-enum nfnl_acct_msg_types {
+enum nfnl_cthelper_msg_types {
 	NFNL_MSG_CTHELPER_NEW,
 	NFNL_MSG_CTHELPER_GET,
 	NFNL_MSG_CTHELPER_DEL,
-- 
cgit v1.2.3-71-gd317


From 3e59e8856758eb5a2dfe1f831ef53b168fd58105 Mon Sep 17 00:00:00 2001
From: Matthias Schiffer <mschiffer@universe-factory.net>
Date: Wed, 3 Mar 2021 16:50:49 +0100
Subject: net: l2tp: reduce log level of messages in receive path, add counter
 instead

Commit 5ee759cda51b ("l2tp: use standard API for warning log messages")
changed a number of warnings about invalid packets in the receive path
so that they are always shown, instead of only when a special L2TP debug
flag is set. Even with rate limiting these warnings can easily cause
significant log spam - potentially triggered by a malicious party
sending invalid packets on purpose.

In addition these warnings were noticed by projects like Tunneldigger [1],
which uses L2TP for its data path, but implements its own control
protocol (which is sufficiently different from L2TP data packets that it
would always be passed up to userspace even with future extensions of
L2TP).

Some of the warnings were already redundant, as l2tp_stats has a counter
for these packets. This commit adds one additional counter for invalid
packets that are passed up to userspace. Packets with unknown session are
not counted as invalid, as there is nothing wrong with the format of
these packets.

With the additional counter, all of these messages are either redundant
or benign, so we reduce them to pr_debug_ratelimited().

[1] https://github.com/wlanslovenija/tunneldigger/issues/160

Fixes: 5ee759cda51b ("l2tp: use standard API for warning log messages")
Signed-off-by: Matthias Schiffer <mschiffer@universe-factory.net>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 include/uapi/linux/l2tp.h |  1 +
 net/l2tp/l2tp_core.c      | 41 ++++++++++++++++++++++-------------------
 net/l2tp/l2tp_core.h      |  1 +
 net/l2tp/l2tp_netlink.c   |  6 ++++++
 4 files changed, 30 insertions(+), 19 deletions(-)

(limited to 'include/uapi/linux')

diff --git a/include/uapi/linux/l2tp.h b/include/uapi/linux/l2tp.h
index 30c80d5ba4bf..bab8c9708611 100644
--- a/include/uapi/linux/l2tp.h
+++ b/include/uapi/linux/l2tp.h
@@ -145,6 +145,7 @@ enum {
 	L2TP_ATTR_RX_ERRORS,		/* u64 */
 	L2TP_ATTR_STATS_PAD,
 	L2TP_ATTR_RX_COOKIE_DISCARDS,	/* u64 */
+	L2TP_ATTR_RX_INVALID,		/* u64 */
 	__L2TP_ATTR_STATS_MAX,
 };
 
diff --git a/net/l2tp/l2tp_core.c b/net/l2tp/l2tp_core.c
index 7be5103ff2a8..203890e378cb 100644
--- a/net/l2tp/l2tp_core.c
+++ b/net/l2tp/l2tp_core.c
@@ -649,9 +649,9 @@ void l2tp_recv_common(struct l2tp_session *session, struct sk_buff *skb,
 	/* Parse and check optional cookie */
 	if (session->peer_cookie_len > 0) {
 		if (memcmp(ptr, &session->peer_cookie[0], session->peer_cookie_len)) {
-			pr_warn_ratelimited("%s: cookie mismatch (%u/%u). Discarding.\n",
-					    tunnel->name, tunnel->tunnel_id,
-					    session->session_id);
+			pr_debug_ratelimited("%s: cookie mismatch (%u/%u). Discarding.\n",
+					     tunnel->name, tunnel->tunnel_id,
+					     session->session_id);
 			atomic_long_inc(&session->stats.rx_cookie_discards);
 			goto discard;
 		}
@@ -702,8 +702,8 @@ void l2tp_recv_common(struct l2tp_session *session, struct sk_buff *skb,
 		 * If user has configured mandatory sequence numbers, discard.
 		 */
 		if (session->recv_seq) {
-			pr_warn_ratelimited("%s: recv data has no seq numbers when required. Discarding.\n",
-					    session->name);
+			pr_debug_ratelimited("%s: recv data has no seq numbers when required. Discarding.\n",
+					     session->name);
 			atomic_long_inc(&session->stats.rx_seq_discards);
 			goto discard;
 		}
@@ -718,8 +718,8 @@ void l2tp_recv_common(struct l2tp_session *session, struct sk_buff *skb,
 			session->send_seq = 0;
 			l2tp_session_set_header_len(session, tunnel->version);
 		} else if (session->send_seq) {
-			pr_warn_ratelimited("%s: recv data has no seq numbers when required. Discarding.\n",
-					    session->name);
+			pr_debug_ratelimited("%s: recv data has no seq numbers when required. Discarding.\n",
+					     session->name);
 			atomic_long_inc(&session->stats.rx_seq_discards);
 			goto discard;
 		}
@@ -809,9 +809,9 @@ static int l2tp_udp_recv_core(struct l2tp_tunnel *tunnel, struct sk_buff *skb)
 
 	/* Short packet? */
 	if (!pskb_may_pull(skb, L2TP_HDR_SIZE_MAX)) {
-		pr_warn_ratelimited("%s: recv short packet (len=%d)\n",
-				    tunnel->name, skb->len);
-		goto error;
+		pr_debug_ratelimited("%s: recv short packet (len=%d)\n",
+				     tunnel->name, skb->len);
+		goto invalid;
 	}
 
 	/* Point to L2TP header */
@@ -824,9 +824,9 @@ static int l2tp_udp_recv_core(struct l2tp_tunnel *tunnel, struct sk_buff *skb)
 	/* Check protocol version */
 	version = hdrflags & L2TP_HDR_VER_MASK;
 	if (version != tunnel->version) {
-		pr_warn_ratelimited("%s: recv protocol version mismatch: got %d expected %d\n",
-				    tunnel->name, version, tunnel->version);
-		goto error;
+		pr_debug_ratelimited("%s: recv protocol version mismatch: got %d expected %d\n",
+				     tunnel->name, version, tunnel->version);
+		goto invalid;
 	}
 
 	/* Get length of L2TP packet */
@@ -834,7 +834,7 @@ static int l2tp_udp_recv_core(struct l2tp_tunnel *tunnel, struct sk_buff *skb)
 
 	/* If type is control packet, it is handled by userspace. */
 	if (hdrflags & L2TP_HDRFLAG_T)
-		goto error;
+		goto pass;
 
 	/* Skip flags */
 	ptr += 2;
@@ -863,21 +863,24 @@ static int l2tp_udp_recv_core(struct l2tp_tunnel *tunnel, struct sk_buff *skb)
 			l2tp_session_dec_refcount(session);
 
 		/* Not found? Pass to userspace to deal with */
-		pr_warn_ratelimited("%s: no session found (%u/%u). Passing up.\n",
-				    tunnel->name, tunnel_id, session_id);
-		goto error;
+		pr_debug_ratelimited("%s: no session found (%u/%u). Passing up.\n",
+				     tunnel->name, tunnel_id, session_id);
+		goto pass;
 	}
 
 	if (tunnel->version == L2TP_HDR_VER_3 &&
 	    l2tp_v3_ensure_opt_in_linear(session, skb, &ptr, &optr))
-		goto error;
+		goto invalid;
 
 	l2tp_recv_common(session, skb, ptr, optr, hdrflags, length);
 	l2tp_session_dec_refcount(session);
 
 	return 0;
 
-error:
+invalid:
+	atomic_long_inc(&tunnel->stats.rx_invalid);
+
+pass:
 	/* Put UDP header back */
 	__skb_push(skb, sizeof(struct udphdr));
 
diff --git a/net/l2tp/l2tp_core.h b/net/l2tp/l2tp_core.h
index cb21d906343e..98ea98eb9567 100644
--- a/net/l2tp/l2tp_core.h
+++ b/net/l2tp/l2tp_core.h
@@ -39,6 +39,7 @@ struct l2tp_stats {
 	atomic_long_t		rx_oos_packets;
 	atomic_long_t		rx_errors;
 	atomic_long_t		rx_cookie_discards;
+	atomic_long_t		rx_invalid;
 };
 
 struct l2tp_tunnel;
diff --git a/net/l2tp/l2tp_netlink.c b/net/l2tp/l2tp_netlink.c
index 83956c9ee1fc..96eb91be9238 100644
--- a/net/l2tp/l2tp_netlink.c
+++ b/net/l2tp/l2tp_netlink.c
@@ -428,6 +428,9 @@ static int l2tp_nl_tunnel_send(struct sk_buff *skb, u32 portid, u32 seq, int fla
 			      L2TP_ATTR_STATS_PAD) ||
 	    nla_put_u64_64bit(skb, L2TP_ATTR_RX_ERRORS,
 			      atomic_long_read(&tunnel->stats.rx_errors),
+			      L2TP_ATTR_STATS_PAD) ||
+	    nla_put_u64_64bit(skb, L2TP_ATTR_RX_INVALID,
+			      atomic_long_read(&tunnel->stats.rx_invalid),
 			      L2TP_ATTR_STATS_PAD))
 		goto nla_put_failure;
 	nla_nest_end(skb, nest);
@@ -771,6 +774,9 @@ static int l2tp_nl_session_send(struct sk_buff *skb, u32 portid, u32 seq, int fl
 			      L2TP_ATTR_STATS_PAD) ||
 	    nla_put_u64_64bit(skb, L2TP_ATTR_RX_ERRORS,
 			      atomic_long_read(&session->stats.rx_errors),
+			      L2TP_ATTR_STATS_PAD) ||
+	    nla_put_u64_64bit(skb, L2TP_ATTR_RX_INVALID,
+			      atomic_long_read(&session->stats.rx_invalid),
 			      L2TP_ATTR_STATS_PAD))
 		goto nla_put_failure;
 	nla_nest_end(skb, nest);
-- 
cgit v1.2.3-71-gd317


From 8fd886911a6a99acf4a8facf619a2e7b5225be78 Mon Sep 17 00:00:00 2001
From: Ilya Leoshkevich <iii@linux.ibm.com>
Date: Fri, 26 Feb 2021 21:22:47 +0100
Subject: bpf: Add BTF_KIND_FLOAT to uapi

Add a new kind value and expand the kind bitfield.

Signed-off-by: Ilya Leoshkevich <iii@linux.ibm.com>
Signed-off-by: Alexei Starovoitov <ast@kernel.org>
Acked-by: Yonghong Song <yhs@fb.com>
Link: https://lore.kernel.org/bpf/20210226202256.116518-2-iii@linux.ibm.com
---
 include/uapi/linux/btf.h       | 5 +++--
 tools/include/uapi/linux/btf.h | 5 +++--
 2 files changed, 6 insertions(+), 4 deletions(-)

(limited to 'include/uapi/linux')

diff --git a/include/uapi/linux/btf.h b/include/uapi/linux/btf.h
index 5a667107ad2c..d27b1708efe9 100644
--- a/include/uapi/linux/btf.h
+++ b/include/uapi/linux/btf.h
@@ -52,7 +52,7 @@ struct btf_type {
 	};
 };
 
-#define BTF_INFO_KIND(info)	(((info) >> 24) & 0x0f)
+#define BTF_INFO_KIND(info)	(((info) >> 24) & 0x1f)
 #define BTF_INFO_VLEN(info)	((info) & 0xffff)
 #define BTF_INFO_KFLAG(info)	((info) >> 31)
 
@@ -72,7 +72,8 @@ struct btf_type {
 #define BTF_KIND_FUNC_PROTO	13	/* Function Proto	*/
 #define BTF_KIND_VAR		14	/* Variable	*/
 #define BTF_KIND_DATASEC	15	/* Section	*/
-#define BTF_KIND_MAX		BTF_KIND_DATASEC
+#define BTF_KIND_FLOAT		16	/* Floating point	*/
+#define BTF_KIND_MAX		BTF_KIND_FLOAT
 #define NR_BTF_KINDS		(BTF_KIND_MAX + 1)
 
 /* For some specific BTF_KIND, "struct btf_type" is immediately
diff --git a/tools/include/uapi/linux/btf.h b/tools/include/uapi/linux/btf.h
index 5a667107ad2c..d27b1708efe9 100644
--- a/tools/include/uapi/linux/btf.h
+++ b/tools/include/uapi/linux/btf.h
@@ -52,7 +52,7 @@ struct btf_type {
 	};
 };
 
-#define BTF_INFO_KIND(info)	(((info) >> 24) & 0x0f)
+#define BTF_INFO_KIND(info)	(((info) >> 24) & 0x1f)
 #define BTF_INFO_VLEN(info)	((info) & 0xffff)
 #define BTF_INFO_KFLAG(info)	((info) >> 31)
 
@@ -72,7 +72,8 @@ struct btf_type {
 #define BTF_KIND_FUNC_PROTO	13	/* Function Proto	*/
 #define BTF_KIND_VAR		14	/* Variable	*/
 #define BTF_KIND_DATASEC	15	/* Section	*/
-#define BTF_KIND_MAX		BTF_KIND_DATASEC
+#define BTF_KIND_FLOAT		16	/* Floating point	*/
+#define BTF_KIND_MAX		BTF_KIND_FLOAT
 #define NR_BTF_KINDS		(BTF_KIND_MAX + 1)
 
 /* For some specific BTF_KIND, "struct btf_type" is immediately
-- 
cgit v1.2.3-71-gd317


From 7799e4d9d84f6f8231dfd9dca4da5f4b2f0aa932 Mon Sep 17 00:00:00 2001
From: Joe Stringer <joe@cilium.io>
Date: Tue, 2 Mar 2021 09:19:33 -0800
Subject: bpf: Import syscall arg documentation
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

These descriptions are present in the man-pages project from the
original submissions around 2015-2016. Import them so that they can be
kept up to date as developers extend the bpf syscall commands.

These descriptions follow the pattern used by scripts/bpf_helpers_doc.py
so that we can take advantage of the parser to generate more up-to-date
man page writing based upon these headers.

Some minor wording adjustments were made to make the descriptions
more consistent for the description / return format.

Signed-off-by: Joe Stringer <joe@cilium.io>
Signed-off-by: Alexei Starovoitov <ast@kernel.org>
Reviewed-by: Quentin Monnet <quentin@isovalent.com>
Acked-by: Toke Høiland-Jørgensen <toke@redhat.com>
Acked-by: Yonghong Song <yhs@fb.com>
Link: https://lore.kernel.org/bpf/20210302171947.2268128-2-joe@cilium.io

Co-authored-by: Alexei Starovoitov <ast@kernel.org>
Co-authored-by: Michael Kerrisk <mtk.manpages@gmail.com>
---
 include/uapi/linux/bpf.h | 122 ++++++++++++++++++++++++++++++++++++++++++++++-
 1 file changed, 121 insertions(+), 1 deletion(-)

(limited to 'include/uapi/linux')

diff --git a/include/uapi/linux/bpf.h b/include/uapi/linux/bpf.h
index b89af20cfa19..fb16c590e6d9 100644
--- a/include/uapi/linux/bpf.h
+++ b/include/uapi/linux/bpf.h
@@ -93,7 +93,127 @@ union bpf_iter_link_info {
 	} map;
 };
 
-/* BPF syscall commands, see bpf(2) man-page for details. */
+/* BPF syscall commands, see bpf(2) man-page for more details. */
+/**
+ * DOC: eBPF Syscall Preamble
+ *
+ * The operation to be performed by the **bpf**\ () system call is determined
+ * by the *cmd* argument. Each operation takes an accompanying argument,
+ * provided via *attr*, which is a pointer to a union of type *bpf_attr* (see
+ * below). The size argument is the size of the union pointed to by *attr*.
+ */
+/**
+ * DOC: eBPF Syscall Commands
+ *
+ * BPF_MAP_CREATE
+ *	Description
+ *		Create a map and return a file descriptor that refers to the
+ *		map. The close-on-exec file descriptor flag (see **fcntl**\ (2))
+ *		is automatically enabled for the new file descriptor.
+ *
+ *		Applying **close**\ (2) to the file descriptor returned by
+ *		**BPF_MAP_CREATE** will delete the map (but see NOTES).
+ *
+ *	Return
+ *		A new file descriptor (a nonnegative integer), or -1 if an
+ *		error occurred (in which case, *errno* is set appropriately).
+ *
+ * BPF_MAP_LOOKUP_ELEM
+ *	Description
+ *		Look up an element with a given *key* in the map referred to
+ *		by the file descriptor *map_fd*.
+ *
+ *	Return
+ *		Returns zero on success. On error, -1 is returned and *errno*
+ *		is set appropriately.
+ *
+ * BPF_MAP_UPDATE_ELEM
+ *	Description
+ *		Create or update an element (key/value pair) in a specified map.
+ *
+ *		The *flags* argument should be specified as one of the
+ *		following:
+ *
+ *		**BPF_ANY**
+ *			Create a new element or update an existing element.
+ *		**BPF_NOEXIST**
+ *			Create a new element only if it did not exist.
+ *		**BPF_EXIST**
+ *			Update an existing element.
+ *
+ *	Return
+ *		Returns zero on success. On error, -1 is returned and *errno*
+ *		is set appropriately.
+ *
+ *		May set *errno* to **EINVAL**, **EPERM**, **ENOMEM**,
+ *		**E2BIG**, **EEXIST**, or **ENOENT**.
+ *
+ *		**E2BIG**
+ *			The number of elements in the map reached the
+ *			*max_entries* limit specified at map creation time.
+ *		**EEXIST**
+ *			If *flags* specifies **BPF_NOEXIST** and the element
+ *			with *key* already exists in the map.
+ *		**ENOENT**
+ *			If *flags* specifies **BPF_EXIST** and the element with
+ *			*key* does not exist in the map.
+ *
+ * BPF_MAP_DELETE_ELEM
+ *	Description
+ *		Look up and delete an element by key in a specified map.
+ *
+ *	Return
+ *		Returns zero on success. On error, -1 is returned and *errno*
+ *		is set appropriately.
+ *
+ * BPF_MAP_GET_NEXT_KEY
+ *	Description
+ *		Look up an element by key in a specified map and return the key
+ *		of the next element. Can be used to iterate over all elements
+ *		in the map.
+ *
+ *	Return
+ *		Returns zero on success. On error, -1 is returned and *errno*
+ *		is set appropriately.
+ *
+ *		The following cases can be used to iterate over all elements of
+ *		the map:
+ *
+ *		* If *key* is not found, the operation returns zero and sets
+ *		  the *next_key* pointer to the key of the first element.
+ *		* If *key* is found, the operation returns zero and sets the
+ *		  *next_key* pointer to the key of the next element.
+ *		* If *key* is the last element, returns -1 and *errno* is set
+ *		  to **ENOENT**.
+ *
+ *		May set *errno* to **ENOMEM**, **EFAULT**, **EPERM**, or
+ *		**EINVAL** on error.
+ *
+ * BPF_PROG_LOAD
+ *	Description
+ *		Verify and load an eBPF program, returning a new file
+ *		descriptor associated with the program.
+ *
+ *		Applying **close**\ (2) to the file descriptor returned by
+ *		**BPF_PROG_LOAD** will unload the eBPF program (but see NOTES).
+ *
+ *		The close-on-exec file descriptor flag (see **fcntl**\ (2)) is
+ *		automatically enabled for the new file descriptor.
+ *
+ *	Return
+ *		A new file descriptor (a nonnegative integer), or -1 if an
+ *		error occurred (in which case, *errno* is set appropriately).
+ *
+ * NOTES
+ *	eBPF objects (maps and programs) can be shared between processes.
+ *	For example, after **fork**\ (2), the child inherits file descriptors
+ *	referring to the same eBPF objects. In addition, file descriptors
+ *	referring to eBPF objects can be transferred over UNIX domain sockets.
+ *	File descriptors referring to eBPF objects can be duplicated in the
+ *	usual way, using **dup**\ (2) and similar calls. An eBPF object is
+ *	deallocated only after all file descriptors referring to the object
+ *	have been closed.
+ */
 enum bpf_cmd {
 	BPF_MAP_CREATE,
 	BPF_MAP_LOOKUP_ELEM,
-- 
cgit v1.2.3-71-gd317


From f67c9cbf6c581468f6c7144d497565cfc7918c31 Mon Sep 17 00:00:00 2001
From: Joe Stringer <joe@cilium.io>
Date: Tue, 2 Mar 2021 09:19:34 -0800
Subject: bpf: Add minimal bpf() command documentation
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

Introduce high-level descriptions of the intent and return codes of the
bpf() syscall commands. Subsequent patches may further flesh out the
content to provide a more useful programming reference.

Signed-off-by: Joe Stringer <joe@cilium.io>
Signed-off-by: Alexei Starovoitov <ast@kernel.org>
Reviewed-by: Quentin Monnet <quentin@isovalent.com>
Acked-by: Toke Høiland-Jørgensen <toke@redhat.com>
Acked-by: Yonghong Song <yhs@fb.com>
Link: https://lore.kernel.org/bpf/20210302171947.2268128-3-joe@cilium.io
---
 include/uapi/linux/bpf.h | 368 +++++++++++++++++++++++++++++++++++++++++++++++
 1 file changed, 368 insertions(+)

(limited to 'include/uapi/linux')

diff --git a/include/uapi/linux/bpf.h b/include/uapi/linux/bpf.h
index fb16c590e6d9..052bbfe65f77 100644
--- a/include/uapi/linux/bpf.h
+++ b/include/uapi/linux/bpf.h
@@ -204,6 +204,374 @@ union bpf_iter_link_info {
  *		A new file descriptor (a nonnegative integer), or -1 if an
  *		error occurred (in which case, *errno* is set appropriately).
  *
+ * BPF_OBJ_PIN
+ *	Description
+ *		Pin an eBPF program or map referred by the specified *bpf_fd*
+ *		to the provided *pathname* on the filesystem.
+ *
+ *	Return
+ *		Returns zero on success. On error, -1 is returned and *errno*
+ *		is set appropriately.
+ *
+ * BPF_OBJ_GET
+ *	Description
+ *		Open a file descriptor for the eBPF object pinned to the
+ *		specified *pathname*.
+ *
+ *	Return
+ *		A new file descriptor (a nonnegative integer), or -1 if an
+ *		error occurred (in which case, *errno* is set appropriately).
+ *
+ * BPF_PROG_ATTACH
+ *	Description
+ *		Attach an eBPF program to a *target_fd* at the specified
+ *		*attach_type* hook.
+ *
+ *	Return
+ *		Returns zero on success. On error, -1 is returned and *errno*
+ *		is set appropriately.
+ *
+ * BPF_PROG_DETACH
+ *	Description
+ *		Detach the eBPF program associated with the *target_fd* at the
+ *		hook specified by *attach_type*. The program must have been
+ *		previously attached using **BPF_PROG_ATTACH**.
+ *
+ *	Return
+ *		Returns zero on success. On error, -1 is returned and *errno*
+ *		is set appropriately.
+ *
+ * BPF_PROG_TEST_RUN
+ *	Description
+ *		Run an eBPF program a number of times against a provided
+ *		program context and return the modified program context and
+ *		duration of the test run.
+ *
+ *	Return
+ *		Returns zero on success. On error, -1 is returned and *errno*
+ *		is set appropriately.
+ *
+ * BPF_PROG_GET_NEXT_ID
+ *	Description
+ *		Fetch the next eBPF program currently loaded into the kernel.
+ *
+ *		Looks for the eBPF program with an id greater than *start_id*
+ *		and updates *next_id* on success. If no other eBPF programs
+ *		remain with ids higher than *start_id*, returns -1 and sets
+ *		*errno* to **ENOENT**.
+ *
+ *	Return
+ *		Returns zero on success. On error, or when no id remains, -1
+ *		is returned and *errno* is set appropriately.
+ *
+ * BPF_MAP_GET_NEXT_ID
+ *	Description
+ *		Fetch the next eBPF map currently loaded into the kernel.
+ *
+ *		Looks for the eBPF map with an id greater than *start_id*
+ *		and updates *next_id* on success. If no other eBPF maps
+ *		remain with ids higher than *start_id*, returns -1 and sets
+ *		*errno* to **ENOENT**.
+ *
+ *	Return
+ *		Returns zero on success. On error, or when no id remains, -1
+ *		is returned and *errno* is set appropriately.
+ *
+ * BPF_PROG_GET_FD_BY_ID
+ *	Description
+ *		Open a file descriptor for the eBPF program corresponding to
+ *		*prog_id*.
+ *
+ *	Return
+ *		A new file descriptor (a nonnegative integer), or -1 if an
+ *		error occurred (in which case, *errno* is set appropriately).
+ *
+ * BPF_MAP_GET_FD_BY_ID
+ *	Description
+ *		Open a file descriptor for the eBPF map corresponding to
+ *		*map_id*.
+ *
+ *	Return
+ *		A new file descriptor (a nonnegative integer), or -1 if an
+ *		error occurred (in which case, *errno* is set appropriately).
+ *
+ * BPF_OBJ_GET_INFO_BY_FD
+ *	Description
+ *		Obtain information about the eBPF object corresponding to
+ *		*bpf_fd*.
+ *
+ *		Populates up to *info_len* bytes of *info*, which will be in
+ *		one of the following formats depending on the eBPF object type
+ *		of *bpf_fd*:
+ *
+ *		* **struct bpf_prog_info**
+ *		* **struct bpf_map_info**
+ *		* **struct bpf_btf_info**
+ *		* **struct bpf_link_info**
+ *
+ *	Return
+ *		Returns zero on success. On error, -1 is returned and *errno*
+ *		is set appropriately.
+ *
+ * BPF_PROG_QUERY
+ *	Description
+ *		Obtain information about eBPF programs associated with the
+ *		specified *attach_type* hook.
+ *
+ *	Return
+ *		Returns zero on success. On error, -1 is returned and *errno*
+ *		is set appropriately.
+ *
+ * BPF_RAW_TRACEPOINT_OPEN
+ *	Description
+ *		Attach an eBPF program to a tracepoint *name* to access kernel
+ *		internal arguments of the tracepoint in their raw form.
+ *
+ *		The *prog_fd* must be a valid file descriptor associated with
+ *		a loaded eBPF program of type **BPF_PROG_TYPE_RAW_TRACEPOINT**.
+ *
+ *		No ABI guarantees are made about the content of tracepoint
+ *		arguments exposed to the corresponding eBPF program.
+ *
+ *		Applying **close**\ (2) to the file descriptor returned by
+ *		**BPF_RAW_TRACEPOINT_OPEN** will delete the map (but see NOTES).
+ *
+ *	Return
+ *		A new file descriptor (a nonnegative integer), or -1 if an
+ *		error occurred (in which case, *errno* is set appropriately).
+ *
+ * BPF_BTF_LOAD
+ *	Description
+ *		Verify and load BPF Type Format (BTF) metadata into the kernel,
+ *		returning a new file descriptor associated with the metadata.
+ *		BTF is described in more detail at
+ *		https://www.kernel.org/doc/html/latest/bpf/btf.html.
+ *
+ *		The *btf* parameter must point to valid memory providing
+ *		*btf_size* bytes of BTF binary metadata.
+ *
+ *		The returned file descriptor can be passed to other **bpf**\ ()
+ *		subcommands such as **BPF_PROG_LOAD** or **BPF_MAP_CREATE** to
+ *		associate the BTF with those objects.
+ *
+ *		Similar to **BPF_PROG_LOAD**, **BPF_BTF_LOAD** has optional
+ *		parameters to specify a *btf_log_buf*, *btf_log_size* and
+ *		*btf_log_level* which allow the kernel to return freeform log
+ *		output regarding the BTF verification process.
+ *
+ *	Return
+ *		A new file descriptor (a nonnegative integer), or -1 if an
+ *		error occurred (in which case, *errno* is set appropriately).
+ *
+ * BPF_BTF_GET_FD_BY_ID
+ *	Description
+ *		Open a file descriptor for the BPF Type Format (BTF)
+ *		corresponding to *btf_id*.
+ *
+ *	Return
+ *		A new file descriptor (a nonnegative integer), or -1 if an
+ *		error occurred (in which case, *errno* is set appropriately).
+ *
+ * BPF_TASK_FD_QUERY
+ *	Description
+ *		Obtain information about eBPF programs associated with the
+ *		target process identified by *pid* and *fd*.
+ *
+ *		If the *pid* and *fd* are associated with a tracepoint, kprobe
+ *		or uprobe perf event, then the *prog_id* and *fd_type* will
+ *		be populated with the eBPF program id and file descriptor type
+ *		of type **bpf_task_fd_type**. If associated with a kprobe or
+ *		uprobe, the  *probe_offset* and *probe_addr* will also be
+ *		populated. Optionally, if *buf* is provided, then up to
+ *		*buf_len* bytes of *buf* will be populated with the name of
+ *		the tracepoint, kprobe or uprobe.
+ *
+ *		The resulting *prog_id* may be introspected in deeper detail
+ *		using **BPF_PROG_GET_FD_BY_ID** and **BPF_OBJ_GET_INFO_BY_FD**.
+ *
+ *	Return
+ *		Returns zero on success. On error, -1 is returned and *errno*
+ *		is set appropriately.
+ *
+ * BPF_MAP_LOOKUP_AND_DELETE_ELEM
+ *	Description
+ *		Look up an element with the given *key* in the map referred to
+ *		by the file descriptor *fd*, and if found, delete the element.
+ *
+ *		The **BPF_MAP_TYPE_QUEUE** and **BPF_MAP_TYPE_STACK** map types
+ *		implement this command as a "pop" operation, deleting the top
+ *		element rather than one corresponding to *key*.
+ *		The *key* and *key_len* parameters should be zeroed when
+ *		issuing this operation for these map types.
+ *
+ *		This command is only valid for the following map types:
+ *		* **BPF_MAP_TYPE_QUEUE**
+ *		* **BPF_MAP_TYPE_STACK**
+ *
+ *	Return
+ *		Returns zero on success. On error, -1 is returned and *errno*
+ *		is set appropriately.
+ *
+ * BPF_MAP_FREEZE
+ *	Description
+ *		Freeze the permissions of the specified map.
+ *
+ *		Write permissions may be frozen by passing zero *flags*.
+ *		Upon success, no future syscall invocations may alter the
+ *		map state of *map_fd*. Write operations from eBPF programs
+ *		are still possible for a frozen map.
+ *
+ *		Not supported for maps of type **BPF_MAP_TYPE_STRUCT_OPS**.
+ *
+ *	Return
+ *		Returns zero on success. On error, -1 is returned and *errno*
+ *		is set appropriately.
+ *
+ * BPF_BTF_GET_NEXT_ID
+ *	Description
+ *		Fetch the next BPF Type Format (BTF) object currently loaded
+ *		into the kernel.
+ *
+ *		Looks for the BTF object with an id greater than *start_id*
+ *		and updates *next_id* on success. If no other BTF objects
+ *		remain with ids higher than *start_id*, returns -1 and sets
+ *		*errno* to **ENOENT**.
+ *
+ *	Return
+ *		Returns zero on success. On error, or when no id remains, -1
+ *		is returned and *errno* is set appropriately.
+ *
+ * BPF_MAP_LOOKUP_BATCH
+ *	Description
+ *		Iterate and fetch multiple elements in a map.
+ *
+ *	Return
+ *		Returns zero on success. On error, -1 is returned and *errno*
+ *		is set appropriately.
+ *
+ * BPF_MAP_LOOKUP_AND_DELETE_BATCH
+ *	Description
+ *		Iterate and delete multiple elements in a map.
+ *
+ *	Return
+ *		Returns zero on success. On error, -1 is returned and *errno*
+ *		is set appropriately.
+ *
+ * BPF_MAP_UPDATE_BATCH
+ *	Description
+ *		Iterate and update multiple elements in a map.
+ *
+ *	Return
+ *		Returns zero on success. On error, -1 is returned and *errno*
+ *		is set appropriately.
+ *
+ * BPF_MAP_DELETE_BATCH
+ *	Description
+ *		Iterate and delete multiple elements in a map.
+ *
+ *	Return
+ *		Returns zero on success. On error, -1 is returned and *errno*
+ *		is set appropriately.
+ *
+ * BPF_LINK_CREATE
+ *	Description
+ *		Attach an eBPF program to a *target_fd* at the specified
+ *		*attach_type* hook and return a file descriptor handle for
+ *		managing the link.
+ *
+ *	Return
+ *		A new file descriptor (a nonnegative integer), or -1 if an
+ *		error occurred (in which case, *errno* is set appropriately).
+ *
+ * BPF_LINK_UPDATE
+ *	Description
+ *		Update the eBPF program in the specified *link_fd* to
+ *		*new_prog_fd*.
+ *
+ *	Return
+ *		Returns zero on success. On error, -1 is returned and *errno*
+ *		is set appropriately.
+ *
+ * BPF_LINK_GET_FD_BY_ID
+ *	Description
+ *		Open a file descriptor for the eBPF Link corresponding to
+ *		*link_id*.
+ *
+ *	Return
+ *		A new file descriptor (a nonnegative integer), or -1 if an
+ *		error occurred (in which case, *errno* is set appropriately).
+ *
+ * BPF_LINK_GET_NEXT_ID
+ *	Description
+ *		Fetch the next eBPF link currently loaded into the kernel.
+ *
+ *		Looks for the eBPF link with an id greater than *start_id*
+ *		and updates *next_id* on success. If no other eBPF links
+ *		remain with ids higher than *start_id*, returns -1 and sets
+ *		*errno* to **ENOENT**.
+ *
+ *	Return
+ *		Returns zero on success. On error, or when no id remains, -1
+ *		is returned and *errno* is set appropriately.
+ *
+ * BPF_ENABLE_STATS
+ *	Description
+ *		Enable eBPF runtime statistics gathering.
+ *
+ *		Runtime statistics gathering for the eBPF runtime is disabled
+ *		by default to minimize the corresponding performance overhead.
+ *		This command enables statistics globally.
+ *
+ *		Multiple programs may independently enable statistics.
+ *		After gathering the desired statistics, eBPF runtime statistics
+ *		may be disabled again by calling **close**\ (2) for the file
+ *		descriptor returned by this function. Statistics will only be
+ *		disabled system-wide when all outstanding file descriptors
+ *		returned by prior calls for this subcommand are closed.
+ *
+ *	Return
+ *		A new file descriptor (a nonnegative integer), or -1 if an
+ *		error occurred (in which case, *errno* is set appropriately).
+ *
+ * BPF_ITER_CREATE
+ *	Description
+ *		Create an iterator on top of the specified *link_fd* (as
+ *		previously created using **BPF_LINK_CREATE**) and return a
+ *		file descriptor that can be used to trigger the iteration.
+ *
+ *		If the resulting file descriptor is pinned to the filesystem
+ *		using  **BPF_OBJ_PIN**, then subsequent **read**\ (2) syscalls
+ *		for that path will trigger the iterator to read kernel state
+ *		using the eBPF program attached to *link_fd*.
+ *
+ *	Return
+ *		A new file descriptor (a nonnegative integer), or -1 if an
+ *		error occurred (in which case, *errno* is set appropriately).
+ *
+ * BPF_LINK_DETACH
+ *	Description
+ *		Forcefully detach the specified *link_fd* from its
+ *		corresponding attachment point.
+ *
+ *	Return
+ *		Returns zero on success. On error, -1 is returned and *errno*
+ *		is set appropriately.
+ *
+ * BPF_PROG_BIND_MAP
+ *	Description
+ *		Bind a map to the lifetime of an eBPF program.
+ *
+ *		The map identified by *map_fd* is bound to the program
+ *		identified by *prog_fd* and only released when *prog_fd* is
+ *		released. This may be used in cases where metadata should be
+ *		associated with a program which otherwise does not contain any
+ *		references to the map (for example, embedded in the eBPF
+ *		program instructions).
+ *
+ *	Return
+ *		Returns zero on success. On error, -1 is returned and *errno*
+ *		is set appropriately.
+ *
  * NOTES
  *	eBPF objects (maps and programs) can be shared between processes.
  *	For example, after **fork**\ (2), the child inherits file descriptors
-- 
cgit v1.2.3-71-gd317


From 6690523bccb3e44cfcc4b2c995767e6814046e34 Mon Sep 17 00:00:00 2001
From: Joe Stringer <joe@cilium.io>
Date: Tue, 2 Mar 2021 09:19:35 -0800
Subject: bpf: Document BPF_F_LOCK in syscall commands
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

Document the meaning of the BPF_F_LOCK flag for the map lookup/update
descriptions. Based on commit 96049f3afd50 ("bpf: introduce BPF_F_LOCK
flag").

Signed-off-by: Joe Stringer <joe@cilium.io>
Signed-off-by: Alexei Starovoitov <ast@kernel.org>
Reviewed-by: Quentin Monnet <quentin@isovalent.com>
Acked-by: Toke Høiland-Jørgensen <toke@redhat.com>
Acked-by: Yonghong Song <yhs@fb.com>
Link: https://lore.kernel.org/bpf/20210302171947.2268128-4-joe@cilium.io
---
 include/uapi/linux/bpf.h | 10 ++++++++++
 1 file changed, 10 insertions(+)

(limited to 'include/uapi/linux')

diff --git a/include/uapi/linux/bpf.h b/include/uapi/linux/bpf.h
index 052bbfe65f77..eb9f059f0569 100644
--- a/include/uapi/linux/bpf.h
+++ b/include/uapi/linux/bpf.h
@@ -123,6 +123,14 @@ union bpf_iter_link_info {
  *		Look up an element with a given *key* in the map referred to
  *		by the file descriptor *map_fd*.
  *
+ *		The *flags* argument may be specified as one of the
+ *		following:
+ *
+ *		**BPF_F_LOCK**
+ *			Look up the value of a spin-locked map without
+ *			returning the lock. This must be specified if the
+ *			elements contain a spinlock.
+ *
  *	Return
  *		Returns zero on success. On error, -1 is returned and *errno*
  *		is set appropriately.
@@ -140,6 +148,8 @@ union bpf_iter_link_info {
  *			Create a new element only if it did not exist.
  *		**BPF_EXIST**
  *			Update an existing element.
+ *		**BPF_F_LOCK**
+ *			Update a spin_lock-ed map element.
  *
  *	Return
  *		Returns zero on success. On error, -1 is returned and *errno*
-- 
cgit v1.2.3-71-gd317


From 8aacb3c8d1a32b23c82645051bba55f0ae6c103b Mon Sep 17 00:00:00 2001
From: Joe Stringer <joe@cilium.io>
Date: Tue, 2 Mar 2021 09:19:36 -0800
Subject: bpf: Document BPF_PROG_PIN syscall command
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

Commit b2197755b263 ("bpf: add support for persistent maps/progs")
contains the original implementation and git logs, used as reference for
this documentation.

Also pull in the filename restriction as documented in commit 6d8cb045cde6
("bpf: comment why dots in filenames under BPF virtual FS are not allowed")

Signed-off-by: Joe Stringer <joe@cilium.io>
Signed-off-by: Alexei Starovoitov <ast@kernel.org>
Reviewed-by: Quentin Monnet <quentin@isovalent.com>
Acked-by: Toke Høiland-Jørgensen <toke@redhat.com>
Acked-by: Yonghong Song <yhs@fb.com>
Link: https://lore.kernel.org/bpf/20210302171947.2268128-5-joe@cilium.io
---
 include/uapi/linux/bpf.h | 36 +++++++++++++++++++++++++++++-------
 1 file changed, 29 insertions(+), 7 deletions(-)

(limited to 'include/uapi/linux')

diff --git a/include/uapi/linux/bpf.h b/include/uapi/linux/bpf.h
index eb9f059f0569..6946dde90c56 100644
--- a/include/uapi/linux/bpf.h
+++ b/include/uapi/linux/bpf.h
@@ -219,6 +219,22 @@ union bpf_iter_link_info {
  *		Pin an eBPF program or map referred by the specified *bpf_fd*
  *		to the provided *pathname* on the filesystem.
  *
+ *		The *pathname* argument must not contain a dot (".").
+ *
+ *		On success, *pathname* retains a reference to the eBPF object,
+ *		preventing deallocation of the object when the original
+ *		*bpf_fd* is closed. This allow the eBPF object to live beyond
+ *		**close**\ (\ *bpf_fd*\ ), and hence the lifetime of the parent
+ *		process.
+ *
+ *		Applying **unlink**\ (2) or similar calls to the *pathname*
+ *		unpins the object from the filesystem, removing the reference.
+ *		If no other file descriptors or filesystem nodes refer to the
+ *		same object, it will be deallocated (see NOTES).
+ *
+ *		The filesystem type for the parent directory of *pathname* must
+ *		be **BPF_FS_MAGIC**.
+ *
  *	Return
  *		Returns zero on success. On error, -1 is returned and *errno*
  *		is set appropriately.
@@ -584,13 +600,19 @@ union bpf_iter_link_info {
  *
  * NOTES
  *	eBPF objects (maps and programs) can be shared between processes.
- *	For example, after **fork**\ (2), the child inherits file descriptors
- *	referring to the same eBPF objects. In addition, file descriptors
- *	referring to eBPF objects can be transferred over UNIX domain sockets.
- *	File descriptors referring to eBPF objects can be duplicated in the
- *	usual way, using **dup**\ (2) and similar calls. An eBPF object is
- *	deallocated only after all file descriptors referring to the object
- *	have been closed.
+ *
+ *	* After **fork**\ (2), the child inherits file descriptors
+ *	  referring to the same eBPF objects.
+ *	* File descriptors referring to eBPF objects can be transferred over
+ *	  **unix**\ (7) domain sockets.
+ *	* File descriptors referring to eBPF objects can be duplicated in the
+ *	  usual way, using **dup**\ (2) and similar calls.
+ *	* File descriptors referring to eBPF objects can be pinned to the
+ *	  filesystem using the **BPF_OBJ_PIN** command of **bpf**\ (2).
+ *
+ *	An eBPF object is deallocated only after all file descriptors referring
+ *	to the object have been closed and no references remain pinned to the
+ *	filesystem or attached (for example, bound to a program or device).
  */
 enum bpf_cmd {
 	BPF_MAP_CREATE,
-- 
cgit v1.2.3-71-gd317


From 32e76b187a90de5809d68c2ef3e3964176dacaf0 Mon Sep 17 00:00:00 2001
From: Joe Stringer <joe@cilium.io>
Date: Tue, 2 Mar 2021 09:19:37 -0800
Subject: bpf: Document BPF_PROG_ATTACH syscall command
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

Document the prog attach command in more detail, based on git commits:
* commit f4324551489e ("bpf: add BPF_PROG_ATTACH and BPF_PROG_DETACH
  commands")
* commit 4f738adba30a ("bpf: create tcp_bpf_ulp allowing BPF to monitor
  socket TX/RX data")
* commit f4364dcfc86d ("media: rc: introduce BPF_PROG_LIRC_MODE2")
* commit d58e468b1112 ("flow_dissector: implements flow dissector BPF
  hook")

Signed-off-by: Joe Stringer <joe@cilium.io>
Signed-off-by: Alexei Starovoitov <ast@kernel.org>
Reviewed-by: Quentin Monnet <quentin@isovalent.com>
Acked-by: Toke Høiland-Jørgensen <toke@redhat.com>
Acked-by: Yonghong Song <yhs@fb.com>
Link: https://lore.kernel.org/bpf/20210302171947.2268128-6-joe@cilium.io
---
 include/uapi/linux/bpf.h | 37 +++++++++++++++++++++++++++++++++++++
 1 file changed, 37 insertions(+)

(limited to 'include/uapi/linux')

diff --git a/include/uapi/linux/bpf.h b/include/uapi/linux/bpf.h
index 6946dde90c56..a8f2964ec885 100644
--- a/include/uapi/linux/bpf.h
+++ b/include/uapi/linux/bpf.h
@@ -253,6 +253,43 @@ union bpf_iter_link_info {
  *		Attach an eBPF program to a *target_fd* at the specified
  *		*attach_type* hook.
  *
+ *		The *attach_type* specifies the eBPF attachment point to
+ *		attach the program to, and must be one of *bpf_attach_type*
+ *		(see below).
+ *
+ *		The *attach_bpf_fd* must be a valid file descriptor for a
+ *		loaded eBPF program of a cgroup, flow dissector, LIRC, sockmap
+ *		or sock_ops type corresponding to the specified *attach_type*.
+ *
+ *		The *target_fd* must be a valid file descriptor for a kernel
+ *		object which depends on the attach type of *attach_bpf_fd*:
+ *
+ *		**BPF_PROG_TYPE_CGROUP_DEVICE**,
+ *		**BPF_PROG_TYPE_CGROUP_SKB**,
+ *		**BPF_PROG_TYPE_CGROUP_SOCK**,
+ *		**BPF_PROG_TYPE_CGROUP_SOCK_ADDR**,
+ *		**BPF_PROG_TYPE_CGROUP_SOCKOPT**,
+ *		**BPF_PROG_TYPE_CGROUP_SYSCTL**,
+ *		**BPF_PROG_TYPE_SOCK_OPS**
+ *
+ *			Control Group v2 hierarchy with the eBPF controller
+ *			enabled. Requires the kernel to be compiled with
+ *			**CONFIG_CGROUP_BPF**.
+ *
+ *		**BPF_PROG_TYPE_FLOW_DISSECTOR**
+ *
+ *			Network namespace (eg /proc/self/ns/net).
+ *
+ *		**BPF_PROG_TYPE_LIRC_MODE2**
+ *
+ *			LIRC device path (eg /dev/lircN). Requires the kernel
+ *			to be compiled with **CONFIG_BPF_LIRC_MODE2**.
+ *
+ *		**BPF_PROG_TYPE_SK_SKB**,
+ *		**BPF_PROG_TYPE_SK_MSG**
+ *
+ *			eBPF map of socket type (eg **BPF_MAP_TYPE_SOCKHASH**).
+ *
  *	Return
  *		Returns zero on success. On error, -1 is returned and *errno*
  *		is set appropriately.
-- 
cgit v1.2.3-71-gd317


From 2a3fdca4e3bc7a01316277ba26f4090c4b19bf7c Mon Sep 17 00:00:00 2001
From: Joe Stringer <joe@cilium.io>
Date: Tue, 2 Mar 2021 09:19:38 -0800
Subject: bpf: Document BPF_PROG_TEST_RUN syscall command
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

Based on a brief read of the corresponding source code.

Signed-off-by: Joe Stringer <joe@cilium.io>
Signed-off-by: Alexei Starovoitov <ast@kernel.org>
Reviewed-by: Quentin Monnet <quentin@isovalent.com>
Acked-by: Toke Høiland-Jørgensen <toke@redhat.com>
Acked-by: Yonghong Song <yhs@fb.com>
Link: https://lore.kernel.org/bpf/20210302171947.2268128-7-joe@cilium.io
---
 include/uapi/linux/bpf.h | 14 +++++++++++---
 1 file changed, 11 insertions(+), 3 deletions(-)

(limited to 'include/uapi/linux')

diff --git a/include/uapi/linux/bpf.h b/include/uapi/linux/bpf.h
index a8f2964ec885..a6cd6650e23d 100644
--- a/include/uapi/linux/bpf.h
+++ b/include/uapi/linux/bpf.h
@@ -306,14 +306,22 @@ union bpf_iter_link_info {
  *
  * BPF_PROG_TEST_RUN
  *	Description
- *		Run an eBPF program a number of times against a provided
- *		program context and return the modified program context and
- *		duration of the test run.
+ *		Run the eBPF program associated with the *prog_fd* a *repeat*
+ *		number of times against a provided program context *ctx_in* and
+ *		data *data_in*, and return the modified program context
+ *		*ctx_out*, *data_out* (for example, packet data), result of the
+ *		execution *retval*, and *duration* of the test run.
  *
  *	Return
  *		Returns zero on success. On error, -1 is returned and *errno*
  *		is set appropriately.
  *
+ *		**ENOSPC**
+ *			Either *data_size_out* or *ctx_size_out* is too small.
+ *		**ENOTSUPP**
+ *			This command is not supported by the program type of
+ *			the program referred to by *prog_fd*.
+ *
  * BPF_PROG_GET_NEXT_ID
  *	Description
  *		Fetch the next eBPF program currently loaded into the kernel.
-- 
cgit v1.2.3-71-gd317


From 5d999994e05d62d4f53059540652014cf83cddfe Mon Sep 17 00:00:00 2001
From: Joe Stringer <joe@cilium.io>
Date: Tue, 2 Mar 2021 09:19:39 -0800
Subject: bpf: Document BPF_PROG_QUERY syscall command
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

Commit 468e2f64d220 ("bpf: introduce BPF_PROG_QUERY command") originally
introduced this, but there have been several additions since then.
Unlike BPF_PROG_ATTACH, it appears that the sockmap progs are not able
to be queried so far.

Signed-off-by: Joe Stringer <joe@cilium.io>
Signed-off-by: Alexei Starovoitov <ast@kernel.org>
Reviewed-by: Quentin Monnet <quentin@isovalent.com>
Acked-by: Toke Høiland-Jørgensen <toke@redhat.com>
Acked-by: Yonghong Song <yhs@fb.com>
Link: https://lore.kernel.org/bpf/20210302171947.2268128-8-joe@cilium.io
---
 include/uapi/linux/bpf.h | 37 +++++++++++++++++++++++++++++++++++++
 1 file changed, 37 insertions(+)

(limited to 'include/uapi/linux')

diff --git a/include/uapi/linux/bpf.h b/include/uapi/linux/bpf.h
index a6cd6650e23d..0cf92ef011f1 100644
--- a/include/uapi/linux/bpf.h
+++ b/include/uapi/linux/bpf.h
@@ -389,6 +389,43 @@ union bpf_iter_link_info {
  *		Obtain information about eBPF programs associated with the
  *		specified *attach_type* hook.
  *
+ *		The *target_fd* must be a valid file descriptor for a kernel
+ *		object which depends on the attach type of *attach_bpf_fd*:
+ *
+ *		**BPF_PROG_TYPE_CGROUP_DEVICE**,
+ *		**BPF_PROG_TYPE_CGROUP_SKB**,
+ *		**BPF_PROG_TYPE_CGROUP_SOCK**,
+ *		**BPF_PROG_TYPE_CGROUP_SOCK_ADDR**,
+ *		**BPF_PROG_TYPE_CGROUP_SOCKOPT**,
+ *		**BPF_PROG_TYPE_CGROUP_SYSCTL**,
+ *		**BPF_PROG_TYPE_SOCK_OPS**
+ *
+ *			Control Group v2 hierarchy with the eBPF controller
+ *			enabled. Requires the kernel to be compiled with
+ *			**CONFIG_CGROUP_BPF**.
+ *
+ *		**BPF_PROG_TYPE_FLOW_DISSECTOR**
+ *
+ *			Network namespace (eg /proc/self/ns/net).
+ *
+ *		**BPF_PROG_TYPE_LIRC_MODE2**
+ *
+ *			LIRC device path (eg /dev/lircN). Requires the kernel
+ *			to be compiled with **CONFIG_BPF_LIRC_MODE2**.
+ *
+ *		**BPF_PROG_QUERY** always fetches the number of programs
+ *		attached and the *attach_flags* which were used to attach those
+ *		programs. Additionally, if *prog_ids* is nonzero and the number
+ *		of attached programs is less than *prog_cnt*, populates
+ *		*prog_ids* with the eBPF program ids of the programs attached
+ *		at *target_fd*.
+ *
+ *		The following flags may alter the result:
+ *
+ *		**BPF_F_QUERY_EFFECTIVE**
+ *			Only return information regarding programs which are
+ *			currently effective at the specified *target_fd*.
+ *
  *	Return
  *		Returns zero on success. On error, -1 is returned and *errno*
  *		is set appropriately.
-- 
cgit v1.2.3-71-gd317


From 0cb804547927c05f6aa7e28c8d4a1e02fec1a6d4 Mon Sep 17 00:00:00 2001
From: Joe Stringer <joe@cilium.io>
Date: Tue, 2 Mar 2021 09:19:40 -0800
Subject: bpf: Document BPF_MAP_*_BATCH syscall commands
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

Based roughly on the following commits:
* Commit cb4d03ab499d ("bpf: Add generic support for lookup batch op")
* Commit 057996380a42 ("bpf: Add batch ops to all htab bpf map")
* Commit aa2e93b8e58e ("bpf: Add generic support for update and delete
  batch ops")

Signed-off-by: Joe Stringer <joe@cilium.io>
Signed-off-by: Alexei Starovoitov <ast@kernel.org>
Reviewed-by: Quentin Monnet <quentin@isovalent.com>
Acked-by: Toke Høiland-Jørgensen <toke@redhat.com>
Acked-by: Brian Vazquez <brianvv@google.com>
Acked-by: Yonghong Song <yhs@fb.com>
Link: https://lore.kernel.org/bpf/20210302171947.2268128-9-joe@cilium.io
---
 include/uapi/linux/bpf.h | 114 +++++++++++++++++++++++++++++++++++++++++++++--
 1 file changed, 111 insertions(+), 3 deletions(-)

(limited to 'include/uapi/linux')

diff --git a/include/uapi/linux/bpf.h b/include/uapi/linux/bpf.h
index 0cf92ef011f1..c8b9d19fce22 100644
--- a/include/uapi/linux/bpf.h
+++ b/include/uapi/linux/bpf.h
@@ -553,13 +553,55 @@ union bpf_iter_link_info {
  *	Description
  *		Iterate and fetch multiple elements in a map.
  *
+ *		Two opaque values are used to manage batch operations,
+ *		*in_batch* and *out_batch*. Initially, *in_batch* must be set
+ *		to NULL to begin the batched operation. After each subsequent
+ *		**BPF_MAP_LOOKUP_BATCH**, the caller should pass the resultant
+ *		*out_batch* as the *in_batch* for the next operation to
+ *		continue iteration from the current point.
+ *
+ *		The *keys* and *values* are output parameters which must point
+ *		to memory large enough to hold *count* items based on the key
+ *		and value size of the map *map_fd*. The *keys* buffer must be
+ *		of *key_size* * *count*. The *values* buffer must be of
+ *		*value_size* * *count*.
+ *
+ *		The *elem_flags* argument may be specified as one of the
+ *		following:
+ *
+ *		**BPF_F_LOCK**
+ *			Look up the value of a spin-locked map without
+ *			returning the lock. This must be specified if the
+ *			elements contain a spinlock.
+ *
+ *		On success, *count* elements from the map are copied into the
+ *		user buffer, with the keys copied into *keys* and the values
+ *		copied into the corresponding indices in *values*.
+ *
+ *		If an error is returned and *errno* is not **EFAULT**, *count*
+ *		is set to the number of successfully processed elements.
+ *
  *	Return
  *		Returns zero on success. On error, -1 is returned and *errno*
  *		is set appropriately.
  *
+ *		May set *errno* to **ENOSPC** to indicate that *keys* or
+ *		*values* is too small to dump an entire bucket during
+ *		iteration of a hash-based map type.
+ *
  * BPF_MAP_LOOKUP_AND_DELETE_BATCH
  *	Description
- *		Iterate and delete multiple elements in a map.
+ *		Iterate and delete all elements in a map.
+ *
+ *		This operation has the same behavior as
+ *		**BPF_MAP_LOOKUP_BATCH** with two exceptions:
+ *
+ *		* Every element that is successfully returned is also deleted
+ *		  from the map. This is at least *count* elements. Note that
+ *		  *count* is both an input and an output parameter.
+ *		* Upon returning with *errno* set to **EFAULT**, up to
+ *		  *count* elements may be deleted without returning the keys
+ *		  and values of the deleted elements.
  *
  *	Return
  *		Returns zero on success. On error, -1 is returned and *errno*
@@ -567,15 +609,81 @@ union bpf_iter_link_info {
  *
  * BPF_MAP_UPDATE_BATCH
  *	Description
- *		Iterate and update multiple elements in a map.
+ *		Update multiple elements in a map by *key*.
+ *
+ *		The *keys* and *values* are input parameters which must point
+ *		to memory large enough to hold *count* items based on the key
+ *		and value size of the map *map_fd*. The *keys* buffer must be
+ *		of *key_size* * *count*. The *values* buffer must be of
+ *		*value_size* * *count*.
+ *
+ *		Each element specified in *keys* is sequentially updated to the
+ *		value in the corresponding index in *values*. The *in_batch*
+ *		and *out_batch* parameters are ignored and should be zeroed.
+ *
+ *		The *elem_flags* argument should be specified as one of the
+ *		following:
+ *
+ *		**BPF_ANY**
+ *			Create new elements or update a existing elements.
+ *		**BPF_NOEXIST**
+ *			Create new elements only if they do not exist.
+ *		**BPF_EXIST**
+ *			Update existing elements.
+ *		**BPF_F_LOCK**
+ *			Update spin_lock-ed map elements. This must be
+ *			specified if the map value contains a spinlock.
+ *
+ *		On success, *count* elements from the map are updated.
+ *
+ *		If an error is returned and *errno* is not **EFAULT**, *count*
+ *		is set to the number of successfully processed elements.
  *
  *	Return
  *		Returns zero on success. On error, -1 is returned and *errno*
  *		is set appropriately.
  *
+ *		May set *errno* to **EINVAL**, **EPERM**, **ENOMEM**, or
+ *		**E2BIG**. **E2BIG** indicates that the number of elements in
+ *		the map reached the *max_entries* limit specified at map
+ *		creation time.
+ *
+ *		May set *errno* to one of the following error codes under
+ *		specific circumstances:
+ *
+ *		**EEXIST**
+ *			If *flags* specifies **BPF_NOEXIST** and the element
+ *			with *key* already exists in the map.
+ *		**ENOENT**
+ *			If *flags* specifies **BPF_EXIST** and the element with
+ *			*key* does not exist in the map.
+ *
  * BPF_MAP_DELETE_BATCH
  *	Description
- *		Iterate and delete multiple elements in a map.
+ *		Delete multiple elements in a map by *key*.
+ *
+ *		The *keys* parameter is an input parameter which must point
+ *		to memory large enough to hold *count* items based on the key
+ *		size of the map *map_fd*, that is, *key_size* * *count*.
+ *
+ *		Each element specified in *keys* is sequentially deleted. The
+ *		*in_batch*, *out_batch*, and *values* parameters are ignored
+ *		and should be zeroed.
+ *
+ *		The *elem_flags* argument may be specified as one of the
+ *		following:
+ *
+ *		**BPF_F_LOCK**
+ *			Look up the value of a spin-locked map without
+ *			returning the lock. This must be specified if the
+ *			elements contain a spinlock.
+ *
+ *		On success, *count* elements from the map are updated.
+ *
+ *		If an error is returned and *errno* is not **EFAULT**, *count*
+ *		is set to the number of successfully processed elements. If
+ *		*errno* is **EFAULT**, up to *count* elements may be been
+ *		deleted.
  *
  *	Return
  *		Returns zero on success. On error, -1 is returned and *errno*
-- 
cgit v1.2.3-71-gd317


From 923a932c982fd71856f80dbeaaa3ca41a75e89e0 Mon Sep 17 00:00:00 2001
From: Joe Stringer <joe@cilium.io>
Date: Tue, 2 Mar 2021 09:19:41 -0800
Subject: scripts/bpf: Abstract eBPF API target parameter
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

Abstract out the target parameter so that upcoming commits, more than
just the existing "helpers" target can be called to generate specific
portions of docs from the eBPF UAPI headers.

Signed-off-by: Joe Stringer <joe@cilium.io>
Signed-off-by: Alexei Starovoitov <ast@kernel.org>
Reviewed-by: Quentin Monnet <quentin@isovalent.com>
Acked-by: Toke Høiland-Jørgensen <toke@redhat.com>
Link: https://lore.kernel.org/bpf/20210302171947.2268128-10-joe@cilium.io
---
 MAINTAINERS                    |   1 +
 include/uapi/linux/bpf.h       |   2 +-
 scripts/bpf_doc.py             | 650 +++++++++++++++++++++++++++++++++++++++++
 scripts/bpf_helpers_doc.py     | 615 --------------------------------------
 tools/bpf/Makefile.helpers     |   2 +-
 tools/include/uapi/linux/bpf.h |   2 +-
 tools/lib/bpf/Makefile         |   2 +-
 tools/perf/MANIFEST            |   2 +-
 8 files changed, 656 insertions(+), 620 deletions(-)
 create mode 100755 scripts/bpf_doc.py
 delete mode 100755 scripts/bpf_helpers_doc.py

(limited to 'include/uapi/linux')

diff --git a/MAINTAINERS b/MAINTAINERS
index a50a543e3c81..8d56c7044067 100644
--- a/MAINTAINERS
+++ b/MAINTAINERS
@@ -3223,6 +3223,7 @@ F:	net/core/filter.c
 F:	net/sched/act_bpf.c
 F:	net/sched/cls_bpf.c
 F:	samples/bpf/
+F:	scripts/bpf_doc.py
 F:	tools/bpf/
 F:	tools/lib/bpf/
 F:	tools/testing/selftests/bpf/
diff --git a/include/uapi/linux/bpf.h b/include/uapi/linux/bpf.h
index c8b9d19fce22..63a56ed6a785 100644
--- a/include/uapi/linux/bpf.h
+++ b/include/uapi/linux/bpf.h
@@ -1439,7 +1439,7 @@ union bpf_attr {
  * parsed and used to produce a manual page. The workflow is the following,
  * and requires the rst2man utility:
  *
- *     $ ./scripts/bpf_helpers_doc.py \
+ *     $ ./scripts/bpf_doc.py \
  *             --filename include/uapi/linux/bpf.h > /tmp/bpf-helpers.rst
  *     $ rst2man /tmp/bpf-helpers.rst > /tmp/bpf-helpers.7
  *     $ man /tmp/bpf-helpers.7
diff --git a/scripts/bpf_doc.py b/scripts/bpf_doc.py
new file mode 100755
index 000000000000..5a4f68aab335
--- /dev/null
+++ b/scripts/bpf_doc.py
@@ -0,0 +1,650 @@
+#!/usr/bin/env python3
+# SPDX-License-Identifier: GPL-2.0-only
+#
+# Copyright (C) 2018-2019 Netronome Systems, Inc.
+# Copyright (C) 2021 Isovalent, Inc.
+
+# In case user attempts to run with Python 2.
+from __future__ import print_function
+
+import argparse
+import re
+import sys, os
+
+class NoHelperFound(BaseException):
+    pass
+
+class ParsingError(BaseException):
+    def __init__(self, line='<line not provided>', reader=None):
+        if reader:
+            BaseException.__init__(self,
+                                   'Error at file offset %d, parsing line: %s' %
+                                   (reader.tell(), line))
+        else:
+            BaseException.__init__(self, 'Error parsing line: %s' % line)
+
+class Helper(object):
+    """
+    An object representing the description of an eBPF helper function.
+    @proto: function prototype of the helper function
+    @desc: textual description of the helper function
+    @ret: description of the return value of the helper function
+    """
+    def __init__(self, proto='', desc='', ret=''):
+        self.proto = proto
+        self.desc = desc
+        self.ret = ret
+
+    def proto_break_down(self):
+        """
+        Break down helper function protocol into smaller chunks: return type,
+        name, distincts arguments.
+        """
+        arg_re = re.compile('((\w+ )*?(\w+|...))( (\**)(\w+))?$')
+        res = {}
+        proto_re = re.compile('(.+) (\**)(\w+)\(((([^,]+)(, )?){1,5})\)$')
+
+        capture = proto_re.match(self.proto)
+        res['ret_type'] = capture.group(1)
+        res['ret_star'] = capture.group(2)
+        res['name']     = capture.group(3)
+        res['args'] = []
+
+        args    = capture.group(4).split(', ')
+        for a in args:
+            capture = arg_re.match(a)
+            res['args'].append({
+                'type' : capture.group(1),
+                'star' : capture.group(5),
+                'name' : capture.group(6)
+            })
+
+        return res
+
+class HeaderParser(object):
+    """
+    An object used to parse a file in order to extract the documentation of a
+    list of eBPF helper functions. All the helpers that can be retrieved are
+    stored as Helper object, in the self.helpers() array.
+    @filename: name of file to parse, usually include/uapi/linux/bpf.h in the
+               kernel tree
+    """
+    def __init__(self, filename):
+        self.reader = open(filename, 'r')
+        self.line = ''
+        self.helpers = []
+
+    def parse_helper(self):
+        proto    = self.parse_proto()
+        desc     = self.parse_desc()
+        ret      = self.parse_ret()
+        return Helper(proto=proto, desc=desc, ret=ret)
+
+    def parse_proto(self):
+        # Argument can be of shape:
+        #   - "void"
+        #   - "type  name"
+        #   - "type *name"
+        #   - Same as above, with "const" and/or "struct" in front of type
+        #   - "..." (undefined number of arguments, for bpf_trace_printk())
+        # There is at least one term ("void"), and at most five arguments.
+        p = re.compile(' \* ?((.+) \**\w+\((((const )?(struct )?(\w+|\.\.\.)( \**\w+)?)(, )?){1,5}\))$')
+        capture = p.match(self.line)
+        if not capture:
+            raise NoHelperFound
+        self.line = self.reader.readline()
+        return capture.group(1)
+
+    def parse_desc(self):
+        p = re.compile(' \* ?(?:\t| {5,8})Description$')
+        capture = p.match(self.line)
+        if not capture:
+            # Helper can have empty description and we might be parsing another
+            # attribute: return but do not consume.
+            return ''
+        # Description can be several lines, some of them possibly empty, and it
+        # stops when another subsection title is met.
+        desc = ''
+        while True:
+            self.line = self.reader.readline()
+            if self.line == ' *\n':
+                desc += '\n'
+            else:
+                p = re.compile(' \* ?(?:\t| {5,8})(?:\t| {8})(.*)')
+                capture = p.match(self.line)
+                if capture:
+                    desc += capture.group(1) + '\n'
+                else:
+                    break
+        return desc
+
+    def parse_ret(self):
+        p = re.compile(' \* ?(?:\t| {5,8})Return$')
+        capture = p.match(self.line)
+        if not capture:
+            # Helper can have empty retval and we might be parsing another
+            # attribute: return but do not consume.
+            return ''
+        # Return value description can be several lines, some of them possibly
+        # empty, and it stops when another subsection title is met.
+        ret = ''
+        while True:
+            self.line = self.reader.readline()
+            if self.line == ' *\n':
+                ret += '\n'
+            else:
+                p = re.compile(' \* ?(?:\t| {5,8})(?:\t| {8})(.*)')
+                capture = p.match(self.line)
+                if capture:
+                    ret += capture.group(1) + '\n'
+                else:
+                    break
+        return ret
+
+    def run(self):
+        # Advance to start of helper function descriptions.
+        offset = self.reader.read().find('* Start of BPF helper function descriptions:')
+        if offset == -1:
+            raise Exception('Could not find start of eBPF helper descriptions list')
+        self.reader.seek(offset)
+        self.reader.readline()
+        self.reader.readline()
+        self.line = self.reader.readline()
+
+        while True:
+            try:
+                helper = self.parse_helper()
+                self.helpers.append(helper)
+            except NoHelperFound:
+                break
+
+        self.reader.close()
+
+###############################################################################
+
+class Printer(object):
+    """
+    A generic class for printers. Printers should be created with an array of
+    Helper objects, and implement a way to print them in the desired fashion.
+    @parser: A HeaderParser with objects to print to standard output
+    """
+    def __init__(self, parser):
+        self.parser = parser
+        self.elements = []
+
+    def print_header(self):
+        pass
+
+    def print_footer(self):
+        pass
+
+    def print_one(self, helper):
+        pass
+
+    def print_all(self):
+        self.print_header()
+        for elem in self.elements:
+            self.print_one(elem)
+        self.print_footer()
+
+
+class PrinterRST(Printer):
+    """
+    A generic class for printers that print ReStructured Text. Printers should
+    be created with a HeaderParser object, and implement a way to print API
+    elements in the desired fashion.
+    @parser: A HeaderParser with objects to print to standard output
+    """
+    def __init__(self, parser):
+        self.parser = parser
+
+    def print_license(self):
+        license = '''\
+.. Copyright (C) All BPF authors and contributors from 2014 to present.
+.. See git log include/uapi/linux/bpf.h in kernel tree for details.
+.. 
+.. %%%LICENSE_START(VERBATIM)
+.. Permission is granted to make and distribute verbatim copies of this
+.. manual provided the copyright notice and this permission notice are
+.. preserved on all copies.
+.. 
+.. Permission is granted to copy and distribute modified versions of this
+.. manual under the conditions for verbatim copying, provided that the
+.. entire resulting derived work is distributed under the terms of a
+.. permission notice identical to this one.
+.. 
+.. Since the Linux kernel and libraries are constantly changing, this
+.. manual page may be incorrect or out-of-date.  The author(s) assume no
+.. responsibility for errors or omissions, or for damages resulting from
+.. the use of the information contained herein.  The author(s) may not
+.. have taken the same level of care in the production of this manual,
+.. which is licensed free of charge, as they might when working
+.. professionally.
+.. 
+.. Formatted or processed versions of this manual, if unaccompanied by
+.. the source, must acknowledge the copyright and authors of this work.
+.. %%%LICENSE_END
+.. 
+.. Please do not edit this file. It was generated from the documentation
+.. located in file include/uapi/linux/bpf.h of the Linux kernel sources
+.. (helpers description), and from scripts/bpf_doc.py in the same
+.. repository (header and footer).
+'''
+        print(license)
+
+    def print_elem(self, elem):
+        if (elem.desc):
+            print('\tDescription')
+            # Do not strip all newline characters: formatted code at the end of
+            # a section must be followed by a blank line.
+            for line in re.sub('\n$', '', elem.desc, count=1).split('\n'):
+                print('{}{}'.format('\t\t' if line else '', line))
+
+        if (elem.ret):
+            print('\tReturn')
+            for line in elem.ret.rstrip().split('\n'):
+                print('{}{}'.format('\t\t' if line else '', line))
+
+        print('')
+
+
+class PrinterHelpersRST(PrinterRST):
+    """
+    A printer for dumping collected information about helpers as a ReStructured
+    Text page compatible with the rst2man program, which can be used to
+    generate a manual page for the helpers.
+    @parser: A HeaderParser with Helper objects to print to standard output
+    """
+    def __init__(self, parser):
+        self.elements = parser.helpers
+
+    def print_header(self):
+        header = '''\
+===========
+BPF-HELPERS
+===========
+-------------------------------------------------------------------------------
+list of eBPF helper functions
+-------------------------------------------------------------------------------
+
+:Manual section: 7
+
+DESCRIPTION
+===========
+
+The extended Berkeley Packet Filter (eBPF) subsystem consists in programs
+written in a pseudo-assembly language, then attached to one of the several
+kernel hooks and run in reaction of specific events. This framework differs
+from the older, "classic" BPF (or "cBPF") in several aspects, one of them being
+the ability to call special functions (or "helpers") from within a program.
+These functions are restricted to a white-list of helpers defined in the
+kernel.
+
+These helpers are used by eBPF programs to interact with the system, or with
+the context in which they work. For instance, they can be used to print
+debugging messages, to get the time since the system was booted, to interact
+with eBPF maps, or to manipulate network packets. Since there are several eBPF
+program types, and that they do not run in the same context, each program type
+can only call a subset of those helpers.
+
+Due to eBPF conventions, a helper can not have more than five arguments.
+
+Internally, eBPF programs call directly into the compiled helper functions
+without requiring any foreign-function interface. As a result, calling helpers
+introduces no overhead, thus offering excellent performance.
+
+This document is an attempt to list and document the helpers available to eBPF
+developers. They are sorted by chronological order (the oldest helpers in the
+kernel at the top).
+
+HELPERS
+=======
+'''
+        PrinterRST.print_license(self)
+        print(header)
+
+    def print_footer(self):
+        footer = '''
+EXAMPLES
+========
+
+Example usage for most of the eBPF helpers listed in this manual page are
+available within the Linux kernel sources, at the following locations:
+
+* *samples/bpf/*
+* *tools/testing/selftests/bpf/*
+
+LICENSE
+=======
+
+eBPF programs can have an associated license, passed along with the bytecode
+instructions to the kernel when the programs are loaded. The format for that
+string is identical to the one in use for kernel modules (Dual licenses, such
+as "Dual BSD/GPL", may be used). Some helper functions are only accessible to
+programs that are compatible with the GNU Privacy License (GPL).
+
+In order to use such helpers, the eBPF program must be loaded with the correct
+license string passed (via **attr**) to the **bpf**\ () system call, and this
+generally translates into the C source code of the program containing a line
+similar to the following:
+
+::
+
+	char ____license[] __attribute__((section("license"), used)) = "GPL";
+
+IMPLEMENTATION
+==============
+
+This manual page is an effort to document the existing eBPF helper functions.
+But as of this writing, the BPF sub-system is under heavy development. New eBPF
+program or map types are added, along with new helper functions. Some helpers
+are occasionally made available for additional program types. So in spite of
+the efforts of the community, this page might not be up-to-date. If you want to
+check by yourself what helper functions exist in your kernel, or what types of
+programs they can support, here are some files among the kernel tree that you
+may be interested in:
+
+* *include/uapi/linux/bpf.h* is the main BPF header. It contains the full list
+  of all helper functions, as well as many other BPF definitions including most
+  of the flags, structs or constants used by the helpers.
+* *net/core/filter.c* contains the definition of most network-related helper
+  functions, and the list of program types from which they can be used.
+* *kernel/trace/bpf_trace.c* is the equivalent for most tracing program-related
+  helpers.
+* *kernel/bpf/verifier.c* contains the functions used to check that valid types
+  of eBPF maps are used with a given helper function.
+* *kernel/bpf/* directory contains other files in which additional helpers are
+  defined (for cgroups, sockmaps, etc.).
+* The bpftool utility can be used to probe the availability of helper functions
+  on the system (as well as supported program and map types, and a number of
+  other parameters). To do so, run **bpftool feature probe** (see
+  **bpftool-feature**\ (8) for details). Add the **unprivileged** keyword to
+  list features available to unprivileged users.
+
+Compatibility between helper functions and program types can generally be found
+in the files where helper functions are defined. Look for the **struct
+bpf_func_proto** objects and for functions returning them: these functions
+contain a list of helpers that a given program type can call. Note that the
+**default:** label of the **switch ... case** used to filter helpers can call
+other functions, themselves allowing access to additional helpers. The
+requirement for GPL license is also in those **struct bpf_func_proto**.
+
+Compatibility between helper functions and map types can be found in the
+**check_map_func_compatibility**\ () function in file *kernel/bpf/verifier.c*.
+
+Helper functions that invalidate the checks on **data** and **data_end**
+pointers for network processing are listed in function
+**bpf_helper_changes_pkt_data**\ () in file *net/core/filter.c*.
+
+SEE ALSO
+========
+
+**bpf**\ (2),
+**bpftool**\ (8),
+**cgroups**\ (7),
+**ip**\ (8),
+**perf_event_open**\ (2),
+**sendmsg**\ (2),
+**socket**\ (7),
+**tc-bpf**\ (8)'''
+        print(footer)
+
+    def print_proto(self, helper):
+        """
+        Format function protocol with bold and italics markers. This makes RST
+        file less readable, but gives nice results in the manual page.
+        """
+        proto = helper.proto_break_down()
+
+        print('**%s %s%s(' % (proto['ret_type'],
+                              proto['ret_star'].replace('*', '\\*'),
+                              proto['name']),
+              end='')
+
+        comma = ''
+        for a in proto['args']:
+            one_arg = '{}{}'.format(comma, a['type'])
+            if a['name']:
+                if a['star']:
+                    one_arg += ' {}**\ '.format(a['star'].replace('*', '\\*'))
+                else:
+                    one_arg += '** '
+                one_arg += '*{}*\\ **'.format(a['name'])
+            comma = ', '
+            print(one_arg, end='')
+
+        print(')**')
+
+    def print_one(self, helper):
+        self.print_proto(helper)
+        self.print_elem(helper)
+
+
+
+
+class PrinterHelpers(Printer):
+    """
+    A printer for dumping collected information about helpers as C header to
+    be included from BPF program.
+    @parser: A HeaderParser with Helper objects to print to standard output
+    """
+    def __init__(self, parser):
+        self.elements = parser.helpers
+
+    type_fwds = [
+            'struct bpf_fib_lookup',
+            'struct bpf_sk_lookup',
+            'struct bpf_perf_event_data',
+            'struct bpf_perf_event_value',
+            'struct bpf_pidns_info',
+            'struct bpf_redir_neigh',
+            'struct bpf_sock',
+            'struct bpf_sock_addr',
+            'struct bpf_sock_ops',
+            'struct bpf_sock_tuple',
+            'struct bpf_spin_lock',
+            'struct bpf_sysctl',
+            'struct bpf_tcp_sock',
+            'struct bpf_tunnel_key',
+            'struct bpf_xfrm_state',
+            'struct linux_binprm',
+            'struct pt_regs',
+            'struct sk_reuseport_md',
+            'struct sockaddr',
+            'struct tcphdr',
+            'struct seq_file',
+            'struct tcp6_sock',
+            'struct tcp_sock',
+            'struct tcp_timewait_sock',
+            'struct tcp_request_sock',
+            'struct udp6_sock',
+            'struct task_struct',
+
+            'struct __sk_buff',
+            'struct sk_msg_md',
+            'struct xdp_md',
+            'struct path',
+            'struct btf_ptr',
+            'struct inode',
+            'struct socket',
+            'struct file',
+    ]
+    known_types = {
+            '...',
+            'void',
+            'const void',
+            'char',
+            'const char',
+            'int',
+            'long',
+            'unsigned long',
+
+            '__be16',
+            '__be32',
+            '__wsum',
+
+            'struct bpf_fib_lookup',
+            'struct bpf_perf_event_data',
+            'struct bpf_perf_event_value',
+            'struct bpf_pidns_info',
+            'struct bpf_redir_neigh',
+            'struct bpf_sk_lookup',
+            'struct bpf_sock',
+            'struct bpf_sock_addr',
+            'struct bpf_sock_ops',
+            'struct bpf_sock_tuple',
+            'struct bpf_spin_lock',
+            'struct bpf_sysctl',
+            'struct bpf_tcp_sock',
+            'struct bpf_tunnel_key',
+            'struct bpf_xfrm_state',
+            'struct linux_binprm',
+            'struct pt_regs',
+            'struct sk_reuseport_md',
+            'struct sockaddr',
+            'struct tcphdr',
+            'struct seq_file',
+            'struct tcp6_sock',
+            'struct tcp_sock',
+            'struct tcp_timewait_sock',
+            'struct tcp_request_sock',
+            'struct udp6_sock',
+            'struct task_struct',
+            'struct path',
+            'struct btf_ptr',
+            'struct inode',
+            'struct socket',
+            'struct file',
+    }
+    mapped_types = {
+            'u8': '__u8',
+            'u16': '__u16',
+            'u32': '__u32',
+            'u64': '__u64',
+            's8': '__s8',
+            's16': '__s16',
+            's32': '__s32',
+            's64': '__s64',
+            'size_t': 'unsigned long',
+            'struct bpf_map': 'void',
+            'struct sk_buff': 'struct __sk_buff',
+            'const struct sk_buff': 'const struct __sk_buff',
+            'struct sk_msg_buff': 'struct sk_msg_md',
+            'struct xdp_buff': 'struct xdp_md',
+    }
+    # Helpers overloaded for different context types.
+    overloaded_helpers = [
+        'bpf_get_socket_cookie',
+        'bpf_sk_assign',
+    ]
+
+    def print_header(self):
+        header = '''\
+/* This is auto-generated file. See bpf_doc.py for details. */
+
+/* Forward declarations of BPF structs */'''
+
+        print(header)
+        for fwd in self.type_fwds:
+            print('%s;' % fwd)
+        print('')
+
+    def print_footer(self):
+        footer = ''
+        print(footer)
+
+    def map_type(self, t):
+        if t in self.known_types:
+            return t
+        if t in self.mapped_types:
+            return self.mapped_types[t]
+        print("Unrecognized type '%s', please add it to known types!" % t,
+              file=sys.stderr)
+        sys.exit(1)
+
+    seen_helpers = set()
+
+    def print_one(self, helper):
+        proto = helper.proto_break_down()
+
+        if proto['name'] in self.seen_helpers:
+            return
+        self.seen_helpers.add(proto['name'])
+
+        print('/*')
+        print(" * %s" % proto['name'])
+        print(" *")
+        if (helper.desc):
+            # Do not strip all newline characters: formatted code at the end of
+            # a section must be followed by a blank line.
+            for line in re.sub('\n$', '', helper.desc, count=1).split('\n'):
+                print(' *{}{}'.format(' \t' if line else '', line))
+
+        if (helper.ret):
+            print(' *')
+            print(' * Returns')
+            for line in helper.ret.rstrip().split('\n'):
+                print(' *{}{}'.format(' \t' if line else '', line))
+
+        print(' */')
+        print('static %s %s(*%s)(' % (self.map_type(proto['ret_type']),
+                                      proto['ret_star'], proto['name']), end='')
+        comma = ''
+        for i, a in enumerate(proto['args']):
+            t = a['type']
+            n = a['name']
+            if proto['name'] in self.overloaded_helpers and i == 0:
+                    t = 'void'
+                    n = 'ctx'
+            one_arg = '{}{}'.format(comma, self.map_type(t))
+            if n:
+                if a['star']:
+                    one_arg += ' {}'.format(a['star'])
+                else:
+                    one_arg += ' '
+                one_arg += '{}'.format(n)
+            comma = ', '
+            print(one_arg, end='')
+
+        print(') = (void *) %d;' % len(self.seen_helpers))
+        print('')
+
+###############################################################################
+
+# If script is launched from scripts/ from kernel tree and can access
+# ../include/uapi/linux/bpf.h, use it as a default name for the file to parse,
+# otherwise the --filename argument will be required from the command line.
+script = os.path.abspath(sys.argv[0])
+linuxRoot = os.path.dirname(os.path.dirname(script))
+bpfh = os.path.join(linuxRoot, 'include/uapi/linux/bpf.h')
+
+printers = {
+        'helpers': PrinterHelpersRST,
+}
+
+argParser = argparse.ArgumentParser(description="""
+Parse eBPF header file and generate documentation for the eBPF API.
+The RST-formatted output produced can be turned into a manual page with the
+rst2man utility.
+""")
+argParser.add_argument('--header', action='store_true',
+                       help='generate C header file')
+if (os.path.isfile(bpfh)):
+    argParser.add_argument('--filename', help='path to include/uapi/linux/bpf.h',
+                           default=bpfh)
+else:
+    argParser.add_argument('--filename', help='path to include/uapi/linux/bpf.h')
+argParser.add_argument('target', nargs='?', default='helpers',
+                       choices=printers.keys(), help='eBPF API target')
+args = argParser.parse_args()
+
+# Parse file.
+headerParser = HeaderParser(args.filename)
+headerParser.run()
+
+# Print formatted output to standard output.
+if args.header:
+    printer = PrinterHelpers(headerParser)
+else:
+    printer = printers[args.target](headerParser)
+printer.print_all()
diff --git a/scripts/bpf_helpers_doc.py b/scripts/bpf_helpers_doc.py
deleted file mode 100755
index 867ada23281c..000000000000
--- a/scripts/bpf_helpers_doc.py
+++ /dev/null
@@ -1,615 +0,0 @@
-#!/usr/bin/env python3
-# SPDX-License-Identifier: GPL-2.0-only
-#
-# Copyright (C) 2018-2019 Netronome Systems, Inc.
-
-# In case user attempts to run with Python 2.
-from __future__ import print_function
-
-import argparse
-import re
-import sys, os
-
-class NoHelperFound(BaseException):
-    pass
-
-class ParsingError(BaseException):
-    def __init__(self, line='<line not provided>', reader=None):
-        if reader:
-            BaseException.__init__(self,
-                                   'Error at file offset %d, parsing line: %s' %
-                                   (reader.tell(), line))
-        else:
-            BaseException.__init__(self, 'Error parsing line: %s' % line)
-
-class Helper(object):
-    """
-    An object representing the description of an eBPF helper function.
-    @proto: function prototype of the helper function
-    @desc: textual description of the helper function
-    @ret: description of the return value of the helper function
-    """
-    def __init__(self, proto='', desc='', ret=''):
-        self.proto = proto
-        self.desc = desc
-        self.ret = ret
-
-    def proto_break_down(self):
-        """
-        Break down helper function protocol into smaller chunks: return type,
-        name, distincts arguments.
-        """
-        arg_re = re.compile('((\w+ )*?(\w+|...))( (\**)(\w+))?$')
-        res = {}
-        proto_re = re.compile('(.+) (\**)(\w+)\(((([^,]+)(, )?){1,5})\)$')
-
-        capture = proto_re.match(self.proto)
-        res['ret_type'] = capture.group(1)
-        res['ret_star'] = capture.group(2)
-        res['name']     = capture.group(3)
-        res['args'] = []
-
-        args    = capture.group(4).split(', ')
-        for a in args:
-            capture = arg_re.match(a)
-            res['args'].append({
-                'type' : capture.group(1),
-                'star' : capture.group(5),
-                'name' : capture.group(6)
-            })
-
-        return res
-
-class HeaderParser(object):
-    """
-    An object used to parse a file in order to extract the documentation of a
-    list of eBPF helper functions. All the helpers that can be retrieved are
-    stored as Helper object, in the self.helpers() array.
-    @filename: name of file to parse, usually include/uapi/linux/bpf.h in the
-               kernel tree
-    """
-    def __init__(self, filename):
-        self.reader = open(filename, 'r')
-        self.line = ''
-        self.helpers = []
-
-    def parse_helper(self):
-        proto    = self.parse_proto()
-        desc     = self.parse_desc()
-        ret      = self.parse_ret()
-        return Helper(proto=proto, desc=desc, ret=ret)
-
-    def parse_proto(self):
-        # Argument can be of shape:
-        #   - "void"
-        #   - "type  name"
-        #   - "type *name"
-        #   - Same as above, with "const" and/or "struct" in front of type
-        #   - "..." (undefined number of arguments, for bpf_trace_printk())
-        # There is at least one term ("void"), and at most five arguments.
-        p = re.compile(' \* ?((.+) \**\w+\((((const )?(struct )?(\w+|\.\.\.)( \**\w+)?)(, )?){1,5}\))$')
-        capture = p.match(self.line)
-        if not capture:
-            raise NoHelperFound
-        self.line = self.reader.readline()
-        return capture.group(1)
-
-    def parse_desc(self):
-        p = re.compile(' \* ?(?:\t| {5,8})Description$')
-        capture = p.match(self.line)
-        if not capture:
-            # Helper can have empty description and we might be parsing another
-            # attribute: return but do not consume.
-            return ''
-        # Description can be several lines, some of them possibly empty, and it
-        # stops when another subsection title is met.
-        desc = ''
-        while True:
-            self.line = self.reader.readline()
-            if self.line == ' *\n':
-                desc += '\n'
-            else:
-                p = re.compile(' \* ?(?:\t| {5,8})(?:\t| {8})(.*)')
-                capture = p.match(self.line)
-                if capture:
-                    desc += capture.group(1) + '\n'
-                else:
-                    break
-        return desc
-
-    def parse_ret(self):
-        p = re.compile(' \* ?(?:\t| {5,8})Return$')
-        capture = p.match(self.line)
-        if not capture:
-            # Helper can have empty retval and we might be parsing another
-            # attribute: return but do not consume.
-            return ''
-        # Return value description can be several lines, some of them possibly
-        # empty, and it stops when another subsection title is met.
-        ret = ''
-        while True:
-            self.line = self.reader.readline()
-            if self.line == ' *\n':
-                ret += '\n'
-            else:
-                p = re.compile(' \* ?(?:\t| {5,8})(?:\t| {8})(.*)')
-                capture = p.match(self.line)
-                if capture:
-                    ret += capture.group(1) + '\n'
-                else:
-                    break
-        return ret
-
-    def run(self):
-        # Advance to start of helper function descriptions.
-        offset = self.reader.read().find('* Start of BPF helper function descriptions:')
-        if offset == -1:
-            raise Exception('Could not find start of eBPF helper descriptions list')
-        self.reader.seek(offset)
-        self.reader.readline()
-        self.reader.readline()
-        self.line = self.reader.readline()
-
-        while True:
-            try:
-                helper = self.parse_helper()
-                self.helpers.append(helper)
-            except NoHelperFound:
-                break
-
-        self.reader.close()
-
-###############################################################################
-
-class Printer(object):
-    """
-    A generic class for printers. Printers should be created with an array of
-    Helper objects, and implement a way to print them in the desired fashion.
-    @helpers: array of Helper objects to print to standard output
-    """
-    def __init__(self, helpers):
-        self.helpers = helpers
-
-    def print_header(self):
-        pass
-
-    def print_footer(self):
-        pass
-
-    def print_one(self, helper):
-        pass
-
-    def print_all(self):
-        self.print_header()
-        for helper in self.helpers:
-            self.print_one(helper)
-        self.print_footer()
-
-class PrinterRST(Printer):
-    """
-    A printer for dumping collected information about helpers as a ReStructured
-    Text page compatible with the rst2man program, which can be used to
-    generate a manual page for the helpers.
-    @helpers: array of Helper objects to print to standard output
-    """
-    def print_header(self):
-        header = '''\
-.. Copyright (C) All BPF authors and contributors from 2014 to present.
-.. See git log include/uapi/linux/bpf.h in kernel tree for details.
-.. 
-.. %%%LICENSE_START(VERBATIM)
-.. Permission is granted to make and distribute verbatim copies of this
-.. manual provided the copyright notice and this permission notice are
-.. preserved on all copies.
-.. 
-.. Permission is granted to copy and distribute modified versions of this
-.. manual under the conditions for verbatim copying, provided that the
-.. entire resulting derived work is distributed under the terms of a
-.. permission notice identical to this one.
-.. 
-.. Since the Linux kernel and libraries are constantly changing, this
-.. manual page may be incorrect or out-of-date.  The author(s) assume no
-.. responsibility for errors or omissions, or for damages resulting from
-.. the use of the information contained herein.  The author(s) may not
-.. have taken the same level of care in the production of this manual,
-.. which is licensed free of charge, as they might when working
-.. professionally.
-.. 
-.. Formatted or processed versions of this manual, if unaccompanied by
-.. the source, must acknowledge the copyright and authors of this work.
-.. %%%LICENSE_END
-.. 
-.. Please do not edit this file. It was generated from the documentation
-.. located in file include/uapi/linux/bpf.h of the Linux kernel sources
-.. (helpers description), and from scripts/bpf_helpers_doc.py in the same
-.. repository (header and footer).
-
-===========
-BPF-HELPERS
-===========
--------------------------------------------------------------------------------
-list of eBPF helper functions
--------------------------------------------------------------------------------
-
-:Manual section: 7
-
-DESCRIPTION
-===========
-
-The extended Berkeley Packet Filter (eBPF) subsystem consists in programs
-written in a pseudo-assembly language, then attached to one of the several
-kernel hooks and run in reaction of specific events. This framework differs
-from the older, "classic" BPF (or "cBPF") in several aspects, one of them being
-the ability to call special functions (or "helpers") from within a program.
-These functions are restricted to a white-list of helpers defined in the
-kernel.
-
-These helpers are used by eBPF programs to interact with the system, or with
-the context in which they work. For instance, they can be used to print
-debugging messages, to get the time since the system was booted, to interact
-with eBPF maps, or to manipulate network packets. Since there are several eBPF
-program types, and that they do not run in the same context, each program type
-can only call a subset of those helpers.
-
-Due to eBPF conventions, a helper can not have more than five arguments.
-
-Internally, eBPF programs call directly into the compiled helper functions
-without requiring any foreign-function interface. As a result, calling helpers
-introduces no overhead, thus offering excellent performance.
-
-This document is an attempt to list and document the helpers available to eBPF
-developers. They are sorted by chronological order (the oldest helpers in the
-kernel at the top).
-
-HELPERS
-=======
-'''
-        print(header)
-
-    def print_footer(self):
-        footer = '''
-EXAMPLES
-========
-
-Example usage for most of the eBPF helpers listed in this manual page are
-available within the Linux kernel sources, at the following locations:
-
-* *samples/bpf/*
-* *tools/testing/selftests/bpf/*
-
-LICENSE
-=======
-
-eBPF programs can have an associated license, passed along with the bytecode
-instructions to the kernel when the programs are loaded. The format for that
-string is identical to the one in use for kernel modules (Dual licenses, such
-as "Dual BSD/GPL", may be used). Some helper functions are only accessible to
-programs that are compatible with the GNU Privacy License (GPL).
-
-In order to use such helpers, the eBPF program must be loaded with the correct
-license string passed (via **attr**) to the **bpf**\ () system call, and this
-generally translates into the C source code of the program containing a line
-similar to the following:
-
-::
-
-	char ____license[] __attribute__((section("license"), used)) = "GPL";
-
-IMPLEMENTATION
-==============
-
-This manual page is an effort to document the existing eBPF helper functions.
-But as of this writing, the BPF sub-system is under heavy development. New eBPF
-program or map types are added, along with new helper functions. Some helpers
-are occasionally made available for additional program types. So in spite of
-the efforts of the community, this page might not be up-to-date. If you want to
-check by yourself what helper functions exist in your kernel, or what types of
-programs they can support, here are some files among the kernel tree that you
-may be interested in:
-
-* *include/uapi/linux/bpf.h* is the main BPF header. It contains the full list
-  of all helper functions, as well as many other BPF definitions including most
-  of the flags, structs or constants used by the helpers.
-* *net/core/filter.c* contains the definition of most network-related helper
-  functions, and the list of program types from which they can be used.
-* *kernel/trace/bpf_trace.c* is the equivalent for most tracing program-related
-  helpers.
-* *kernel/bpf/verifier.c* contains the functions used to check that valid types
-  of eBPF maps are used with a given helper function.
-* *kernel/bpf/* directory contains other files in which additional helpers are
-  defined (for cgroups, sockmaps, etc.).
-* The bpftool utility can be used to probe the availability of helper functions
-  on the system (as well as supported program and map types, and a number of
-  other parameters). To do so, run **bpftool feature probe** (see
-  **bpftool-feature**\ (8) for details). Add the **unprivileged** keyword to
-  list features available to unprivileged users.
-
-Compatibility between helper functions and program types can generally be found
-in the files where helper functions are defined. Look for the **struct
-bpf_func_proto** objects and for functions returning them: these functions
-contain a list of helpers that a given program type can call. Note that the
-**default:** label of the **switch ... case** used to filter helpers can call
-other functions, themselves allowing access to additional helpers. The
-requirement for GPL license is also in those **struct bpf_func_proto**.
-
-Compatibility between helper functions and map types can be found in the
-**check_map_func_compatibility**\ () function in file *kernel/bpf/verifier.c*.
-
-Helper functions that invalidate the checks on **data** and **data_end**
-pointers for network processing are listed in function
-**bpf_helper_changes_pkt_data**\ () in file *net/core/filter.c*.
-
-SEE ALSO
-========
-
-**bpf**\ (2),
-**bpftool**\ (8),
-**cgroups**\ (7),
-**ip**\ (8),
-**perf_event_open**\ (2),
-**sendmsg**\ (2),
-**socket**\ (7),
-**tc-bpf**\ (8)'''
-        print(footer)
-
-    def print_proto(self, helper):
-        """
-        Format function protocol with bold and italics markers. This makes RST
-        file less readable, but gives nice results in the manual page.
-        """
-        proto = helper.proto_break_down()
-
-        print('**%s %s%s(' % (proto['ret_type'],
-                              proto['ret_star'].replace('*', '\\*'),
-                              proto['name']),
-              end='')
-
-        comma = ''
-        for a in proto['args']:
-            one_arg = '{}{}'.format(comma, a['type'])
-            if a['name']:
-                if a['star']:
-                    one_arg += ' {}**\ '.format(a['star'].replace('*', '\\*'))
-                else:
-                    one_arg += '** '
-                one_arg += '*{}*\\ **'.format(a['name'])
-            comma = ', '
-            print(one_arg, end='')
-
-        print(')**')
-
-    def print_one(self, helper):
-        self.print_proto(helper)
-
-        if (helper.desc):
-            print('\tDescription')
-            # Do not strip all newline characters: formatted code at the end of
-            # a section must be followed by a blank line.
-            for line in re.sub('\n$', '', helper.desc, count=1).split('\n'):
-                print('{}{}'.format('\t\t' if line else '', line))
-
-        if (helper.ret):
-            print('\tReturn')
-            for line in helper.ret.rstrip().split('\n'):
-                print('{}{}'.format('\t\t' if line else '', line))
-
-        print('')
-
-class PrinterHelpers(Printer):
-    """
-    A printer for dumping collected information about helpers as C header to
-    be included from BPF program.
-    @helpers: array of Helper objects to print to standard output
-    """
-
-    type_fwds = [
-            'struct bpf_fib_lookup',
-            'struct bpf_sk_lookup',
-            'struct bpf_perf_event_data',
-            'struct bpf_perf_event_value',
-            'struct bpf_pidns_info',
-            'struct bpf_redir_neigh',
-            'struct bpf_sock',
-            'struct bpf_sock_addr',
-            'struct bpf_sock_ops',
-            'struct bpf_sock_tuple',
-            'struct bpf_spin_lock',
-            'struct bpf_sysctl',
-            'struct bpf_tcp_sock',
-            'struct bpf_tunnel_key',
-            'struct bpf_xfrm_state',
-            'struct linux_binprm',
-            'struct pt_regs',
-            'struct sk_reuseport_md',
-            'struct sockaddr',
-            'struct tcphdr',
-            'struct seq_file',
-            'struct tcp6_sock',
-            'struct tcp_sock',
-            'struct tcp_timewait_sock',
-            'struct tcp_request_sock',
-            'struct udp6_sock',
-            'struct task_struct',
-
-            'struct __sk_buff',
-            'struct sk_msg_md',
-            'struct xdp_md',
-            'struct path',
-            'struct btf_ptr',
-            'struct inode',
-            'struct socket',
-            'struct file',
-    ]
-    known_types = {
-            '...',
-            'void',
-            'const void',
-            'char',
-            'const char',
-            'int',
-            'long',
-            'unsigned long',
-
-            '__be16',
-            '__be32',
-            '__wsum',
-
-            'struct bpf_fib_lookup',
-            'struct bpf_perf_event_data',
-            'struct bpf_perf_event_value',
-            'struct bpf_pidns_info',
-            'struct bpf_redir_neigh',
-            'struct bpf_sk_lookup',
-            'struct bpf_sock',
-            'struct bpf_sock_addr',
-            'struct bpf_sock_ops',
-            'struct bpf_sock_tuple',
-            'struct bpf_spin_lock',
-            'struct bpf_sysctl',
-            'struct bpf_tcp_sock',
-            'struct bpf_tunnel_key',
-            'struct bpf_xfrm_state',
-            'struct linux_binprm',
-            'struct pt_regs',
-            'struct sk_reuseport_md',
-            'struct sockaddr',
-            'struct tcphdr',
-            'struct seq_file',
-            'struct tcp6_sock',
-            'struct tcp_sock',
-            'struct tcp_timewait_sock',
-            'struct tcp_request_sock',
-            'struct udp6_sock',
-            'struct task_struct',
-            'struct path',
-            'struct btf_ptr',
-            'struct inode',
-            'struct socket',
-            'struct file',
-    }
-    mapped_types = {
-            'u8': '__u8',
-            'u16': '__u16',
-            'u32': '__u32',
-            'u64': '__u64',
-            's8': '__s8',
-            's16': '__s16',
-            's32': '__s32',
-            's64': '__s64',
-            'size_t': 'unsigned long',
-            'struct bpf_map': 'void',
-            'struct sk_buff': 'struct __sk_buff',
-            'const struct sk_buff': 'const struct __sk_buff',
-            'struct sk_msg_buff': 'struct sk_msg_md',
-            'struct xdp_buff': 'struct xdp_md',
-    }
-    # Helpers overloaded for different context types.
-    overloaded_helpers = [
-        'bpf_get_socket_cookie',
-        'bpf_sk_assign',
-    ]
-
-    def print_header(self):
-        header = '''\
-/* This is auto-generated file. See bpf_helpers_doc.py for details. */
-
-/* Forward declarations of BPF structs */'''
-
-        print(header)
-        for fwd in self.type_fwds:
-            print('%s;' % fwd)
-        print('')
-
-    def print_footer(self):
-        footer = ''
-        print(footer)
-
-    def map_type(self, t):
-        if t in self.known_types:
-            return t
-        if t in self.mapped_types:
-            return self.mapped_types[t]
-        print("Unrecognized type '%s', please add it to known types!" % t,
-              file=sys.stderr)
-        sys.exit(1)
-
-    seen_helpers = set()
-
-    def print_one(self, helper):
-        proto = helper.proto_break_down()
-
-        if proto['name'] in self.seen_helpers:
-            return
-        self.seen_helpers.add(proto['name'])
-
-        print('/*')
-        print(" * %s" % proto['name'])
-        print(" *")
-        if (helper.desc):
-            # Do not strip all newline characters: formatted code at the end of
-            # a section must be followed by a blank line.
-            for line in re.sub('\n$', '', helper.desc, count=1).split('\n'):
-                print(' *{}{}'.format(' \t' if line else '', line))
-
-        if (helper.ret):
-            print(' *')
-            print(' * Returns')
-            for line in helper.ret.rstrip().split('\n'):
-                print(' *{}{}'.format(' \t' if line else '', line))
-
-        print(' */')
-        print('static %s %s(*%s)(' % (self.map_type(proto['ret_type']),
-                                      proto['ret_star'], proto['name']), end='')
-        comma = ''
-        for i, a in enumerate(proto['args']):
-            t = a['type']
-            n = a['name']
-            if proto['name'] in self.overloaded_helpers and i == 0:
-                    t = 'void'
-                    n = 'ctx'
-            one_arg = '{}{}'.format(comma, self.map_type(t))
-            if n:
-                if a['star']:
-                    one_arg += ' {}'.format(a['star'])
-                else:
-                    one_arg += ' '
-                one_arg += '{}'.format(n)
-            comma = ', '
-            print(one_arg, end='')
-
-        print(') = (void *) %d;' % len(self.seen_helpers))
-        print('')
-
-###############################################################################
-
-# If script is launched from scripts/ from kernel tree and can access
-# ../include/uapi/linux/bpf.h, use it as a default name for the file to parse,
-# otherwise the --filename argument will be required from the command line.
-script = os.path.abspath(sys.argv[0])
-linuxRoot = os.path.dirname(os.path.dirname(script))
-bpfh = os.path.join(linuxRoot, 'include/uapi/linux/bpf.h')
-
-argParser = argparse.ArgumentParser(description="""
-Parse eBPF header file and generate documentation for eBPF helper functions.
-The RST-formatted output produced can be turned into a manual page with the
-rst2man utility.
-""")
-argParser.add_argument('--header', action='store_true',
-                       help='generate C header file')
-if (os.path.isfile(bpfh)):
-    argParser.add_argument('--filename', help='path to include/uapi/linux/bpf.h',
-                           default=bpfh)
-else:
-    argParser.add_argument('--filename', help='path to include/uapi/linux/bpf.h')
-args = argParser.parse_args()
-
-# Parse file.
-headerParser = HeaderParser(args.filename)
-headerParser.run()
-
-# Print formatted output to standard output.
-if args.header:
-    printer = PrinterHelpers(headerParser.helpers)
-else:
-    printer = PrinterRST(headerParser.helpers)
-printer.print_all()
diff --git a/tools/bpf/Makefile.helpers b/tools/bpf/Makefile.helpers
index 854d084026dd..a26599022fd6 100644
--- a/tools/bpf/Makefile.helpers
+++ b/tools/bpf/Makefile.helpers
@@ -35,7 +35,7 @@ man7: $(DOC_MAN7)
 RST2MAN_DEP := $(shell command -v rst2man 2>/dev/null)
 
 $(OUTPUT)$(HELPERS_RST): $(UP2DIR)../../include/uapi/linux/bpf.h
-	$(QUIET_GEN)$(UP2DIR)../../scripts/bpf_helpers_doc.py --filename $< > $@
+	$(QUIET_GEN)$(UP2DIR)../../scripts/bpf_doc.py --filename $< > $@
 
 $(OUTPUT)%.7: $(OUTPUT)%.rst
 ifndef RST2MAN_DEP
diff --git a/tools/include/uapi/linux/bpf.h b/tools/include/uapi/linux/bpf.h
index b89af20cfa19..b4c5c529ad17 100644
--- a/tools/include/uapi/linux/bpf.h
+++ b/tools/include/uapi/linux/bpf.h
@@ -729,7 +729,7 @@ union bpf_attr {
  * parsed and used to produce a manual page. The workflow is the following,
  * and requires the rst2man utility:
  *
- *     $ ./scripts/bpf_helpers_doc.py \
+ *     $ ./scripts/bpf_doc.py \
  *             --filename include/uapi/linux/bpf.h > /tmp/bpf-helpers.rst
  *     $ rst2man /tmp/bpf-helpers.rst > /tmp/bpf-helpers.7
  *     $ man /tmp/bpf-helpers.7
diff --git a/tools/lib/bpf/Makefile b/tools/lib/bpf/Makefile
index 887a494ad5fc..8170f88e8ea6 100644
--- a/tools/lib/bpf/Makefile
+++ b/tools/lib/bpf/Makefile
@@ -158,7 +158,7 @@ $(BPF_IN_STATIC): force $(BPF_HELPER_DEFS)
 	$(Q)$(MAKE) $(build)=libbpf OUTPUT=$(STATIC_OBJDIR)
 
 $(BPF_HELPER_DEFS): $(srctree)/tools/include/uapi/linux/bpf.h
-	$(QUIET_GEN)$(srctree)/scripts/bpf_helpers_doc.py --header \
+	$(QUIET_GEN)$(srctree)/scripts/bpf_doc.py --header \
 		--file $(srctree)/tools/include/uapi/linux/bpf.h > $(BPF_HELPER_DEFS)
 
 $(OUTPUT)libbpf.so: $(OUTPUT)libbpf.so.$(LIBBPF_VERSION)
diff --git a/tools/perf/MANIFEST b/tools/perf/MANIFEST
index 5d7b947320fb..f05c4d48fd7e 100644
--- a/tools/perf/MANIFEST
+++ b/tools/perf/MANIFEST
@@ -20,4 +20,4 @@ tools/lib/bitmap.c
 tools/lib/str_error_r.c
 tools/lib/vsprintf.c
 tools/lib/zalloc.c
-scripts/bpf_helpers_doc.py
+scripts/bpf_doc.py
-- 
cgit v1.2.3-71-gd317


From 7c32e8f8bc33a5f4b113a630857e46634e3e143b Mon Sep 17 00:00:00 2001
From: Lorenz Bauer <lmb@cloudflare.com>
Date: Wed, 3 Mar 2021 10:18:13 +0000
Subject: bpf: Add PROG_TEST_RUN support for sk_lookup programs

Allow to pass sk_lookup programs to PROG_TEST_RUN. User space
provides the full bpf_sk_lookup struct as context. Since the
context includes a socket pointer that can't be exposed
to user space we define that PROG_TEST_RUN returns the cookie
of the selected socket or zero in place of the socket pointer.

We don't support testing programs that select a reuseport socket,
since this would mean running another (unrelated) BPF program
from the sk_lookup test handler.

Signed-off-by: Lorenz Bauer <lmb@cloudflare.com>
Signed-off-by: Alexei Starovoitov <ast@kernel.org>
Link: https://lore.kernel.org/bpf/20210303101816.36774-3-lmb@cloudflare.com
---
 include/linux/bpf.h            |  10 ++++
 include/uapi/linux/bpf.h       |   5 +-
 net/bpf/test_run.c             | 105 +++++++++++++++++++++++++++++++++++++++++
 net/core/filter.c              |   1 +
 tools/include/uapi/linux/bpf.h |   5 +-
 5 files changed, 124 insertions(+), 2 deletions(-)

(limited to 'include/uapi/linux')

diff --git a/include/linux/bpf.h b/include/linux/bpf.h
index 4c730863fa77..c931bc97019d 100644
--- a/include/linux/bpf.h
+++ b/include/linux/bpf.h
@@ -1491,6 +1491,9 @@ int bpf_prog_test_run_flow_dissector(struct bpf_prog *prog,
 int bpf_prog_test_run_raw_tp(struct bpf_prog *prog,
 			     const union bpf_attr *kattr,
 			     union bpf_attr __user *uattr);
+int bpf_prog_test_run_sk_lookup(struct bpf_prog *prog,
+				const union bpf_attr *kattr,
+				union bpf_attr __user *uattr);
 bool btf_ctx_access(int off, int size, enum bpf_access_type type,
 		    const struct bpf_prog *prog,
 		    struct bpf_insn_access_aux *info);
@@ -1692,6 +1695,13 @@ static inline int bpf_prog_test_run_flow_dissector(struct bpf_prog *prog,
 	return -ENOTSUPP;
 }
 
+static inline int bpf_prog_test_run_sk_lookup(struct bpf_prog *prog,
+					      const union bpf_attr *kattr,
+					      union bpf_attr __user *uattr)
+{
+	return -ENOTSUPP;
+}
+
 static inline void bpf_map_put(struct bpf_map *map)
 {
 }
diff --git a/include/uapi/linux/bpf.h b/include/uapi/linux/bpf.h
index 63a56ed6a785..7f530e349aff 100644
--- a/include/uapi/linux/bpf.h
+++ b/include/uapi/linux/bpf.h
@@ -5953,7 +5953,10 @@ struct bpf_pidns_info {
 
 /* User accessible data for SK_LOOKUP programs. Add new fields at the end. */
 struct bpf_sk_lookup {
-	__bpf_md_ptr(struct bpf_sock *, sk); /* Selected socket */
+	union {
+		__bpf_md_ptr(struct bpf_sock *, sk); /* Selected socket */
+		__u64 cookie; /* Non-zero if socket was selected in PROG_TEST_RUN */
+	};
 
 	__u32 family;		/* Protocol family (AF_INET, AF_INET6) */
 	__u32 protocol;		/* IP protocol (IPPROTO_TCP, IPPROTO_UDP) */
diff --git a/net/bpf/test_run.c b/net/bpf/test_run.c
index eb3c78cd4d7c..0abdd67f44b1 100644
--- a/net/bpf/test_run.c
+++ b/net/bpf/test_run.c
@@ -10,8 +10,10 @@
 #include <net/bpf_sk_storage.h>
 #include <net/sock.h>
 #include <net/tcp.h>
+#include <net/net_namespace.h>
 #include <linux/error-injection.h>
 #include <linux/smp.h>
+#include <linux/sock_diag.h>
 
 #define CREATE_TRACE_POINTS
 #include <trace/events/bpf_test_run.h>
@@ -781,3 +783,106 @@ out:
 	kfree(data);
 	return ret;
 }
+
+int bpf_prog_test_run_sk_lookup(struct bpf_prog *prog, const union bpf_attr *kattr,
+				union bpf_attr __user *uattr)
+{
+	struct bpf_test_timer t = { NO_PREEMPT };
+	struct bpf_prog_array *progs = NULL;
+	struct bpf_sk_lookup_kern ctx = {};
+	u32 repeat = kattr->test.repeat;
+	struct bpf_sk_lookup *user_ctx;
+	u32 retval, duration;
+	int ret = -EINVAL;
+
+	if (prog->type != BPF_PROG_TYPE_SK_LOOKUP)
+		return -EINVAL;
+
+	if (kattr->test.flags || kattr->test.cpu)
+		return -EINVAL;
+
+	if (kattr->test.data_in || kattr->test.data_size_in || kattr->test.data_out ||
+	    kattr->test.data_size_out)
+		return -EINVAL;
+
+	if (!repeat)
+		repeat = 1;
+
+	user_ctx = bpf_ctx_init(kattr, sizeof(*user_ctx));
+	if (IS_ERR(user_ctx))
+		return PTR_ERR(user_ctx);
+
+	if (!user_ctx)
+		return -EINVAL;
+
+	if (user_ctx->sk)
+		goto out;
+
+	if (!range_is_zero(user_ctx, offsetofend(typeof(*user_ctx), local_port), sizeof(*user_ctx)))
+		goto out;
+
+	if (user_ctx->local_port > U16_MAX || user_ctx->remote_port > U16_MAX) {
+		ret = -ERANGE;
+		goto out;
+	}
+
+	ctx.family = (u16)user_ctx->family;
+	ctx.protocol = (u16)user_ctx->protocol;
+	ctx.dport = (u16)user_ctx->local_port;
+	ctx.sport = (__force __be16)user_ctx->remote_port;
+
+	switch (ctx.family) {
+	case AF_INET:
+		ctx.v4.daddr = (__force __be32)user_ctx->local_ip4;
+		ctx.v4.saddr = (__force __be32)user_ctx->remote_ip4;
+		break;
+
+#if IS_ENABLED(CONFIG_IPV6)
+	case AF_INET6:
+		ctx.v6.daddr = (struct in6_addr *)user_ctx->local_ip6;
+		ctx.v6.saddr = (struct in6_addr *)user_ctx->remote_ip6;
+		break;
+#endif
+
+	default:
+		ret = -EAFNOSUPPORT;
+		goto out;
+	}
+
+	progs = bpf_prog_array_alloc(1, GFP_KERNEL);
+	if (!progs) {
+		ret = -ENOMEM;
+		goto out;
+	}
+
+	progs->items[0].prog = prog;
+
+	bpf_test_timer_enter(&t);
+	do {
+		ctx.selected_sk = NULL;
+		retval = BPF_PROG_SK_LOOKUP_RUN_ARRAY(progs, ctx, BPF_PROG_RUN);
+	} while (bpf_test_timer_continue(&t, repeat, &ret, &duration));
+	bpf_test_timer_leave(&t);
+
+	if (ret < 0)
+		goto out;
+
+	user_ctx->cookie = 0;
+	if (ctx.selected_sk) {
+		if (ctx.selected_sk->sk_reuseport && !ctx.no_reuseport) {
+			ret = -EOPNOTSUPP;
+			goto out;
+		}
+
+		user_ctx->cookie = sock_gen_cookie(ctx.selected_sk);
+	}
+
+	ret = bpf_test_finish(kattr, uattr, NULL, 0, retval, duration);
+	if (!ret)
+		ret = bpf_ctx_finish(kattr, uattr, user_ctx, sizeof(*user_ctx));
+
+out:
+	bpf_prog_array_free(progs);
+	kfree(user_ctx);
+	return ret;
+}
diff --git a/net/core/filter.c b/net/core/filter.c
index 13bcf248ee7b..a526db494c62 100644
--- a/net/core/filter.c
+++ b/net/core/filter.c
@@ -10457,6 +10457,7 @@ static u32 sk_lookup_convert_ctx_access(enum bpf_access_type type,
 }
 
 const struct bpf_prog_ops sk_lookup_prog_ops = {
+	.test_run = bpf_prog_test_run_sk_lookup,
 };
 
 const struct bpf_verifier_ops sk_lookup_verifier_ops = {
diff --git a/tools/include/uapi/linux/bpf.h b/tools/include/uapi/linux/bpf.h
index 63a56ed6a785..7f530e349aff 100644
--- a/tools/include/uapi/linux/bpf.h
+++ b/tools/include/uapi/linux/bpf.h
@@ -5953,7 +5953,10 @@ struct bpf_pidns_info {
 
 /* User accessible data for SK_LOOKUP programs. Add new fields at the end. */
 struct bpf_sk_lookup {
-	__bpf_md_ptr(struct bpf_sock *, sk); /* Selected socket */
+	union {
+		__bpf_md_ptr(struct bpf_sock *, sk); /* Selected socket */
+		__u64 cookie; /* Non-zero if socket was selected in PROG_TEST_RUN */
+	};
 
 	__u32 family;		/* Protocol family (AF_INET, AF_INET6) */
 	__u32 protocol;		/* IP protocol (IPPROTO_TCP, IPPROTO_UDP) */
-- 
cgit v1.2.3-71-gd317


From d01b59c9ae94560fbcceaafeef39784d72765033 Mon Sep 17 00:00:00 2001
From: Xuesen Huang <huangxuesen@kuaishou.com>
Date: Thu, 4 Mar 2021 14:40:46 +0800
Subject: bpf: Add bpf_skb_adjust_room flag BPF_F_ADJ_ROOM_ENCAP_L2_ETH

bpf_skb_adjust_room sets the inner_protocol as skb->protocol for packets
encapsulation. But that is not appropriate when pushing Ethernet header.

Add an option to further specify encap L2 type and set the inner_protocol
as ETH_P_TEB.

Suggested-by: Willem de Bruijn <willemb@google.com>
Signed-off-by: Xuesen Huang <huangxuesen@kuaishou.com>
Signed-off-by: Zhiyong Cheng <chengzhiyong@kuaishou.com>
Signed-off-by: Li Wang <wangli09@kuaishou.com>
Signed-off-by: Daniel Borkmann <daniel@iogearbox.net>
Acked-by: Willem de Bruijn <willemb@google.com>
Link: https://lore.kernel.org/bpf/20210304064046.6232-1-hxseverything@gmail.com
---
 include/uapi/linux/bpf.h       |  5 +++++
 net/core/filter.c              | 11 ++++++++++-
 tools/include/uapi/linux/bpf.h |  5 +++++
 3 files changed, 20 insertions(+), 1 deletion(-)

(limited to 'include/uapi/linux')

diff --git a/include/uapi/linux/bpf.h b/include/uapi/linux/bpf.h
index 7f530e349aff..2d3036e292a9 100644
--- a/include/uapi/linux/bpf.h
+++ b/include/uapi/linux/bpf.h
@@ -2484,6 +2484,10 @@ union bpf_attr {
  *		  Use with ENCAP_L3/L4 flags to further specify the tunnel
  *		  type; *len* is the length of the inner MAC header.
  *
+ *		* **BPF_F_ADJ_ROOM_ENCAP_L2_ETH**:
+ *		  Use with BPF_F_ADJ_ROOM_ENCAP_L2 flag to further specify the
+ *		  L2 type as Ethernet.
+ *
  * 		A call to this helper is susceptible to change the underlying
  * 		packet buffer. Therefore, at load time, all checks on pointers
  * 		previously done by the verifier are invalidated and must be
@@ -4916,6 +4920,7 @@ enum {
 	BPF_F_ADJ_ROOM_ENCAP_L4_GRE	= (1ULL << 3),
 	BPF_F_ADJ_ROOM_ENCAP_L4_UDP	= (1ULL << 4),
 	BPF_F_ADJ_ROOM_NO_CSUM_RESET	= (1ULL << 5),
+	BPF_F_ADJ_ROOM_ENCAP_L2_ETH	= (1ULL << 6),
 };
 
 enum {
diff --git a/net/core/filter.c b/net/core/filter.c
index a526db494c62..588b19ba0da8 100644
--- a/net/core/filter.c
+++ b/net/core/filter.c
@@ -3409,6 +3409,7 @@ static u32 bpf_skb_net_base_len(const struct sk_buff *skb)
 					 BPF_F_ADJ_ROOM_ENCAP_L3_MASK | \
 					 BPF_F_ADJ_ROOM_ENCAP_L4_GRE | \
 					 BPF_F_ADJ_ROOM_ENCAP_L4_UDP | \
+					 BPF_F_ADJ_ROOM_ENCAP_L2_ETH | \
 					 BPF_F_ADJ_ROOM_ENCAP_L2( \
 					  BPF_ADJ_ROOM_ENCAP_L2_MASK))
 
@@ -3445,6 +3446,10 @@ static int bpf_skb_net_grow(struct sk_buff *skb, u32 off, u32 len_diff,
 		    flags & BPF_F_ADJ_ROOM_ENCAP_L4_UDP)
 			return -EINVAL;
 
+		if (flags & BPF_F_ADJ_ROOM_ENCAP_L2_ETH &&
+		    inner_mac_len < ETH_HLEN)
+			return -EINVAL;
+
 		if (skb->encapsulation)
 			return -EALREADY;
 
@@ -3463,7 +3468,11 @@ static int bpf_skb_net_grow(struct sk_buff *skb, u32 off, u32 len_diff,
 		skb->inner_mac_header = inner_net - inner_mac_len;
 		skb->inner_network_header = inner_net;
 		skb->inner_transport_header = inner_trans;
-		skb_set_inner_protocol(skb, skb->protocol);
+
+		if (flags & BPF_F_ADJ_ROOM_ENCAP_L2_ETH)
+			skb_set_inner_protocol(skb, htons(ETH_P_TEB));
+		else
+			skb_set_inner_protocol(skb, skb->protocol);
 
 		skb->encapsulation = 1;
 		skb_set_network_header(skb, mac_len);
diff --git a/tools/include/uapi/linux/bpf.h b/tools/include/uapi/linux/bpf.h
index 7f530e349aff..2d3036e292a9 100644
--- a/tools/include/uapi/linux/bpf.h
+++ b/tools/include/uapi/linux/bpf.h
@@ -2484,6 +2484,10 @@ union bpf_attr {
  *		  Use with ENCAP_L3/L4 flags to further specify the tunnel
  *		  type; *len* is the length of the inner MAC header.
  *
+ *		* **BPF_F_ADJ_ROOM_ENCAP_L2_ETH**:
+ *		  Use with BPF_F_ADJ_ROOM_ENCAP_L2 flag to further specify the
+ *		  L2 type as Ethernet.
+ *
  * 		A call to this helper is susceptible to change the underlying
  * 		packet buffer. Therefore, at load time, all checks on pointers
  * 		previously done by the verifier are invalidated and must be
@@ -4916,6 +4920,7 @@ enum {
 	BPF_F_ADJ_ROOM_ENCAP_L4_GRE	= (1ULL << 3),
 	BPF_F_ADJ_ROOM_ENCAP_L4_UDP	= (1ULL << 4),
 	BPF_F_ADJ_ROOM_NO_CSUM_RESET	= (1ULL << 5),
+	BPF_F_ADJ_ROOM_ENCAP_L2_ETH	= (1ULL << 6),
 };
 
 enum {
-- 
cgit v1.2.3-71-gd317


From 0ae0337f929a970ee8d83e0e95e6b8d05562ce3b Mon Sep 17 00:00:00 2001
From: Anton Yakovlev <anton.yakovlev@opensynergy.com>
Date: Tue, 2 Mar 2021 17:47:01 +0100
Subject: uapi: virtio_ids: add a sound device type ID from OASIS spec

The OASIS virtio spec defines a sound device type ID that is not
present in the header yet.

Signed-off-by: Anton Yakovlev <anton.yakovlev@opensynergy.com>
Link: https://lore.kernel.org/r/20210302164709.3142702-2-anton.yakovlev@opensynergy.com
Signed-off-by: Takashi Iwai <tiwai@suse.de>
---
 include/uapi/linux/virtio_ids.h | 1 +
 1 file changed, 1 insertion(+)

(limited to 'include/uapi/linux')

diff --git a/include/uapi/linux/virtio_ids.h b/include/uapi/linux/virtio_ids.h
index bc1c0621f5ed..029a2e07a7f9 100644
--- a/include/uapi/linux/virtio_ids.h
+++ b/include/uapi/linux/virtio_ids.h
@@ -51,6 +51,7 @@
 #define VIRTIO_ID_PSTORE		22 /* virtio pstore device */
 #define VIRTIO_ID_IOMMU			23 /* virtio IOMMU */
 #define VIRTIO_ID_MEM			24 /* virtio mem */
+#define VIRTIO_ID_SOUND			25 /* virtio sound */
 #define VIRTIO_ID_FS			26 /* virtio filesystem */
 #define VIRTIO_ID_PMEM			27 /* virtio pmem */
 #define VIRTIO_ID_MAC80211_HWSIM	29 /* virtio mac80211-hwsim */
-- 
cgit v1.2.3-71-gd317


From de3a9980d8c34b2479173e809afa820473db676a Mon Sep 17 00:00:00 2001
From: Anton Yakovlev <anton.yakovlev@opensynergy.com>
Date: Tue, 2 Mar 2021 17:47:02 +0100
Subject: ALSA: virtio: add virtio sound driver

Introduce skeleton of the virtio sound driver. The driver implements
the virtio sound device specification, which has become part of the
virtio standard.

Initial initialization of the device, virtqueues and creation of an
empty ALSA sound device.

Signed-off-by: Anton Yakovlev <anton.yakovlev@opensynergy.com>
Link: https://lore.kernel.org/r/20210302164709.3142702-3-anton.yakovlev@opensynergy.com
Signed-off-by: Takashi Iwai <tiwai@suse.de>
---
 MAINTAINERS                     |   9 ++
 include/uapi/linux/virtio_snd.h | 334 ++++++++++++++++++++++++++++++++++++++++
 sound/Kconfig                   |   2 +
 sound/Makefile                  |   3 +-
 sound/virtio/Kconfig            |  10 ++
 sound/virtio/Makefile           |   7 +
 sound/virtio/virtio_card.c      | 324 ++++++++++++++++++++++++++++++++++++++
 sound/virtio/virtio_card.h      |  65 ++++++++
 8 files changed, 753 insertions(+), 1 deletion(-)
 create mode 100644 include/uapi/linux/virtio_snd.h
 create mode 100644 sound/virtio/Kconfig
 create mode 100644 sound/virtio/Makefile
 create mode 100644 sound/virtio/virtio_card.c
 create mode 100644 sound/virtio/virtio_card.h

(limited to 'include/uapi/linux')

diff --git a/MAINTAINERS b/MAINTAINERS
index d92f85ca831d..a68c543c28f1 100644
--- a/MAINTAINERS
+++ b/MAINTAINERS
@@ -19086,6 +19086,15 @@ W:	https://virtio-mem.gitlab.io/
 F:	drivers/virtio/virtio_mem.c
 F:	include/uapi/linux/virtio_mem.h
 
+VIRTIO SOUND DRIVER
+M:	Anton Yakovlev <anton.yakovlev@opensynergy.com>
+M:	"Michael S. Tsirkin" <mst@redhat.com>
+L:	virtualization@lists.linux-foundation.org
+L:	alsa-devel@alsa-project.org (moderated for non-subscribers)
+S:	Maintained
+F:	include/uapi/linux/virtio_snd.h
+F:	sound/virtio/*
+
 VIRTUAL BOX GUEST DEVICE DRIVER
 M:	Hans de Goede <hdegoede@redhat.com>
 M:	Arnd Bergmann <arnd@arndb.de>
diff --git a/include/uapi/linux/virtio_snd.h b/include/uapi/linux/virtio_snd.h
new file mode 100644
index 000000000000..dfe49547a7b0
--- /dev/null
+++ b/include/uapi/linux/virtio_snd.h
@@ -0,0 +1,334 @@
+/* SPDX-License-Identifier: BSD-3-Clause */
+/*
+ * Copyright (C) 2021 OpenSynergy GmbH
+ */
+#ifndef VIRTIO_SND_IF_H
+#define VIRTIO_SND_IF_H
+
+#include <linux/virtio_types.h>
+
+/*******************************************************************************
+ * CONFIGURATION SPACE
+ */
+struct virtio_snd_config {
+	/* # of available physical jacks */
+	__le32 jacks;
+	/* # of available PCM streams */
+	__le32 streams;
+	/* # of available channel maps */
+	__le32 chmaps;
+};
+
+enum {
+	/* device virtqueue indexes */
+	VIRTIO_SND_VQ_CONTROL = 0,
+	VIRTIO_SND_VQ_EVENT,
+	VIRTIO_SND_VQ_TX,
+	VIRTIO_SND_VQ_RX,
+	/* # of device virtqueues */
+	VIRTIO_SND_VQ_MAX
+};
+
+/*******************************************************************************
+ * COMMON DEFINITIONS
+ */
+
+/* supported dataflow directions */
+enum {
+	VIRTIO_SND_D_OUTPUT = 0,
+	VIRTIO_SND_D_INPUT
+};
+
+enum {
+	/* jack control request types */
+	VIRTIO_SND_R_JACK_INFO = 1,
+	VIRTIO_SND_R_JACK_REMAP,
+
+	/* PCM control request types */
+	VIRTIO_SND_R_PCM_INFO = 0x0100,
+	VIRTIO_SND_R_PCM_SET_PARAMS,
+	VIRTIO_SND_R_PCM_PREPARE,
+	VIRTIO_SND_R_PCM_RELEASE,
+	VIRTIO_SND_R_PCM_START,
+	VIRTIO_SND_R_PCM_STOP,
+
+	/* channel map control request types */
+	VIRTIO_SND_R_CHMAP_INFO = 0x0200,
+
+	/* jack event types */
+	VIRTIO_SND_EVT_JACK_CONNECTED = 0x1000,
+	VIRTIO_SND_EVT_JACK_DISCONNECTED,
+
+	/* PCM event types */
+	VIRTIO_SND_EVT_PCM_PERIOD_ELAPSED = 0x1100,
+	VIRTIO_SND_EVT_PCM_XRUN,
+
+	/* common status codes */
+	VIRTIO_SND_S_OK = 0x8000,
+	VIRTIO_SND_S_BAD_MSG,
+	VIRTIO_SND_S_NOT_SUPP,
+	VIRTIO_SND_S_IO_ERR
+};
+
+/* common header */
+struct virtio_snd_hdr {
+	__le32 code;
+};
+
+/* event notification */
+struct virtio_snd_event {
+	/* VIRTIO_SND_EVT_XXX */
+	struct virtio_snd_hdr hdr;
+	/* optional event data */
+	__le32 data;
+};
+
+/* common control request to query an item information */
+struct virtio_snd_query_info {
+	/* VIRTIO_SND_R_XXX_INFO */
+	struct virtio_snd_hdr hdr;
+	/* item start identifier */
+	__le32 start_id;
+	/* item count to query */
+	__le32 count;
+	/* item information size in bytes */
+	__le32 size;
+};
+
+/* common item information header */
+struct virtio_snd_info {
+	/* function group node id (High Definition Audio Specification 7.1.2) */
+	__le32 hda_fn_nid;
+};
+
+/*******************************************************************************
+ * JACK CONTROL MESSAGES
+ */
+struct virtio_snd_jack_hdr {
+	/* VIRTIO_SND_R_JACK_XXX */
+	struct virtio_snd_hdr hdr;
+	/* 0 ... virtio_snd_config::jacks - 1 */
+	__le32 jack_id;
+};
+
+/* supported jack features */
+enum {
+	VIRTIO_SND_JACK_F_REMAP = 0
+};
+
+struct virtio_snd_jack_info {
+	/* common header */
+	struct virtio_snd_info hdr;
+	/* supported feature bit map (1 << VIRTIO_SND_JACK_F_XXX) */
+	__le32 features;
+	/* pin configuration (High Definition Audio Specification 7.3.3.31) */
+	__le32 hda_reg_defconf;
+	/* pin capabilities (High Definition Audio Specification 7.3.4.9) */
+	__le32 hda_reg_caps;
+	/* current jack connection status (0: disconnected, 1: connected) */
+	__u8 connected;
+
+	__u8 padding[7];
+};
+
+/* jack remapping control request */
+struct virtio_snd_jack_remap {
+	/* .code = VIRTIO_SND_R_JACK_REMAP */
+	struct virtio_snd_jack_hdr hdr;
+	/* selected association number */
+	__le32 association;
+	/* selected sequence number */
+	__le32 sequence;
+};
+
+/*******************************************************************************
+ * PCM CONTROL MESSAGES
+ */
+struct virtio_snd_pcm_hdr {
+	/* VIRTIO_SND_R_PCM_XXX */
+	struct virtio_snd_hdr hdr;
+	/* 0 ... virtio_snd_config::streams - 1 */
+	__le32 stream_id;
+};
+
+/* supported PCM stream features */
+enum {
+	VIRTIO_SND_PCM_F_SHMEM_HOST = 0,
+	VIRTIO_SND_PCM_F_SHMEM_GUEST,
+	VIRTIO_SND_PCM_F_MSG_POLLING,
+	VIRTIO_SND_PCM_F_EVT_SHMEM_PERIODS,
+	VIRTIO_SND_PCM_F_EVT_XRUNS
+};
+
+/* supported PCM sample formats */
+enum {
+	/* analog formats (width / physical width) */
+	VIRTIO_SND_PCM_FMT_IMA_ADPCM = 0,	/*  4 /  4 bits */
+	VIRTIO_SND_PCM_FMT_MU_LAW,		/*  8 /  8 bits */
+	VIRTIO_SND_PCM_FMT_A_LAW,		/*  8 /  8 bits */
+	VIRTIO_SND_PCM_FMT_S8,			/*  8 /  8 bits */
+	VIRTIO_SND_PCM_FMT_U8,			/*  8 /  8 bits */
+	VIRTIO_SND_PCM_FMT_S16,			/* 16 / 16 bits */
+	VIRTIO_SND_PCM_FMT_U16,			/* 16 / 16 bits */
+	VIRTIO_SND_PCM_FMT_S18_3,		/* 18 / 24 bits */
+	VIRTIO_SND_PCM_FMT_U18_3,		/* 18 / 24 bits */
+	VIRTIO_SND_PCM_FMT_S20_3,		/* 20 / 24 bits */
+	VIRTIO_SND_PCM_FMT_U20_3,		/* 20 / 24 bits */
+	VIRTIO_SND_PCM_FMT_S24_3,		/* 24 / 24 bits */
+	VIRTIO_SND_PCM_FMT_U24_3,		/* 24 / 24 bits */
+	VIRTIO_SND_PCM_FMT_S20,			/* 20 / 32 bits */
+	VIRTIO_SND_PCM_FMT_U20,			/* 20 / 32 bits */
+	VIRTIO_SND_PCM_FMT_S24,			/* 24 / 32 bits */
+	VIRTIO_SND_PCM_FMT_U24,			/* 24 / 32 bits */
+	VIRTIO_SND_PCM_FMT_S32,			/* 32 / 32 bits */
+	VIRTIO_SND_PCM_FMT_U32,			/* 32 / 32 bits */
+	VIRTIO_SND_PCM_FMT_FLOAT,		/* 32 / 32 bits */
+	VIRTIO_SND_PCM_FMT_FLOAT64,		/* 64 / 64 bits */
+	/* digital formats (width / physical width) */
+	VIRTIO_SND_PCM_FMT_DSD_U8,		/*  8 /  8 bits */
+	VIRTIO_SND_PCM_FMT_DSD_U16,		/* 16 / 16 bits */
+	VIRTIO_SND_PCM_FMT_DSD_U32,		/* 32 / 32 bits */
+	VIRTIO_SND_PCM_FMT_IEC958_SUBFRAME	/* 32 / 32 bits */
+};
+
+/* supported PCM frame rates */
+enum {
+	VIRTIO_SND_PCM_RATE_5512 = 0,
+	VIRTIO_SND_PCM_RATE_8000,
+	VIRTIO_SND_PCM_RATE_11025,
+	VIRTIO_SND_PCM_RATE_16000,
+	VIRTIO_SND_PCM_RATE_22050,
+	VIRTIO_SND_PCM_RATE_32000,
+	VIRTIO_SND_PCM_RATE_44100,
+	VIRTIO_SND_PCM_RATE_48000,
+	VIRTIO_SND_PCM_RATE_64000,
+	VIRTIO_SND_PCM_RATE_88200,
+	VIRTIO_SND_PCM_RATE_96000,
+	VIRTIO_SND_PCM_RATE_176400,
+	VIRTIO_SND_PCM_RATE_192000,
+	VIRTIO_SND_PCM_RATE_384000
+};
+
+struct virtio_snd_pcm_info {
+	/* common header */
+	struct virtio_snd_info hdr;
+	/* supported feature bit map (1 << VIRTIO_SND_PCM_F_XXX) */
+	__le32 features;
+	/* supported sample format bit map (1 << VIRTIO_SND_PCM_FMT_XXX) */
+	__le64 formats;
+	/* supported frame rate bit map (1 << VIRTIO_SND_PCM_RATE_XXX) */
+	__le64 rates;
+	/* dataflow direction (VIRTIO_SND_D_XXX) */
+	__u8 direction;
+	/* minimum # of supported channels */
+	__u8 channels_min;
+	/* maximum # of supported channels */
+	__u8 channels_max;
+
+	__u8 padding[5];
+};
+
+/* set PCM stream format */
+struct virtio_snd_pcm_set_params {
+	/* .code = VIRTIO_SND_R_PCM_SET_PARAMS */
+	struct virtio_snd_pcm_hdr hdr;
+	/* size of the hardware buffer */
+	__le32 buffer_bytes;
+	/* size of the hardware period */
+	__le32 period_bytes;
+	/* selected feature bit map (1 << VIRTIO_SND_PCM_F_XXX) */
+	__le32 features;
+	/* selected # of channels */
+	__u8 channels;
+	/* selected sample format (VIRTIO_SND_PCM_FMT_XXX) */
+	__u8 format;
+	/* selected frame rate (VIRTIO_SND_PCM_RATE_XXX) */
+	__u8 rate;
+
+	__u8 padding;
+};
+
+/*******************************************************************************
+ * PCM I/O MESSAGES
+ */
+
+/* I/O request header */
+struct virtio_snd_pcm_xfer {
+	/* 0 ... virtio_snd_config::streams - 1 */
+	__le32 stream_id;
+};
+
+/* I/O request status */
+struct virtio_snd_pcm_status {
+	/* VIRTIO_SND_S_XXX */
+	__le32 status;
+	/* current device latency */
+	__le32 latency_bytes;
+};
+
+/*******************************************************************************
+ * CHANNEL MAP CONTROL MESSAGES
+ */
+struct virtio_snd_chmap_hdr {
+	/* VIRTIO_SND_R_CHMAP_XXX */
+	struct virtio_snd_hdr hdr;
+	/* 0 ... virtio_snd_config::chmaps - 1 */
+	__le32 chmap_id;
+};
+
+/* standard channel position definition */
+enum {
+	VIRTIO_SND_CHMAP_NONE = 0,	/* undefined */
+	VIRTIO_SND_CHMAP_NA,		/* silent */
+	VIRTIO_SND_CHMAP_MONO,		/* mono stream */
+	VIRTIO_SND_CHMAP_FL,		/* front left */
+	VIRTIO_SND_CHMAP_FR,		/* front right */
+	VIRTIO_SND_CHMAP_RL,		/* rear left */
+	VIRTIO_SND_CHMAP_RR,		/* rear right */
+	VIRTIO_SND_CHMAP_FC,		/* front center */
+	VIRTIO_SND_CHMAP_LFE,		/* low frequency (LFE) */
+	VIRTIO_SND_CHMAP_SL,		/* side left */
+	VIRTIO_SND_CHMAP_SR,		/* side right */
+	VIRTIO_SND_CHMAP_RC,		/* rear center */
+	VIRTIO_SND_CHMAP_FLC,		/* front left center */
+	VIRTIO_SND_CHMAP_FRC,		/* front right center */
+	VIRTIO_SND_CHMAP_RLC,		/* rear left center */
+	VIRTIO_SND_CHMAP_RRC,		/* rear right center */
+	VIRTIO_SND_CHMAP_FLW,		/* front left wide */
+	VIRTIO_SND_CHMAP_FRW,		/* front right wide */
+	VIRTIO_SND_CHMAP_FLH,		/* front left high */
+	VIRTIO_SND_CHMAP_FCH,		/* front center high */
+	VIRTIO_SND_CHMAP_FRH,		/* front right high */
+	VIRTIO_SND_CHMAP_TC,		/* top center */
+	VIRTIO_SND_CHMAP_TFL,		/* top front left */
+	VIRTIO_SND_CHMAP_TFR,		/* top front right */
+	VIRTIO_SND_CHMAP_TFC,		/* top front center */
+	VIRTIO_SND_CHMAP_TRL,		/* top rear left */
+	VIRTIO_SND_CHMAP_TRR,		/* top rear right */
+	VIRTIO_SND_CHMAP_TRC,		/* top rear center */
+	VIRTIO_SND_CHMAP_TFLC,		/* top front left center */
+	VIRTIO_SND_CHMAP_TFRC,		/* top front right center */
+	VIRTIO_SND_CHMAP_TSL,		/* top side left */
+	VIRTIO_SND_CHMAP_TSR,		/* top side right */
+	VIRTIO_SND_CHMAP_LLFE,		/* left LFE */
+	VIRTIO_SND_CHMAP_RLFE,		/* right LFE */
+	VIRTIO_SND_CHMAP_BC,		/* bottom center */
+	VIRTIO_SND_CHMAP_BLC,		/* bottom left center */
+	VIRTIO_SND_CHMAP_BRC		/* bottom right center */
+};
+
+/* maximum possible number of channels */
+#define VIRTIO_SND_CHMAP_MAX_SIZE	18
+
+struct virtio_snd_chmap_info {
+	/* common header */
+	struct virtio_snd_info hdr;
+	/* dataflow direction (VIRTIO_SND_D_XXX) */
+	__u8 direction;
+	/* # of valid channel position values */
+	__u8 channels;
+	/* channel position values (VIRTIO_SND_CHMAP_XXX) */
+	__u8 positions[VIRTIO_SND_CHMAP_MAX_SIZE];
+};
+
+#endif /* VIRTIO_SND_IF_H */
diff --git a/sound/Kconfig b/sound/Kconfig
index 36785410fbe1..e56d96d2b11c 100644
--- a/sound/Kconfig
+++ b/sound/Kconfig
@@ -99,6 +99,8 @@ source "sound/synth/Kconfig"
 
 source "sound/xen/Kconfig"
 
+source "sound/virtio/Kconfig"
+
 endif # SND
 
 endif # !UML
diff --git a/sound/Makefile b/sound/Makefile
index 797ecdcd35e2..04ef04b1168f 100644
--- a/sound/Makefile
+++ b/sound/Makefile
@@ -5,7 +5,8 @@
 obj-$(CONFIG_SOUND) += soundcore.o
 obj-$(CONFIG_DMASOUND) += oss/dmasound/
 obj-$(CONFIG_SND) += core/ i2c/ drivers/ isa/ pci/ ppc/ arm/ sh/ synth/ usb/ \
-	firewire/ sparc/ spi/ parisc/ pcmcia/ mips/ soc/ atmel/ hda/ x86/ xen/
+	firewire/ sparc/ spi/ parisc/ pcmcia/ mips/ soc/ atmel/ hda/ x86/ xen/ \
+	virtio/
 obj-$(CONFIG_SND_AOA) += aoa/
 
 # This one must be compilable even if sound is configured out
diff --git a/sound/virtio/Kconfig b/sound/virtio/Kconfig
new file mode 100644
index 000000000000..094cba24ee5b
--- /dev/null
+++ b/sound/virtio/Kconfig
@@ -0,0 +1,10 @@
+# SPDX-License-Identifier: GPL-2.0+
+# Sound card driver for virtio
+
+config SND_VIRTIO
+	tristate "Virtio sound driver"
+	depends on VIRTIO
+	select SND_PCM
+	select SND_JACK
+	help
+          This is the virtual sound driver for virtio. Say Y or M.
diff --git a/sound/virtio/Makefile b/sound/virtio/Makefile
new file mode 100644
index 000000000000..8c87ebb9982b
--- /dev/null
+++ b/sound/virtio/Makefile
@@ -0,0 +1,7 @@
+# SPDX-License-Identifier: GPL-2.0+
+
+obj-$(CONFIG_SND_VIRTIO) += virtio_snd.o
+
+virtio_snd-objs := \
+	virtio_card.o
+
diff --git a/sound/virtio/virtio_card.c b/sound/virtio/virtio_card.c
new file mode 100644
index 000000000000..5a37056858e9
--- /dev/null
+++ b/sound/virtio/virtio_card.c
@@ -0,0 +1,324 @@
+// SPDX-License-Identifier: GPL-2.0+
+/*
+ * virtio-snd: Virtio sound device
+ * Copyright (C) 2021 OpenSynergy GmbH
+ */
+#include <linux/module.h>
+#include <linux/moduleparam.h>
+#include <linux/virtio_config.h>
+#include <sound/initval.h>
+#include <uapi/linux/virtio_ids.h>
+
+#include "virtio_card.h"
+
+static void virtsnd_remove(struct virtio_device *vdev);
+
+/**
+ * virtsnd_event_send() - Add an event to the event queue.
+ * @vqueue: Underlying event virtqueue.
+ * @event: Event.
+ * @notify: Indicates whether or not to send a notification to the device.
+ * @gfp: Kernel flags for memory allocation.
+ *
+ * Context: Any context.
+ */
+static void virtsnd_event_send(struct virtqueue *vqueue,
+			       struct virtio_snd_event *event, bool notify,
+			       gfp_t gfp)
+{
+	struct scatterlist sg;
+	struct scatterlist *psgs[1] = { &sg };
+
+	/* reset event content */
+	memset(event, 0, sizeof(*event));
+
+	sg_init_one(&sg, event, sizeof(*event));
+
+	if (virtqueue_add_sgs(vqueue, psgs, 0, 1, event, gfp) || !notify)
+		return;
+
+	if (virtqueue_kick_prepare(vqueue))
+		virtqueue_notify(vqueue);
+}
+
+/**
+ * virtsnd_event_dispatch() - Dispatch an event from the device side.
+ * @snd: VirtIO sound device.
+ * @event: VirtIO sound event.
+ *
+ * Context: Any context.
+ */
+static void virtsnd_event_dispatch(struct virtio_snd *snd,
+				   struct virtio_snd_event *event)
+{
+}
+
+/**
+ * virtsnd_event_notify_cb() - Dispatch all reported events from the event queue.
+ * @vqueue: Underlying event virtqueue.
+ *
+ * This callback function is called upon a vring interrupt request from the
+ * device.
+ *
+ * Context: Interrupt context.
+ */
+static void virtsnd_event_notify_cb(struct virtqueue *vqueue)
+{
+	struct virtio_snd *snd = vqueue->vdev->priv;
+	struct virtio_snd_queue *queue = virtsnd_event_queue(snd);
+	struct virtio_snd_event *event;
+	u32 length;
+	unsigned long flags;
+
+	spin_lock_irqsave(&queue->lock, flags);
+	do {
+		virtqueue_disable_cb(vqueue);
+		while ((event = virtqueue_get_buf(vqueue, &length))) {
+			virtsnd_event_dispatch(snd, event);
+			virtsnd_event_send(vqueue, event, true, GFP_ATOMIC);
+		}
+		if (unlikely(virtqueue_is_broken(vqueue)))
+			break;
+	} while (!virtqueue_enable_cb(vqueue));
+	spin_unlock_irqrestore(&queue->lock, flags);
+}
+
+/**
+ * virtsnd_find_vqs() - Enumerate and initialize all virtqueues.
+ * @snd: VirtIO sound device.
+ *
+ * After calling this function, the event queue is disabled.
+ *
+ * Context: Any context.
+ * Return: 0 on success, -errno on failure.
+ */
+static int virtsnd_find_vqs(struct virtio_snd *snd)
+{
+	struct virtio_device *vdev = snd->vdev;
+	static vq_callback_t *callbacks[VIRTIO_SND_VQ_MAX] = {
+		[VIRTIO_SND_VQ_EVENT] = virtsnd_event_notify_cb
+	};
+	static const char *names[VIRTIO_SND_VQ_MAX] = {
+		[VIRTIO_SND_VQ_EVENT] = "virtsnd-event"
+	};
+	struct virtqueue *vqs[VIRTIO_SND_VQ_MAX] = { 0 };
+	unsigned int i;
+	unsigned int n;
+	int rc;
+
+	rc = virtio_find_vqs(vdev, VIRTIO_SND_VQ_MAX, vqs, callbacks, names,
+			     NULL);
+	if (rc) {
+		dev_err(&vdev->dev, "failed to initialize virtqueues\n");
+		return rc;
+	}
+
+	for (i = 0; i < VIRTIO_SND_VQ_MAX; ++i)
+		snd->queues[i].vqueue = vqs[i];
+
+	/* Allocate events and populate the event queue */
+	virtqueue_disable_cb(vqs[VIRTIO_SND_VQ_EVENT]);
+
+	n = virtqueue_get_vring_size(vqs[VIRTIO_SND_VQ_EVENT]);
+
+	snd->event_msgs = kmalloc_array(n, sizeof(*snd->event_msgs),
+					GFP_KERNEL);
+	if (!snd->event_msgs)
+		return -ENOMEM;
+
+	for (i = 0; i < n; ++i)
+		virtsnd_event_send(vqs[VIRTIO_SND_VQ_EVENT],
+				   &snd->event_msgs[i], false, GFP_KERNEL);
+
+	return 0;
+}
+
+/**
+ * virtsnd_enable_event_vq() - Enable the event virtqueue.
+ * @snd: VirtIO sound device.
+ *
+ * Context: Any context.
+ */
+static void virtsnd_enable_event_vq(struct virtio_snd *snd)
+{
+	struct virtio_snd_queue *queue = virtsnd_event_queue(snd);
+
+	if (!virtqueue_enable_cb(queue->vqueue))
+		virtsnd_event_notify_cb(queue->vqueue);
+}
+
+/**
+ * virtsnd_disable_event_vq() - Disable the event virtqueue.
+ * @snd: VirtIO sound device.
+ *
+ * Context: Any context.
+ */
+static void virtsnd_disable_event_vq(struct virtio_snd *snd)
+{
+	struct virtio_snd_queue *queue = virtsnd_event_queue(snd);
+	struct virtio_snd_event *event;
+	u32 length;
+	unsigned long flags;
+
+	if (queue->vqueue) {
+		spin_lock_irqsave(&queue->lock, flags);
+		virtqueue_disable_cb(queue->vqueue);
+		while ((event = virtqueue_get_buf(queue->vqueue, &length)))
+			virtsnd_event_dispatch(snd, event);
+		spin_unlock_irqrestore(&queue->lock, flags);
+	}
+}
+
+/**
+ * virtsnd_build_devs() - Read configuration and build ALSA devices.
+ * @snd: VirtIO sound device.
+ *
+ * Context: Any context that permits to sleep.
+ * Return: 0 on success, -errno on failure.
+ */
+static int virtsnd_build_devs(struct virtio_snd *snd)
+{
+	struct virtio_device *vdev = snd->vdev;
+	struct device *dev = &vdev->dev;
+	int rc;
+
+	rc = snd_card_new(dev, SNDRV_DEFAULT_IDX1, SNDRV_DEFAULT_STR1,
+			  THIS_MODULE, 0, &snd->card);
+	if (rc < 0)
+		return rc;
+
+	snd->card->private_data = snd;
+
+	strscpy(snd->card->driver, VIRTIO_SND_CARD_DRIVER,
+		sizeof(snd->card->driver));
+	strscpy(snd->card->shortname, VIRTIO_SND_CARD_NAME,
+		sizeof(snd->card->shortname));
+	if (dev->parent->bus)
+		snprintf(snd->card->longname, sizeof(snd->card->longname),
+			 VIRTIO_SND_CARD_NAME " at %s/%s/%s",
+			 dev->parent->bus->name, dev_name(dev->parent),
+			 dev_name(dev));
+	else
+		snprintf(snd->card->longname, sizeof(snd->card->longname),
+			 VIRTIO_SND_CARD_NAME " at %s/%s",
+			 dev_name(dev->parent), dev_name(dev));
+
+	return snd_card_register(snd->card);
+}
+
+/**
+ * virtsnd_validate() - Validate if the device can be started.
+ * @vdev: VirtIO parent device.
+ *
+ * Context: Any context.
+ * Return: 0 on success, -EINVAL on failure.
+ */
+static int virtsnd_validate(struct virtio_device *vdev)
+{
+	if (!vdev->config->get) {
+		dev_err(&vdev->dev, "configuration access disabled\n");
+		return -EINVAL;
+	}
+
+	if (!virtio_has_feature(vdev, VIRTIO_F_VERSION_1)) {
+		dev_err(&vdev->dev,
+			"device does not comply with spec version 1.x\n");
+		return -EINVAL;
+	}
+
+	return 0;
+}
+
+/**
+ * virtsnd_probe() - Create and initialize the device.
+ * @vdev: VirtIO parent device.
+ *
+ * Context: Any context that permits to sleep.
+ * Return: 0 on success, -errno on failure.
+ */
+static int virtsnd_probe(struct virtio_device *vdev)
+{
+	struct virtio_snd *snd;
+	unsigned int i;
+	int rc;
+
+	snd = devm_kzalloc(&vdev->dev, sizeof(*snd), GFP_KERNEL);
+	if (!snd)
+		return -ENOMEM;
+
+	snd->vdev = vdev;
+
+	vdev->priv = snd;
+
+	for (i = 0; i < VIRTIO_SND_VQ_MAX; ++i)
+		spin_lock_init(&snd->queues[i].lock);
+
+	rc = virtsnd_find_vqs(snd);
+	if (rc)
+		goto on_exit;
+
+	virtio_device_ready(vdev);
+
+	rc = virtsnd_build_devs(snd);
+	if (rc)
+		goto on_exit;
+
+	virtsnd_enable_event_vq(snd);
+
+on_exit:
+	if (rc)
+		virtsnd_remove(vdev);
+
+	return rc;
+}
+
+/**
+ * virtsnd_remove() - Remove VirtIO and ALSA devices.
+ * @vdev: VirtIO parent device.
+ *
+ * Context: Any context that permits to sleep.
+ */
+static void virtsnd_remove(struct virtio_device *vdev)
+{
+	struct virtio_snd *snd = vdev->priv;
+
+	virtsnd_disable_event_vq(snd);
+
+	if (snd->card)
+		snd_card_free(snd->card);
+
+	vdev->config->del_vqs(vdev);
+	vdev->config->reset(vdev);
+
+	kfree(snd->event_msgs);
+}
+
+static const struct virtio_device_id id_table[] = {
+	{ VIRTIO_ID_SOUND, VIRTIO_DEV_ANY_ID },
+	{ 0 },
+};
+
+static struct virtio_driver virtsnd_driver = {
+	.driver.name = KBUILD_MODNAME,
+	.driver.owner = THIS_MODULE,
+	.id_table = id_table,
+	.validate = virtsnd_validate,
+	.probe = virtsnd_probe,
+	.remove = virtsnd_remove,
+};
+
+static int __init init(void)
+{
+	return register_virtio_driver(&virtsnd_driver);
+}
+module_init(init);
+
+static void __exit fini(void)
+{
+	unregister_virtio_driver(&virtsnd_driver);
+}
+module_exit(fini);
+
+MODULE_DEVICE_TABLE(virtio, id_table);
+MODULE_DESCRIPTION("Virtio sound card driver");
+MODULE_LICENSE("GPL");
diff --git a/sound/virtio/virtio_card.h b/sound/virtio/virtio_card.h
new file mode 100644
index 000000000000..b903b1b12e90
--- /dev/null
+++ b/sound/virtio/virtio_card.h
@@ -0,0 +1,65 @@
+/* SPDX-License-Identifier: GPL-2.0+ */
+/*
+ * virtio-snd: Virtio sound device
+ * Copyright (C) 2021 OpenSynergy GmbH
+ */
+#ifndef VIRTIO_SND_CARD_H
+#define VIRTIO_SND_CARD_H
+
+#include <linux/slab.h>
+#include <linux/virtio.h>
+#include <sound/core.h>
+#include <uapi/linux/virtio_snd.h>
+
+#define VIRTIO_SND_CARD_DRIVER	"virtio-snd"
+#define VIRTIO_SND_CARD_NAME	"VirtIO SoundCard"
+
+/**
+ * struct virtio_snd_queue - Virtqueue wrapper structure.
+ * @lock: Used to synchronize access to a virtqueue.
+ * @vqueue: Underlying virtqueue.
+ */
+struct virtio_snd_queue {
+	spinlock_t lock;
+	struct virtqueue *vqueue;
+};
+
+/**
+ * struct virtio_snd - VirtIO sound card device.
+ * @vdev: Underlying virtio device.
+ * @queues: Virtqueue wrappers.
+ * @card: ALSA sound card.
+ * @event_msgs: Device events.
+ */
+struct virtio_snd {
+	struct virtio_device *vdev;
+	struct virtio_snd_queue queues[VIRTIO_SND_VQ_MAX];
+	struct snd_card *card;
+	struct virtio_snd_event *event_msgs;
+};
+
+static inline struct virtio_snd_queue *
+virtsnd_control_queue(struct virtio_snd *snd)
+{
+	return &snd->queues[VIRTIO_SND_VQ_CONTROL];
+}
+
+static inline struct virtio_snd_queue *
+virtsnd_event_queue(struct virtio_snd *snd)
+{
+	return &snd->queues[VIRTIO_SND_VQ_EVENT];
+}
+
+static inline struct virtio_snd_queue *
+virtsnd_tx_queue(struct virtio_snd *snd)
+{
+	return &snd->queues[VIRTIO_SND_VQ_TX];
+}
+
+static inline struct virtio_snd_queue *
+virtsnd_rx_queue(struct virtio_snd *snd)
+{
+	return &snd->queues[VIRTIO_SND_VQ_RX];
+}
+
+#endif /* VIRTIO_SND_CARD_H */
-- 
cgit v1.2.3-71-gd317


From e5e35e754c28724d5c619f2ec805fd221f8d59ce Mon Sep 17 00:00:00 2001
From: Jesper Dangaard Brouer <brouer@redhat.com>
Date: Mon, 8 Mar 2021 15:59:18 +0100
Subject: bpf: BPF-helper for MTU checking add length input

The FIB lookup example[1] show how the IP-header field tot_len
(iph->tot_len) is used as input to perform the MTU check.

This patch extend the BPF-helper bpf_check_mtu() with the same ability
to provide the length as user parameter input, via mtu_len parameter.

This still needs to be done before the bpf_check_mtu() helper API
becomes frozen.

  [1] samples/bpf/xdp_fwd_kern.c

Fixes: 34b2021cc616 ("bpf: Add BPF-helper for MTU checking")
Signed-off-by: Jesper Dangaard Brouer <brouer@redhat.com>
Signed-off-by: Daniel Borkmann <daniel@iogearbox.net>
Acked-by: John Fastabend <john.fastabend@gmail.com>
Link: https://lore.kernel.org/bpf/161521555850.3515614.6533850861569774444.stgit@firesoul
---
 include/uapi/linux/bpf.h | 16 +++++++++++-----
 net/core/filter.c        | 12 ++++++++++--
 2 files changed, 21 insertions(+), 7 deletions(-)

(limited to 'include/uapi/linux')

diff --git a/include/uapi/linux/bpf.h b/include/uapi/linux/bpf.h
index 79c893310492..4ba4ef0ff63a 100644
--- a/include/uapi/linux/bpf.h
+++ b/include/uapi/linux/bpf.h
@@ -3850,7 +3850,7 @@ union bpf_attr {
  *
  * long bpf_check_mtu(void *ctx, u32 ifindex, u32 *mtu_len, s32 len_diff, u64 flags)
  *	Description
- *		Check ctx packet size against exceeding MTU of net device (based
+ *		Check packet size against exceeding MTU of net device (based
  *		on *ifindex*).  This helper will likely be used in combination
  *		with helpers that adjust/change the packet size.
  *
@@ -3867,6 +3867,14 @@ union bpf_attr {
  *		against the current net device.  This is practical if this isn't
  *		used prior to redirect.
  *
+ *		On input *mtu_len* must be a valid pointer, else verifier will
+ *		reject BPF program.  If the value *mtu_len* is initialized to
+ *		zero then the ctx packet size is use.  When value *mtu_len* is
+ *		provided as input this specify the L3 length that the MTU check
+ *		is done against. Remember XDP and TC length operate at L2, but
+ *		this value is L3 as this correlate to MTU and IP-header tot_len
+ *		values which are L3 (similar behavior as bpf_fib_lookup).
+ *
  *		The Linux kernel route table can configure MTUs on a more
  *		specific per route level, which is not provided by this helper.
  *		For route level MTU checks use the **bpf_fib_lookup**\ ()
@@ -3891,11 +3899,9 @@ union bpf_attr {
  *
  *		On return *mtu_len* pointer contains the MTU value of the net
  *		device.  Remember the net device configured MTU is the L3 size,
- *		which is returned here and XDP and TX length operate at L2.
+ *		which is returned here and XDP and TC length operate at L2.
  *		Helper take this into account for you, but remember when using
- *		MTU value in your BPF-code.  On input *mtu_len* must be a valid
- *		pointer and be initialized (to zero), else verifier will reject
- *		BPF program.
+ *		MTU value in your BPF-code.
  *
  *	Return
  *		* 0 on success, and populate MTU value in *mtu_len* pointer.
diff --git a/net/core/filter.c b/net/core/filter.c
index adfdad234674..9323d34d34cc 100644
--- a/net/core/filter.c
+++ b/net/core/filter.c
@@ -5658,7 +5658,7 @@ BPF_CALL_5(bpf_skb_check_mtu, struct sk_buff *, skb,
 	if (unlikely(flags & ~(BPF_MTU_CHK_SEGS)))
 		return -EINVAL;
 
-	if (unlikely(flags & BPF_MTU_CHK_SEGS && len_diff))
+	if (unlikely(flags & BPF_MTU_CHK_SEGS && (len_diff || *mtu_len)))
 		return -EINVAL;
 
 	dev = __dev_via_ifindex(dev, ifindex);
@@ -5668,7 +5668,11 @@ BPF_CALL_5(bpf_skb_check_mtu, struct sk_buff *, skb,
 	mtu = READ_ONCE(dev->mtu);
 
 	dev_len = mtu + dev->hard_header_len;
-	skb_len = skb->len + len_diff; /* minus result pass check */
+
+	/* If set use *mtu_len as input, L3 as iph->tot_len (like fib_lookup) */
+	skb_len = *mtu_len ? *mtu_len + dev->hard_header_len : skb->len;
+
+	skb_len += len_diff; /* minus result pass check */
 	if (skb_len <= dev_len) {
 		ret = BPF_MTU_CHK_RET_SUCCESS;
 		goto out;
@@ -5713,6 +5717,10 @@ BPF_CALL_5(bpf_xdp_check_mtu, struct xdp_buff *, xdp,
 	/* Add L2-header as dev MTU is L3 size */
 	dev_len = mtu + dev->hard_header_len;
 
+	/* Use *mtu_len as input, L3 as iph->tot_len (like fib_lookup) */
+	if (*mtu_len)
+		xdp_len = *mtu_len + dev->hard_header_len;
+
 	xdp_len += len_diff; /* minus result pass check */
 	if (xdp_len > dev_len)
 		ret = BPF_MTU_CHK_RET_FRAG_NEEDED;
-- 
cgit v1.2.3-71-gd317


From f76edd8f7ce06cdff2fe5b6b39a49644c684a161 Mon Sep 17 00:00:00 2001
From: Jiri Slaby <jslaby@suse.cz>
Date: Tue, 2 Mar 2021 07:21:35 +0100
Subject: tty: cyclades, remove this orphan

The Cyclades driver was orphaned by commit d459883e6c54 (MAINTAINERS:
remove two dead e-mail) 13 years ago. Noone stepped up to take care of
them and to fix all the issues the driver has.

On the top of that, there is no way to obtain the firmware for Z cards
from the vendor as cyclades.com ceased to exist.

So it's time to drop the driver with all its traces.

Signed-off-by: Jiri Slaby <jslaby@suse.cz>
Link: https://lore.kernel.org/r/20210302062214.29627-5-jslaby@suse.cz
Signed-off-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
---
 Documentation/admin-guide/devices.txt              |   10 -
 Documentation/driver-api/serial/cyclades_z.rst     |   11 -
 Documentation/driver-api/serial/index.rst          |    1 -
 Documentation/process/magic-number.rst             |    1 -
 .../translations/it_IT/process/magic-number.rst    |    1 -
 .../translations/zh_CN/process/magic-number.rst    |    1 -
 Documentation/userspace-api/ioctl/ioctl-number.rst |    1 -
 MAINTAINERS                                        |    7 -
 arch/powerpc/configs/ppc6xx_defconfig              |    1 -
 drivers/tty/Kconfig                                |   31 +-
 drivers/tty/Makefile                               |    1 -
 drivers/tty/cyclades.c                             | 4119 --------------------
 drivers/tty/serial/8250/Kconfig                    |    5 +-
 include/linux/cyclades.h                           |  364 --
 include/linux/pci_ids.h                            |    8 -
 include/uapi/linux/cyclades.h                      |  494 ---
 include/uapi/linux/major.h                         |    2 -
 include/uapi/linux/serial.h                        |    4 +-
 18 files changed, 5 insertions(+), 5057 deletions(-)
 delete mode 100644 Documentation/driver-api/serial/cyclades_z.rst
 delete mode 100644 drivers/tty/cyclades.c
 delete mode 100644 include/linux/cyclades.h
 delete mode 100644 include/uapi/linux/cyclades.h

(limited to 'include/uapi/linux')

diff --git a/Documentation/admin-guide/devices.txt b/Documentation/admin-guide/devices.txt
index 63fd4e6a014b..b5bd9d46e031 100644
--- a/Documentation/admin-guide/devices.txt
+++ b/Documentation/admin-guide/devices.txt
@@ -477,11 +477,6 @@
   18 block	Sanyo CD-ROM
 		  0 = /dev/sjcd		Sanyo CD-ROM
 
-  19 char	Cyclades serial card
-		  0 = /dev/ttyC0	First Cyclades port
-		    ...
-		 31 = /dev/ttyC31	32nd Cyclades port
-
   19 block	"Double" compressed disk
 		  0 = /dev/double0	First compressed disk
 		    ...
@@ -493,11 +488,6 @@
 		See the Double documentation for the meaning of the
 		mirror devices.
 
-  20 char	Cyclades serial card - alternate devices
-		  0 = /dev/cub0		Callout device for ttyC0
-		    ...
-		 31 = /dev/cub31	Callout device for ttyC31
-
   20 block	Hitachi CD-ROM (under development)
 		  0 = /dev/hitcd	Hitachi CD-ROM
 
diff --git a/Documentation/driver-api/serial/cyclades_z.rst b/Documentation/driver-api/serial/cyclades_z.rst
deleted file mode 100644
index 532ff67e2f1c..000000000000
--- a/Documentation/driver-api/serial/cyclades_z.rst
+++ /dev/null
@@ -1,11 +0,0 @@
-================
-Cyclades-Z notes
-================
-
-The Cyclades-Z must have firmware loaded onto the card before it will
-operate.  This operation should be performed during system startup,
-
-The firmware, loader program and the latest device driver code are
-available from Cyclades at
-
-    ftp://ftp.cyclades.com/pub/cyclades/cyclades-z/linux/
diff --git a/Documentation/driver-api/serial/index.rst b/Documentation/driver-api/serial/index.rst
index 33ad10d05b26..21351b8c95a4 100644
--- a/Documentation/driver-api/serial/index.rst
+++ b/Documentation/driver-api/serial/index.rst
@@ -17,7 +17,6 @@ Serial drivers
 .. toctree::
     :maxdepth: 1
 
-    cyclades_z
     moxa-smartio
     n_gsm
     rocket
diff --git a/Documentation/process/magic-number.rst b/Documentation/process/magic-number.rst
index fa5a62f4150c..d4a30c09bd03 100644
--- a/Documentation/process/magic-number.rst
+++ b/Documentation/process/magic-number.rst
@@ -73,7 +73,6 @@ CMAGIC                0x0111           user                     ``include/linux/
 MKISS_DRIVER_MAGIC    0x04bf           mkiss_channel            ``drivers/net/mkiss.h``
 HDLC_MAGIC            0x239e           n_hdlc                   ``drivers/char/n_hdlc.c``
 APM_BIOS_MAGIC        0x4101           apm_user                 ``arch/x86/kernel/apm_32.c``
-CYCLADES_MAGIC        0x4359           cyclades_port            ``include/linux/cyclades.h``
 DB_MAGIC              0x4442           fc_info                  ``drivers/net/iph5526_novram.c``
 DL_MAGIC              0x444d           fc_info                  ``drivers/net/iph5526_novram.c``
 FASYNC_MAGIC          0x4601           fasync_struct            ``include/linux/fs.h``
diff --git a/Documentation/translations/it_IT/process/magic-number.rst b/Documentation/translations/it_IT/process/magic-number.rst
index 1af30f4228f2..0df2e7e32cd8 100644
--- a/Documentation/translations/it_IT/process/magic-number.rst
+++ b/Documentation/translations/it_IT/process/magic-number.rst
@@ -79,7 +79,6 @@ CMAGIC                0x0111           user                     ``include/linux/
 MKISS_DRIVER_MAGIC    0x04bf           mkiss_channel            ``drivers/net/mkiss.h``
 HDLC_MAGIC            0x239e           n_hdlc                   ``drivers/char/n_hdlc.c``
 APM_BIOS_MAGIC        0x4101           apm_user                 ``arch/x86/kernel/apm_32.c``
-CYCLADES_MAGIC        0x4359           cyclades_port            ``include/linux/cyclades.h``
 DB_MAGIC              0x4442           fc_info                  ``drivers/net/iph5526_novram.c``
 DL_MAGIC              0x444d           fc_info                  ``drivers/net/iph5526_novram.c``
 FASYNC_MAGIC          0x4601           fasync_struct            ``include/linux/fs.h``
diff --git a/Documentation/translations/zh_CN/process/magic-number.rst b/Documentation/translations/zh_CN/process/magic-number.rst
index 7bb9d4165ed3..82d62f6a4406 100644
--- a/Documentation/translations/zh_CN/process/magic-number.rst
+++ b/Documentation/translations/zh_CN/process/magic-number.rst
@@ -62,7 +62,6 @@ CMAGIC                0x0111           user                     ``include/linux/
 MKISS_DRIVER_MAGIC    0x04bf           mkiss_channel            ``drivers/net/mkiss.h``
 HDLC_MAGIC            0x239e           n_hdlc                   ``drivers/char/n_hdlc.c``
 APM_BIOS_MAGIC        0x4101           apm_user                 ``arch/x86/kernel/apm_32.c``
-CYCLADES_MAGIC        0x4359           cyclades_port            ``include/linux/cyclades.h``
 DB_MAGIC              0x4442           fc_info                  ``drivers/net/iph5526_novram.c``
 DL_MAGIC              0x444d           fc_info                  ``drivers/net/iph5526_novram.c``
 FASYNC_MAGIC          0x4601           fasync_struct            ``include/linux/fs.h``
diff --git a/Documentation/userspace-api/ioctl/ioctl-number.rst b/Documentation/userspace-api/ioctl/ioctl-number.rst
index 599bd4493944..0a7b408c0ec7 100644
--- a/Documentation/userspace-api/ioctl/ioctl-number.rst
+++ b/Documentation/userspace-api/ioctl/ioctl-number.rst
@@ -209,7 +209,6 @@ Code  Seq#    Include File                                           Comments
              linux/fs.h,
 'X'   all    fs/ocfs2/ocfs_fs.h                                      conflict!
 'X'   01     linux/pktcdvd.h                                         conflict!
-'Y'   all    linux/cyclades.h
 'Z'   14-15  drivers/message/fusion/mptctl.h
 '['   00-3F  linux/usb/tmc.h                                         USB Test and Measurement Devices
                                                                      <mailto:gregkh@linuxfoundation.org>
diff --git a/MAINTAINERS b/MAINTAINERS
index 363530db37ac..29f20a97d73d 100644
--- a/MAINTAINERS
+++ b/MAINTAINERS
@@ -4876,13 +4876,6 @@ S:	Maintained
 W:	http://www.armlinux.org.uk/
 F:	drivers/video/fbdev/cyber2000fb.*
 
-CYCLADES ASYNC MUX DRIVER
-S:	Orphan
-W:	http://www.cyclades.com/
-F:	drivers/tty/cyclades.c
-F:	include/linux/cyclades.h
-F:	include/uapi/linux/cyclades.h
-
 CYCLADES PC300 DRIVER
 S:	Orphan
 F:	drivers/net/wan/pc300*
diff --git a/arch/powerpc/configs/ppc6xx_defconfig b/arch/powerpc/configs/ppc6xx_defconfig
index 6677ac0da45a..1fd9d1260f9e 100644
--- a/arch/powerpc/configs/ppc6xx_defconfig
+++ b/arch/powerpc/configs/ppc6xx_defconfig
@@ -595,7 +595,6 @@ CONFIG_GAMEPORT_FM801=m
 # CONFIG_LEGACY_PTYS is not set
 CONFIG_SERIAL_NONSTANDARD=y
 CONFIG_ROCKETPORT=m
-CONFIG_CYCLADES=m
 CONFIG_SYNCLINK_GT=m
 CONFIG_NOZOMI=m
 CONFIG_N_HDLC=m
diff --git a/drivers/tty/Kconfig b/drivers/tty/Kconfig
index e15cd6b5bb99..397523a8095e 100644
--- a/drivers/tty/Kconfig
+++ b/drivers/tty/Kconfig
@@ -181,7 +181,7 @@ config SERIAL_NONSTANDARD
 	help
 	  Say Y here if you have any non-standard serial boards -- boards
 	  which aren't supported using the standard "dumb" serial driver.
-	  This includes intelligent serial boards such as Cyclades,
+	  This includes intelligent serial boards such as
 	  Digiboards, etc. These are usually used for systems that need many
 	  serial ports because they serve many terminals or dial-in
 	  connections.
@@ -207,35 +207,6 @@ config ROCKETPORT
 	  If you want to compile this driver into the kernel, say Y here.  If
 	  you don't have a Comtrol RocketPort/RocketModem card installed, say N.
 
-config CYCLADES
-	tristate "Cyclades async mux support"
-	depends on SERIAL_NONSTANDARD && (PCI || ISA)
-	select FW_LOADER
-	help
-	  This driver supports Cyclades Z and Y multiserial boards.
-	  You would need something like this to connect more than two modems to
-	  your Linux box, for instance in order to become a dial-in server.
-
-	  For information about the Cyclades-Z card, read
-	  <file:Documentation/driver-api/serial/cyclades_z.rst>.
-
-	  To compile this driver as a module, choose M here: the
-	  module will be called cyclades.
-
-	  If you haven't heard about it, it's safe to say N.
-
-config CYZ_INTR
-	bool "Cyclades-Z interrupt mode operation"
-	depends on CYCLADES && PCI
-	help
-	  The Cyclades-Z family of multiport cards allows 2 (two) driver op
-	  modes: polling and interrupt. In polling mode, the driver will check
-	  the status of the Cyclades-Z ports every certain amount of time
-	  (which is called polling cycle and is configurable). In interrupt
-	  mode, it will use an interrupt line (IRQ) in order to check the
-	  status of the Cyclades-Z ports. The default op mode is polling. If
-	  unsure, say N.
-
 config MOXA_INTELLIO
 	tristate "Moxa Intellio support"
 	depends on SERIAL_NONSTANDARD && (ISA || EISA || PCI)
diff --git a/drivers/tty/Makefile b/drivers/tty/Makefile
index 730de6bf048b..94eb2bf75763 100644
--- a/drivers/tty/Makefile
+++ b/drivers/tty/Makefile
@@ -18,7 +18,6 @@ obj-$(CONFIG_SERIAL_DEV_BUS)	+= serdev/
 
 # tty drivers
 obj-$(CONFIG_AMIGA_BUILTIN_SERIAL) += amiserial.o
-obj-$(CONFIG_CYCLADES)		+= cyclades.o
 obj-$(CONFIG_ISI)		+= isicom.o
 obj-$(CONFIG_MOXA_INTELLIO)	+= moxa.o
 obj-$(CONFIG_MOXA_SMARTIO)	+= mxser.o
diff --git a/drivers/tty/cyclades.c b/drivers/tty/cyclades.c
deleted file mode 100644
index 097266342e5e..000000000000
--- a/drivers/tty/cyclades.c
+++ /dev/null
@@ -1,4119 +0,0 @@
-// SPDX-License-Identifier: GPL-2.0
-#undef	BLOCKMOVE
-#define	Z_WAKE
-#undef	Z_EXT_CHARS_IN_BUFFER
-
-/*
- * This file contains the driver for the Cyclades async multiport
- * serial boards.
- *
- * Initially written by Randolph Bentson <bentson@grieg.seaslug.org>.
- * Modified and maintained by Marcio Saito <marcio@cyclades.com>.
- *
- * Copyright (C) 2007-2009 Jiri Slaby <jirislaby@gmail.com>
- *
- * Much of the design and some of the code came from serial.c
- * which was copyright (C) 1991, 1992  Linus Torvalds.  It was
- * extensively rewritten by Theodore Ts'o, 8/16/92 -- 9/14/92,
- * and then fixed as suggested by Michael K. Johnson 12/12/92.
- * Converted to pci probing and cleaned up by Jiri Slaby.
- *
- */
-
-#define CY_VERSION	"2.6"
-
-/* If you need to install more boards than NR_CARDS, change the constant
-   in the definition below. No other change is necessary to support up to
-   eight boards. Beyond that you'll have to extend cy_isa_addresses. */
-
-#define NR_CARDS	4
-
-/*
-   If the total number of ports is larger than NR_PORTS, change this
-   constant in the definition below. No other change is necessary to
-   support more boards/ports. */
-
-#define NR_PORTS	256
-
-#define ZO_V1	0
-#define ZO_V2	1
-#define ZE_V1	2
-
-#define	SERIAL_PARANOIA_CHECK
-#undef	CY_DEBUG_OPEN
-#undef	CY_DEBUG_THROTTLE
-#undef	CY_DEBUG_OTHER
-#undef	CY_DEBUG_IO
-#undef	CY_DEBUG_COUNT
-#undef	CY_DEBUG_DTR
-#undef	CY_DEBUG_INTERRUPTS
-#undef	CY_16Y_HACK
-#undef	CY_ENABLE_MONITORING
-#undef	CY_PCI_DEBUG
-
-/*
- * Include section
- */
-#include <linux/module.h>
-#include <linux/errno.h>
-#include <linux/signal.h>
-#include <linux/sched.h>
-#include <linux/timer.h>
-#include <linux/interrupt.h>
-#include <linux/tty.h>
-#include <linux/tty_flip.h>
-#include <linux/serial.h>
-#include <linux/major.h>
-#include <linux/string.h>
-#include <linux/fcntl.h>
-#include <linux/ptrace.h>
-#include <linux/cyclades.h>
-#include <linux/mm.h>
-#include <linux/ioport.h>
-#include <linux/init.h>
-#include <linux/delay.h>
-#include <linux/spinlock.h>
-#include <linux/bitops.h>
-#include <linux/firmware.h>
-#include <linux/device.h>
-#include <linux/slab.h>
-
-#include <linux/io.h>
-#include <linux/uaccess.h>
-
-#include <linux/kernel.h>
-#include <linux/pci.h>
-
-#include <linux/stat.h>
-#include <linux/proc_fs.h>
-#include <linux/seq_file.h>
-
-static void cy_send_xchar(struct tty_struct *tty, char ch);
-
-#ifndef SERIAL_XMIT_SIZE
-#define	SERIAL_XMIT_SIZE	(min(PAGE_SIZE, 4096))
-#endif
-
-/* firmware stuff */
-#define ZL_MAX_BLOCKS	16
-#define DRIVER_VERSION	0x02010203
-#define RAM_SIZE 0x80000
-
-enum zblock_type {
-	ZBLOCK_PRG = 0,
-	ZBLOCK_FPGA = 1
-};
-
-struct zfile_header {
-	char name[64];
-	char date[32];
-	char aux[32];
-	u32 n_config;
-	u32 config_offset;
-	u32 n_blocks;
-	u32 block_offset;
-	u32 reserved[9];
-} __attribute__ ((packed));
-
-struct zfile_config {
-	char name[64];
-	u32 mailbox;
-	u32 function;
-	u32 n_blocks;
-	u32 block_list[ZL_MAX_BLOCKS];
-} __attribute__ ((packed));
-
-struct zfile_block {
-	u32 type;
-	u32 file_offset;
-	u32 ram_offset;
-	u32 size;
-} __attribute__ ((packed));
-
-static struct tty_driver *cy_serial_driver;
-
-#ifdef CONFIG_ISA
-/* This is the address lookup table. The driver will probe for
-   Cyclom-Y/ISA boards at all addresses in here. If you want the
-   driver to probe addresses at a different address, add it to
-   this table.  If the driver is probing some other board and
-   causing problems, remove the offending address from this table.
-*/
-
-static unsigned int cy_isa_addresses[] = {
-	0xD0000,
-	0xD2000,
-	0xD4000,
-	0xD6000,
-	0xD8000,
-	0xDA000,
-	0xDC000,
-	0xDE000,
-	0, 0, 0, 0, 0, 0, 0, 0
-};
-
-#define NR_ISA_ADDRS ARRAY_SIZE(cy_isa_addresses)
-
-static long maddr[NR_CARDS];
-static int irq[NR_CARDS];
-
-module_param_hw_array(maddr, long, iomem, NULL, 0);
-module_param_hw_array(irq, int, irq, NULL, 0);
-
-#endif				/* CONFIG_ISA */
-
-/* This is the per-card data structure containing address, irq, number of
-   channels, etc. This driver supports a maximum of NR_CARDS cards.
-*/
-static struct cyclades_card cy_card[NR_CARDS];
-
-static int cy_next_channel;	/* next minor available */
-
-/*
- * This is used to look up the divisor speeds and the timeouts
- * We're normally limited to 15 distinct baud rates.  The extra
- * are accessed via settings in info->port.flags.
- *      0,     1,     2,     3,     4,     5,     6,     7,     8,     9,
- *     10,    11,    12,    13,    14,    15,    16,    17,    18,    19,
- *                                               HI            VHI
- *     20
- */
-static const int baud_table[] = {
-	0, 50, 75, 110, 134, 150, 200, 300, 600, 1200,
-	1800, 2400, 4800, 9600, 19200, 38400, 57600, 76800, 115200, 150000,
-	230400, 0
-};
-
-static const char baud_co_25[] = {	/* 25 MHz clock option table */
-	/* value =>    00    01   02    03    04 */
-	/* divide by    8    32   128   512  2048 */
-	0x00, 0x04, 0x04, 0x04, 0x04, 0x04, 0x03, 0x03, 0x03, 0x02,
-	0x02, 0x02, 0x01, 0x01, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00
-};
-
-static const char baud_bpr_25[] = {	/* 25 MHz baud rate period table */
-	0x00, 0xf5, 0xa3, 0x6f, 0x5c, 0x51, 0xf5, 0xa3, 0x51, 0xa3,
-	0x6d, 0x51, 0xa3, 0x51, 0xa3, 0x51, 0x36, 0x29, 0x1b, 0x15
-};
-
-static const char baud_co_60[] = {	/* 60 MHz clock option table (CD1400 J) */
-	/* value =>    00    01   02    03    04 */
-	/* divide by    8    32   128   512  2048 */
-	0x00, 0x00, 0x00, 0x04, 0x04, 0x04, 0x04, 0x04, 0x03, 0x03,
-	0x03, 0x02, 0x02, 0x01, 0x01, 0x00, 0x00, 0x00, 0x00, 0x00,
-	0x00
-};
-
-static const char baud_bpr_60[] = {	/* 60 MHz baud rate period table (CD1400 J) */
-	0x00, 0x82, 0x21, 0xff, 0xdb, 0xc3, 0x92, 0x62, 0xc3, 0x62,
-	0x41, 0xc3, 0x62, 0xc3, 0x62, 0xc3, 0x82, 0x62, 0x41, 0x32,
-	0x21
-};
-
-static const char baud_cor3[] = {	/* receive threshold */
-	0x0a, 0x0a, 0x0a, 0x0a, 0x0a, 0x0a, 0x0a, 0x0a, 0x0a, 0x0a,
-	0x0a, 0x0a, 0x0a, 0x09, 0x09, 0x08, 0x08, 0x08, 0x08, 0x07,
-	0x07
-};
-
-/*
- * The Cyclades driver implements HW flow control as any serial driver.
- * The cyclades_port structure member rflow and the vector rflow_thr
- * allows us to take advantage of a special feature in the CD1400 to avoid
- * data loss even when the system interrupt latency is too high. These flags
- * are to be used only with very special applications. Setting these flags
- * requires the use of a special cable (DTR and RTS reversed). In the new
- * CD1400-based boards (rev. 6.00 or later), there is no need for special
- * cables.
- */
-
-static const char rflow_thr[] = {	/* rflow threshold */
-	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,
-	0x00, 0x00, 0x00, 0x0a, 0x0a, 0x0a, 0x0a, 0x0a, 0x0a, 0x0a,
-	0x0a
-};
-
-/*  The Cyclom-Ye has placed the sequential chips in non-sequential
- *  address order.  This look-up table overcomes that problem.
- */
-static const unsigned int cy_chip_offset[] = { 0x0000,
-	0x0400,
-	0x0800,
-	0x0C00,
-	0x0200,
-	0x0600,
-	0x0A00,
-	0x0E00
-};
-
-/* PCI related definitions */
-
-#ifdef CONFIG_PCI
-static const struct pci_device_id cy_pci_dev_id[] = {
-	/* PCI < 1Mb */
-	{ PCI_DEVICE(PCI_VENDOR_ID_CYCLADES, PCI_DEVICE_ID_CYCLOM_Y_Lo) },
-	/* PCI > 1Mb */
-	{ PCI_DEVICE(PCI_VENDOR_ID_CYCLADES, PCI_DEVICE_ID_CYCLOM_Y_Hi) },
-	/* 4Y PCI < 1Mb */
-	{ PCI_DEVICE(PCI_VENDOR_ID_CYCLADES, PCI_DEVICE_ID_CYCLOM_4Y_Lo) },
-	/* 4Y PCI > 1Mb */
-	{ PCI_DEVICE(PCI_VENDOR_ID_CYCLADES, PCI_DEVICE_ID_CYCLOM_4Y_Hi) },
-	/* 8Y PCI < 1Mb */
-	{ PCI_DEVICE(PCI_VENDOR_ID_CYCLADES, PCI_DEVICE_ID_CYCLOM_8Y_Lo) },
-	/* 8Y PCI > 1Mb */
-	{ PCI_DEVICE(PCI_VENDOR_ID_CYCLADES, PCI_DEVICE_ID_CYCLOM_8Y_Hi) },
-	/* Z PCI < 1Mb */
-	{ PCI_DEVICE(PCI_VENDOR_ID_CYCLADES, PCI_DEVICE_ID_CYCLOM_Z_Lo) },
-	/* Z PCI > 1Mb */
-	{ PCI_DEVICE(PCI_VENDOR_ID_CYCLADES, PCI_DEVICE_ID_CYCLOM_Z_Hi) },
-	{ }			/* end of table */
-};
-MODULE_DEVICE_TABLE(pci, cy_pci_dev_id);
-#endif
-
-static void cy_start(struct tty_struct *);
-static void cy_set_line_char(struct cyclades_port *, struct tty_struct *);
-static int cyz_issue_cmd(struct cyclades_card *, __u32, __u8, __u32);
-#ifdef CONFIG_ISA
-static unsigned detect_isa_irq(void __iomem *);
-#endif				/* CONFIG_ISA */
-
-#ifndef CONFIG_CYZ_INTR
-static void cyz_poll(struct timer_list *);
-
-/* The Cyclades-Z polling cycle is defined by this variable */
-static long cyz_polling_cycle = CZ_DEF_POLL;
-
-static DEFINE_TIMER(cyz_timerlist, cyz_poll);
-
-#else				/* CONFIG_CYZ_INTR */
-static void cyz_rx_restart(struct timer_list *);
-#endif				/* CONFIG_CYZ_INTR */
-
-static void cyy_writeb(struct cyclades_port *port, u32 reg, u8 val)
-{
-	struct cyclades_card *card = port->card;
-
-	cy_writeb(port->u.cyy.base_addr + (reg << card->bus_index), val);
-}
-
-static u8 cyy_readb(struct cyclades_port *port, u32 reg)
-{
-	struct cyclades_card *card = port->card;
-
-	return readb(port->u.cyy.base_addr + (reg << card->bus_index));
-}
-
-static inline bool cy_is_Z(struct cyclades_card *card)
-{
-	return card->num_chips == (unsigned int)-1;
-}
-
-static inline bool __cyz_fpga_loaded(struct RUNTIME_9060 __iomem *ctl_addr)
-{
-	return readl(&ctl_addr->init_ctrl) & (1 << 17);
-}
-
-static inline bool cyz_fpga_loaded(struct cyclades_card *card)
-{
-	return __cyz_fpga_loaded(card->ctl_addr.p9060);
-}
-
-static bool cyz_is_loaded(struct cyclades_card *card)
-{
-	struct FIRM_ID __iomem *fw_id = card->base_addr + ID_ADDRESS;
-
-	return (card->hw_ver == ZO_V1 || cyz_fpga_loaded(card)) &&
-			readl(&fw_id->signature) == ZFIRM_ID;
-}
-
-static int serial_paranoia_check(struct cyclades_port *info,
-		const char *name, const char *routine)
-{
-#ifdef SERIAL_PARANOIA_CHECK
-	if (!info) {
-		printk(KERN_WARNING "cyc Warning: null cyclades_port for (%s) "
-				"in %s\n", name, routine);
-		return 1;
-	}
-
-	if (info->magic != CYCLADES_MAGIC) {
-		printk(KERN_WARNING "cyc Warning: bad magic number for serial "
-				"struct (%s) in %s\n", name, routine);
-		return 1;
-	}
-#endif
-	return 0;
-}
-
-/***********************************************************/
-/********* Start of block of Cyclom-Y specific code ********/
-
-/* This routine waits up to 1000 micro-seconds for the previous
-   command to the Cirrus chip to complete and then issues the
-   new command.  An error is returned if the previous command
-   didn't finish within the time limit.
-
-   This function is only called from inside spinlock-protected code.
- */
-static int __cyy_issue_cmd(void __iomem *base_addr, u8 cmd, int index)
-{
-	void __iomem *ccr = base_addr + (CyCCR << index);
-	unsigned int i;
-
-	/* Check to see that the previous command has completed */
-	for (i = 0; i < 100; i++) {
-		if (readb(ccr) == 0)
-			break;
-		udelay(10L);
-	}
-	/* if the CCR never cleared, the previous command
-	   didn't finish within the "reasonable time" */
-	if (i == 100)
-		return -1;
-
-	/* Issue the new command */
-	cy_writeb(ccr, cmd);
-
-	return 0;
-}
-
-static inline int cyy_issue_cmd(struct cyclades_port *port, u8 cmd)
-{
-	return __cyy_issue_cmd(port->u.cyy.base_addr, cmd,
-			port->card->bus_index);
-}
-
-#ifdef CONFIG_ISA
-/* ISA interrupt detection code */
-static unsigned detect_isa_irq(void __iomem *address)
-{
-	int irq;
-	unsigned long irqs, flags;
-	int save_xir, save_car;
-	int index = 0;		/* IRQ probing is only for ISA */
-
-	/* forget possible initially masked and pending IRQ */
-	irq = probe_irq_off(probe_irq_on());
-
-	/* Clear interrupts on the board first */
-	cy_writeb(address + (Cy_ClrIntr << index), 0);
-	/* Cy_ClrIntr is 0x1800 */
-
-	irqs = probe_irq_on();
-	/* Wait ... */
-	msleep(5);
-
-	/* Enable the Tx interrupts on the CD1400 */
-	local_irq_save(flags);
-	cy_writeb(address + (CyCAR << index), 0);
-	__cyy_issue_cmd(address, CyCHAN_CTL | CyENB_XMTR, index);
-
-	cy_writeb(address + (CyCAR << index), 0);
-	cy_writeb(address + (CySRER << index),
-		  readb(address + (CySRER << index)) | CyTxRdy);
-	local_irq_restore(flags);
-
-	/* Wait ... */
-	msleep(5);
-
-	/* Check which interrupt is in use */
-	irq = probe_irq_off(irqs);
-
-	/* Clean up */
-	save_xir = (u_char) readb(address + (CyTIR << index));
-	save_car = readb(address + (CyCAR << index));
-	cy_writeb(address + (CyCAR << index), (save_xir & 0x3));
-	cy_writeb(address + (CySRER << index),
-		  readb(address + (CySRER << index)) & ~CyTxRdy);
-	cy_writeb(address + (CyTIR << index), (save_xir & 0x3f));
-	cy_writeb(address + (CyCAR << index), (save_car));
-	cy_writeb(address + (Cy_ClrIntr << index), 0);
-	/* Cy_ClrIntr is 0x1800 */
-
-	return (irq > 0) ? irq : 0;
-}
-#endif				/* CONFIG_ISA */
-
-static void cyy_chip_rx(struct cyclades_card *cinfo, int chip,
-		void __iomem *base_addr)
-{
-	struct cyclades_port *info;
-	struct tty_port *port;
-	int len, index = cinfo->bus_index;
-	u8 ivr, save_xir, channel, save_car, data, char_count;
-
-#ifdef CY_DEBUG_INTERRUPTS
-	printk(KERN_DEBUG "cyy_interrupt: rcvd intr, chip %d\n", chip);
-#endif
-	/* determine the channel & change to that context */
-	save_xir = readb(base_addr + (CyRIR << index));
-	channel = save_xir & CyIRChannel;
-	info = &cinfo->ports[channel + chip * 4];
-	port = &info->port;
-	save_car = cyy_readb(info, CyCAR);
-	cyy_writeb(info, CyCAR, save_xir);
-	ivr = cyy_readb(info, CyRIVR) & CyIVRMask;
-
-	/* there is an open port for this data */
-	if (ivr == CyIVRRxEx) {	/* exception */
-		data = cyy_readb(info, CyRDSR);
-
-		/* For statistics only */
-		if (data & CyBREAK)
-			info->icount.brk++;
-		else if (data & CyFRAME)
-			info->icount.frame++;
-		else if (data & CyPARITY)
-			info->icount.parity++;
-		else if (data & CyOVERRUN)
-			info->icount.overrun++;
-
-		if (data & info->ignore_status_mask) {
-			info->icount.rx++;
-			return;
-		}
-		if (tty_buffer_request_room(port, 1)) {
-			if (data & info->read_status_mask) {
-				if (data & CyBREAK) {
-					tty_insert_flip_char(port,
-						cyy_readb(info, CyRDSR),
-						TTY_BREAK);
-					info->icount.rx++;
-					if (port->flags & ASYNC_SAK) {
-						struct tty_struct *tty =
-							tty_port_tty_get(port);
-						if (tty) {
-							do_SAK(tty);
-							tty_kref_put(tty);
-						}
-					}
-				} else if (data & CyFRAME) {
-					tty_insert_flip_char(port,
-						cyy_readb(info, CyRDSR),
-						TTY_FRAME);
-					info->icount.rx++;
-					info->idle_stats.frame_errs++;
-				} else if (data & CyPARITY) {
-					/* Pieces of seven... */
-					tty_insert_flip_char(port,
-						cyy_readb(info, CyRDSR),
-						TTY_PARITY);
-					info->icount.rx++;
-					info->idle_stats.parity_errs++;
-				} else if (data & CyOVERRUN) {
-					tty_insert_flip_char(port, 0,
-							TTY_OVERRUN);
-					info->icount.rx++;
-					/* If the flip buffer itself is
-					   overflowing, we still lose
-					   the next incoming character.
-					 */
-					tty_insert_flip_char(port,
-						cyy_readb(info, CyRDSR),
-						TTY_FRAME);
-					info->icount.rx++;
-					info->idle_stats.overruns++;
-				/* These two conditions may imply */
-				/* a normal read should be done. */
-				/* } else if(data & CyTIMEOUT) { */
-				/* } else if(data & CySPECHAR) { */
-				} else {
-					tty_insert_flip_char(port, 0,
-							TTY_NORMAL);
-					info->icount.rx++;
-				}
-			} else {
-				tty_insert_flip_char(port, 0, TTY_NORMAL);
-				info->icount.rx++;
-			}
-		} else {
-			/* there was a software buffer overrun and nothing
-			 * could be done about it!!! */
-			info->icount.buf_overrun++;
-			info->idle_stats.overruns++;
-		}
-	} else {	/* normal character reception */
-		/* load # chars available from the chip */
-		char_count = cyy_readb(info, CyRDCR);
-
-#ifdef CY_ENABLE_MONITORING
-		++info->mon.int_count;
-		info->mon.char_count += char_count;
-		if (char_count > info->mon.char_max)
-			info->mon.char_max = char_count;
-		info->mon.char_last = char_count;
-#endif
-		len = tty_buffer_request_room(port, char_count);
-		while (len--) {
-			data = cyy_readb(info, CyRDSR);
-			tty_insert_flip_char(port, data, TTY_NORMAL);
-			info->idle_stats.recv_bytes++;
-			info->icount.rx++;
-#ifdef CY_16Y_HACK
-			udelay(10L);
-#endif
-		}
-		info->idle_stats.recv_idle = jiffies;
-	}
-	tty_schedule_flip(port);
-
-	/* end of service */
-	cyy_writeb(info, CyRIR, save_xir & 0x3f);
-	cyy_writeb(info, CyCAR, save_car);
-}
-
-static void cyy_chip_tx(struct cyclades_card *cinfo, unsigned int chip,
-		void __iomem *base_addr)
-{
-	struct cyclades_port *info;
-	struct tty_struct *tty;
-	int char_count, index = cinfo->bus_index;
-	u8 save_xir, channel, save_car, outch;
-
-	/* Since we only get here when the transmit buffer
-	   is empty, we know we can always stuff a dozen
-	   characters. */
-#ifdef CY_DEBUG_INTERRUPTS
-	printk(KERN_DEBUG "cyy_interrupt: xmit intr, chip %d\n", chip);
-#endif
-
-	/* determine the channel & change to that context */
-	save_xir = readb(base_addr + (CyTIR << index));
-	channel = save_xir & CyIRChannel;
-	save_car = readb(base_addr + (CyCAR << index));
-	cy_writeb(base_addr + (CyCAR << index), save_xir);
-
-	info = &cinfo->ports[channel + chip * 4];
-	tty = tty_port_tty_get(&info->port);
-	if (tty == NULL) {
-		cyy_writeb(info, CySRER, cyy_readb(info, CySRER) & ~CyTxRdy);
-		goto end;
-	}
-
-	/* load the on-chip space for outbound data */
-	char_count = info->xmit_fifo_size;
-
-	if (info->x_char) {	/* send special char */
-		outch = info->x_char;
-		cyy_writeb(info, CyTDR, outch);
-		char_count--;
-		info->icount.tx++;
-		info->x_char = 0;
-	}
-
-	if (info->breakon || info->breakoff) {
-		if (info->breakon) {
-			cyy_writeb(info, CyTDR, 0);
-			cyy_writeb(info, CyTDR, 0x81);
-			info->breakon = 0;
-			char_count -= 2;
-		}
-		if (info->breakoff) {
-			cyy_writeb(info, CyTDR, 0);
-			cyy_writeb(info, CyTDR, 0x83);
-			info->breakoff = 0;
-			char_count -= 2;
-		}
-	}
-
-	while (char_count-- > 0) {
-		if (!info->xmit_cnt) {
-			if (cyy_readb(info, CySRER) & CyTxMpty) {
-				cyy_writeb(info, CySRER,
-					cyy_readb(info, CySRER) & ~CyTxMpty);
-			} else {
-				cyy_writeb(info, CySRER, CyTxMpty |
-					(cyy_readb(info, CySRER) & ~CyTxRdy));
-			}
-			goto done;
-		}
-		if (info->port.xmit_buf == NULL) {
-			cyy_writeb(info, CySRER,
-				cyy_readb(info, CySRER) & ~CyTxRdy);
-			goto done;
-		}
-		if (tty->stopped || tty->hw_stopped) {
-			cyy_writeb(info, CySRER,
-				cyy_readb(info, CySRER) & ~CyTxRdy);
-			goto done;
-		}
-		/* Because the Embedded Transmit Commands have been enabled,
-		 * we must check to see if the escape character, NULL, is being
-		 * sent. If it is, we must ensure that there is room for it to
-		 * be doubled in the output stream.  Therefore we no longer
-		 * advance the pointer when the character is fetched, but
-		 * rather wait until after the check for a NULL output
-		 * character. This is necessary because there may not be room
-		 * for the two chars needed to send a NULL.)
-		 */
-		outch = info->port.xmit_buf[info->xmit_tail];
-		if (outch) {
-			info->xmit_cnt--;
-			info->xmit_tail = (info->xmit_tail + 1) &
-					(SERIAL_XMIT_SIZE - 1);
-			cyy_writeb(info, CyTDR, outch);
-			info->icount.tx++;
-		} else {
-			if (char_count > 1) {
-				info->xmit_cnt--;
-				info->xmit_tail = (info->xmit_tail + 1) &
-					(SERIAL_XMIT_SIZE - 1);
-				cyy_writeb(info, CyTDR, outch);
-				cyy_writeb(info, CyTDR, 0);
-				info->icount.tx++;
-				char_count--;
-			}
-		}
-	}
-
-done:
-	tty_wakeup(tty);
-	tty_kref_put(tty);
-end:
-	/* end of service */
-	cyy_writeb(info, CyTIR, save_xir & 0x3f);
-	cyy_writeb(info, CyCAR, save_car);
-}
-
-static void cyy_chip_modem(struct cyclades_card *cinfo, int chip,
-		void __iomem *base_addr)
-{
-	struct cyclades_port *info;
-	struct tty_struct *tty;
-	int index = cinfo->bus_index;
-	u8 save_xir, channel, save_car, mdm_change, mdm_status;
-
-	/* determine the channel & change to that context */
-	save_xir = readb(base_addr + (CyMIR << index));
-	channel = save_xir & CyIRChannel;
-	info = &cinfo->ports[channel + chip * 4];
-	save_car = cyy_readb(info, CyCAR);
-	cyy_writeb(info, CyCAR, save_xir);
-
-	mdm_change = cyy_readb(info, CyMISR);
-	mdm_status = cyy_readb(info, CyMSVR1);
-
-	tty = tty_port_tty_get(&info->port);
-	if (!tty)
-		goto end;
-
-	if (mdm_change & CyANY_DELTA) {
-		/* For statistics only */
-		if (mdm_change & CyDCD)
-			info->icount.dcd++;
-		if (mdm_change & CyCTS)
-			info->icount.cts++;
-		if (mdm_change & CyDSR)
-			info->icount.dsr++;
-		if (mdm_change & CyRI)
-			info->icount.rng++;
-
-		wake_up_interruptible(&info->port.delta_msr_wait);
-	}
-
-	if ((mdm_change & CyDCD) && tty_port_check_carrier(&info->port)) {
-		if (mdm_status & CyDCD)
-			wake_up_interruptible(&info->port.open_wait);
-		else
-			tty_hangup(tty);
-	}
-	if ((mdm_change & CyCTS) && tty_port_cts_enabled(&info->port)) {
-		if (tty->hw_stopped) {
-			if (mdm_status & CyCTS) {
-				/* cy_start isn't used
-				   because... !!! */
-				tty->hw_stopped = 0;
-				cyy_writeb(info, CySRER,
-					cyy_readb(info, CySRER) | CyTxRdy);
-				tty_wakeup(tty);
-			}
-		} else {
-			if (!(mdm_status & CyCTS)) {
-				/* cy_stop isn't used
-				   because ... !!! */
-				tty->hw_stopped = 1;
-				cyy_writeb(info, CySRER,
-					cyy_readb(info, CySRER) & ~CyTxRdy);
-			}
-		}
-	}
-/*	if (mdm_change & CyDSR) {
-	}
-	if (mdm_change & CyRI) {
-	}*/
-	tty_kref_put(tty);
-end:
-	/* end of service */
-	cyy_writeb(info, CyMIR, save_xir & 0x3f);
-	cyy_writeb(info, CyCAR, save_car);
-}
-
-/* The real interrupt service routine is called
-   whenever the card wants its hand held--chars
-   received, out buffer empty, modem change, etc.
- */
-static irqreturn_t cyy_interrupt(int irq, void *dev_id)
-{
-	int status;
-	struct cyclades_card *cinfo = dev_id;
-	void __iomem *base_addr, *card_base_addr;
-	unsigned int chip, too_many, had_work;
-	int index;
-
-	if (unlikely(cinfo == NULL)) {
-#ifdef CY_DEBUG_INTERRUPTS
-		printk(KERN_DEBUG "cyy_interrupt: spurious interrupt %d\n",
-				irq);
-#endif
-		return IRQ_NONE;	/* spurious interrupt */
-	}
-
-	card_base_addr = cinfo->base_addr;
-	index = cinfo->bus_index;
-
-	/* card was not initialized yet (e.g. DEBUG_SHIRQ) */
-	if (unlikely(card_base_addr == NULL))
-		return IRQ_HANDLED;
-
-	/* This loop checks all chips in the card.  Make a note whenever
-	   _any_ chip had some work to do, as this is considered an
-	   indication that there will be more to do.  Only when no chip
-	   has any work does this outermost loop exit.
-	 */
-	do {
-		had_work = 0;
-		for (chip = 0; chip < cinfo->num_chips; chip++) {
-			base_addr = cinfo->base_addr +
-					(cy_chip_offset[chip] << index);
-			too_many = 0;
-			while ((status = readb(base_addr +
-						(CySVRR << index))) != 0x00) {
-				had_work++;
-			/* The purpose of the following test is to ensure that
-			   no chip can monopolize the driver.  This forces the
-			   chips to be checked in a round-robin fashion (after
-			   draining each of a bunch (1000) of characters).
-			 */
-				if (1000 < too_many++)
-					break;
-				spin_lock(&cinfo->card_lock);
-				if (status & CySRReceive) /* rx intr */
-					cyy_chip_rx(cinfo, chip, base_addr);
-				if (status & CySRTransmit) /* tx intr */
-					cyy_chip_tx(cinfo, chip, base_addr);
-				if (status & CySRModem) /* modem intr */
-					cyy_chip_modem(cinfo, chip, base_addr);
-				spin_unlock(&cinfo->card_lock);
-			}
-		}
-	} while (had_work);
-
-	/* clear interrupts */
-	spin_lock(&cinfo->card_lock);
-	cy_writeb(card_base_addr + (Cy_ClrIntr << index), 0);
-	/* Cy_ClrIntr is 0x1800 */
-	spin_unlock(&cinfo->card_lock);
-	return IRQ_HANDLED;
-}				/* cyy_interrupt */
-
-static void cyy_change_rts_dtr(struct cyclades_port *info, unsigned int set,
-		unsigned int clear)
-{
-	struct cyclades_card *card = info->card;
-	int channel = info->line - card->first_line;
-	u32 rts, dtr, msvrr, msvrd;
-
-	channel &= 0x03;
-
-	if (info->rtsdtr_inv) {
-		msvrr = CyMSVR2;
-		msvrd = CyMSVR1;
-		rts = CyDTR;
-		dtr = CyRTS;
-	} else {
-		msvrr = CyMSVR1;
-		msvrd = CyMSVR2;
-		rts = CyRTS;
-		dtr = CyDTR;
-	}
-	if (set & TIOCM_RTS) {
-		cyy_writeb(info, CyCAR, channel);
-		cyy_writeb(info, msvrr, rts);
-	}
-	if (clear & TIOCM_RTS) {
-		cyy_writeb(info, CyCAR, channel);
-		cyy_writeb(info, msvrr, ~rts);
-	}
-	if (set & TIOCM_DTR) {
-		cyy_writeb(info, CyCAR, channel);
-		cyy_writeb(info, msvrd, dtr);
-#ifdef CY_DEBUG_DTR
-		printk(KERN_DEBUG "cyc:set_modem_info raising DTR\n");
-		printk(KERN_DEBUG "     status: 0x%x, 0x%x\n",
-			cyy_readb(info, CyMSVR1),
-			cyy_readb(info, CyMSVR2));
-#endif
-	}
-	if (clear & TIOCM_DTR) {
-		cyy_writeb(info, CyCAR, channel);
-		cyy_writeb(info, msvrd, ~dtr);
-#ifdef CY_DEBUG_DTR
-		printk(KERN_DEBUG "cyc:set_modem_info dropping DTR\n");
-		printk(KERN_DEBUG "     status: 0x%x, 0x%x\n",
-			cyy_readb(info, CyMSVR1),
-			cyy_readb(info, CyMSVR2));
-#endif
-	}
-}
-
-/***********************************************************/
-/********* End of block of Cyclom-Y specific code **********/
-/******** Start of block of Cyclades-Z specific code *******/
-/***********************************************************/
-
-static int
-cyz_fetch_msg(struct cyclades_card *cinfo,
-		__u32 *channel, __u8 *cmd, __u32 *param)
-{
-	struct BOARD_CTRL __iomem *board_ctrl = cinfo->board_ctrl;
-	unsigned long loc_doorbell;
-
-	loc_doorbell = readl(&cinfo->ctl_addr.p9060->loc_doorbell);
-	if (loc_doorbell) {
-		*cmd = (char)(0xff & loc_doorbell);
-		*channel = readl(&board_ctrl->fwcmd_channel);
-		*param = (__u32) readl(&board_ctrl->fwcmd_param);
-		cy_writel(&cinfo->ctl_addr.p9060->loc_doorbell, 0xffffffff);
-		return 1;
-	}
-	return 0;
-}				/* cyz_fetch_msg */
-
-static int
-cyz_issue_cmd(struct cyclades_card *cinfo,
-		__u32 channel, __u8 cmd, __u32 param)
-{
-	struct BOARD_CTRL __iomem *board_ctrl = cinfo->board_ctrl;
-	__u32 __iomem *pci_doorbell;
-	unsigned int index;
-
-	if (!cyz_is_loaded(cinfo))
-		return -1;
-
-	index = 0;
-	pci_doorbell = &cinfo->ctl_addr.p9060->pci_doorbell;
-	while ((readl(pci_doorbell) & 0xff) != 0) {
-		if (index++ == 1000)
-			return (int)(readl(pci_doorbell) & 0xff);
-		udelay(50L);
-	}
-	cy_writel(&board_ctrl->hcmd_channel, channel);
-	cy_writel(&board_ctrl->hcmd_param, param);
-	cy_writel(pci_doorbell, (long)cmd);
-
-	return 0;
-}				/* cyz_issue_cmd */
-
-static void cyz_handle_rx(struct cyclades_port *info)
-{
-	struct BUF_CTRL __iomem *buf_ctrl = info->u.cyz.buf_ctrl;
-	struct cyclades_card *cinfo = info->card;
-	struct tty_port *port = &info->port;
-	unsigned int char_count;
-	int len;
-#ifdef BLOCKMOVE
-	unsigned char *buf;
-#else
-	char data;
-#endif
-	__u32 rx_put, rx_get, new_rx_get, rx_bufsize, rx_bufaddr;
-
-	rx_get = new_rx_get = readl(&buf_ctrl->rx_get);
-	rx_put = readl(&buf_ctrl->rx_put);
-	rx_bufsize = readl(&buf_ctrl->rx_bufsize);
-	rx_bufaddr = readl(&buf_ctrl->rx_bufaddr);
-	if (rx_put >= rx_get)
-		char_count = rx_put - rx_get;
-	else
-		char_count = rx_put - rx_get + rx_bufsize;
-
-	if (!char_count)
-		return;
-
-#ifdef CY_ENABLE_MONITORING
-	info->mon.int_count++;
-	info->mon.char_count += char_count;
-	if (char_count > info->mon.char_max)
-		info->mon.char_max = char_count;
-	info->mon.char_last = char_count;
-#endif
-
-#ifdef BLOCKMOVE
-	/* we'd like to use memcpy(t, f, n) and memset(s, c, count)
-	   for performance, but because of buffer boundaries, there
-	   may be several steps to the operation */
-	while (1) {
-		len = tty_prepare_flip_string(port, &buf,
-				char_count);
-		if (!len)
-			break;
-
-		len = min_t(unsigned int, min(len, char_count),
-				rx_bufsize - new_rx_get);
-
-		memcpy_fromio(buf, cinfo->base_addr +
-				rx_bufaddr + new_rx_get, len);
-
-		new_rx_get = (new_rx_get + len) &
-				(rx_bufsize - 1);
-		char_count -= len;
-		info->icount.rx += len;
-		info->idle_stats.recv_bytes += len;
-	}
-#else
-	len = tty_buffer_request_room(port, char_count);
-	while (len--) {
-		data = readb(cinfo->base_addr + rx_bufaddr +
-				new_rx_get);
-		new_rx_get = (new_rx_get + 1) &
-					(rx_bufsize - 1);
-		tty_insert_flip_char(port, data, TTY_NORMAL);
-		info->idle_stats.recv_bytes++;
-		info->icount.rx++;
-	}
-#endif
-#ifdef CONFIG_CYZ_INTR
-	/* Recalculate the number of chars in the RX buffer and issue
-	   a cmd in case it's higher than the RX high water mark */
-	rx_put = readl(&buf_ctrl->rx_put);
-	if (rx_put >= rx_get)
-		char_count = rx_put - rx_get;
-	else
-		char_count = rx_put - rx_get + rx_bufsize;
-	if (char_count >= readl(&buf_ctrl->rx_threshold) &&
-			!timer_pending(&info->rx_full_timer))
-		mod_timer(&info->rx_full_timer, jiffies + 1);
-#endif
-	info->idle_stats.recv_idle = jiffies;
-	tty_schedule_flip(&info->port);
-
-	/* Update rx_get */
-	cy_writel(&buf_ctrl->rx_get, new_rx_get);
-}
-
-static void cyz_handle_tx(struct cyclades_port *info)
-{
-	struct BUF_CTRL __iomem *buf_ctrl = info->u.cyz.buf_ctrl;
-	struct cyclades_card *cinfo = info->card;
-	struct tty_struct *tty;
-	u8 data;
-	unsigned int char_count;
-#ifdef BLOCKMOVE
-	int small_count;
-#endif
-	__u32 tx_put, tx_get, tx_bufsize, tx_bufaddr;
-
-	if (info->xmit_cnt <= 0)	/* Nothing to transmit */
-		return;
-
-	tx_get = readl(&buf_ctrl->tx_get);
-	tx_put = readl(&buf_ctrl->tx_put);
-	tx_bufsize = readl(&buf_ctrl->tx_bufsize);
-	tx_bufaddr = readl(&buf_ctrl->tx_bufaddr);
-	if (tx_put >= tx_get)
-		char_count = tx_get - tx_put - 1 + tx_bufsize;
-	else
-		char_count = tx_get - tx_put - 1;
-
-	if (!char_count)
-		return;
-		
-	tty = tty_port_tty_get(&info->port);
-	if (tty == NULL)
-		goto ztxdone;
-
-	if (info->x_char) {	/* send special char */
-		data = info->x_char;
-
-		cy_writeb(cinfo->base_addr + tx_bufaddr + tx_put, data);
-		tx_put = (tx_put + 1) & (tx_bufsize - 1);
-		info->x_char = 0;
-		char_count--;
-		info->icount.tx++;
-	}
-#ifdef BLOCKMOVE
-	while (0 < (small_count = min_t(unsigned int,
-			tx_bufsize - tx_put, min_t(unsigned int,
-				(SERIAL_XMIT_SIZE - info->xmit_tail),
-				min_t(unsigned int, info->xmit_cnt,
-					char_count))))) {
-
-		memcpy_toio((char *)(cinfo->base_addr + tx_bufaddr + tx_put),
-				&info->port.xmit_buf[info->xmit_tail],
-				small_count);
-
-		tx_put = (tx_put + small_count) & (tx_bufsize - 1);
-		char_count -= small_count;
-		info->icount.tx += small_count;
-		info->xmit_cnt -= small_count;
-		info->xmit_tail = (info->xmit_tail + small_count) &
-				(SERIAL_XMIT_SIZE - 1);
-	}
-#else
-	while (info->xmit_cnt && char_count) {
-		data = info->port.xmit_buf[info->xmit_tail];
-		info->xmit_cnt--;
-		info->xmit_tail = (info->xmit_tail + 1) &
-				(SERIAL_XMIT_SIZE - 1);
-
-		cy_writeb(cinfo->base_addr + tx_bufaddr + tx_put, data);
-		tx_put = (tx_put + 1) & (tx_bufsize - 1);
-		char_count--;
-		info->icount.tx++;
-	}
-#endif
-	tty_wakeup(tty);
-	tty_kref_put(tty);
-ztxdone:
-	/* Update tx_put */
-	cy_writel(&buf_ctrl->tx_put, tx_put);
-}
-
-static void cyz_handle_cmd(struct cyclades_card *cinfo)
-{
-	struct BOARD_CTRL __iomem *board_ctrl = cinfo->board_ctrl;
-	struct cyclades_port *info;
-	__u32 channel, param, fw_ver;
-	__u8 cmd;
-	int special_count;
-	int delta_count;
-
-	fw_ver = readl(&board_ctrl->fw_version);
-
-	while (cyz_fetch_msg(cinfo, &channel, &cmd, &param) == 1) {
-		special_count = 0;
-		delta_count = 0;
-		info = &cinfo->ports[channel];
-
-		switch (cmd) {
-		case C_CM_PR_ERROR:
-			tty_insert_flip_char(&info->port, 0, TTY_PARITY);
-			info->icount.rx++;
-			special_count++;
-			break;
-		case C_CM_FR_ERROR:
-			tty_insert_flip_char(&info->port, 0, TTY_FRAME);
-			info->icount.rx++;
-			special_count++;
-			break;
-		case C_CM_RXBRK:
-			tty_insert_flip_char(&info->port, 0, TTY_BREAK);
-			info->icount.rx++;
-			special_count++;
-			break;
-		case C_CM_MDCD:
-			info->icount.dcd++;
-			delta_count++;
-			if (tty_port_check_carrier(&info->port)) {
-				u32 dcd = fw_ver > 241 ? param :
-					readl(&info->u.cyz.ch_ctrl->rs_status);
-				if (dcd & C_RS_DCD)
-					wake_up_interruptible(&info->port.open_wait);
-				else
-					tty_port_tty_hangup(&info->port, false);
-			}
-			break;
-		case C_CM_MCTS:
-			info->icount.cts++;
-			delta_count++;
-			break;
-		case C_CM_MRI:
-			info->icount.rng++;
-			delta_count++;
-			break;
-		case C_CM_MDSR:
-			info->icount.dsr++;
-			delta_count++;
-			break;
-#ifdef Z_WAKE
-		case C_CM_IOCTLW:
-			complete(&info->shutdown_wait);
-			break;
-#endif
-#ifdef CONFIG_CYZ_INTR
-		case C_CM_RXHIWM:
-		case C_CM_RXNNDT:
-		case C_CM_INTBACK2:
-			/* Reception Interrupt */
-#ifdef CY_DEBUG_INTERRUPTS
-			printk(KERN_DEBUG "cyz_interrupt: rcvd intr, card %d, "
-					"port %ld\n", info->card, channel);
-#endif
-			cyz_handle_rx(info);
-			break;
-		case C_CM_TXBEMPTY:
-		case C_CM_TXLOWWM:
-		case C_CM_INTBACK:
-			/* Transmission Interrupt */
-#ifdef CY_DEBUG_INTERRUPTS
-			printk(KERN_DEBUG "cyz_interrupt: xmit intr, card %d, "
-					"port %ld\n", info->card, channel);
-#endif
-			cyz_handle_tx(info);
-			break;
-#endif				/* CONFIG_CYZ_INTR */
-		case C_CM_FATAL:
-			/* should do something with this !!! */
-			break;
-		default:
-			break;
-		}
-		if (delta_count)
-			wake_up_interruptible(&info->port.delta_msr_wait);
-		if (special_count)
-			tty_schedule_flip(&info->port);
-	}
-}
-
-#ifdef CONFIG_CYZ_INTR
-static irqreturn_t cyz_interrupt(int irq, void *dev_id)
-{
-	struct cyclades_card *cinfo = dev_id;
-
-	if (unlikely(!cyz_is_loaded(cinfo))) {
-#ifdef CY_DEBUG_INTERRUPTS
-		printk(KERN_DEBUG "cyz_interrupt: board not yet loaded "
-				"(IRQ%d).\n", irq);
-#endif
-		return IRQ_NONE;
-	}
-
-	/* Handle the interrupts */
-	cyz_handle_cmd(cinfo);
-
-	return IRQ_HANDLED;
-}				/* cyz_interrupt */
-
-static void cyz_rx_restart(struct timer_list *t)
-{
-	struct cyclades_port *info = from_timer(info, t, rx_full_timer);
-	struct cyclades_card *card = info->card;
-	int retval;
-	__u32 channel = info->line - card->first_line;
-	unsigned long flags;
-
-	spin_lock_irqsave(&card->card_lock, flags);
-	retval = cyz_issue_cmd(card, channel, C_CM_INTBACK2, 0L);
-	if (retval != 0) {
-		printk(KERN_ERR "cyc:cyz_rx_restart retval on ttyC%d was %x\n",
-			info->line, retval);
-	}
-	spin_unlock_irqrestore(&card->card_lock, flags);
-}
-
-#else				/* CONFIG_CYZ_INTR */
-
-static void cyz_poll(struct timer_list *unused)
-{
-	struct cyclades_card *cinfo;
-	struct cyclades_port *info;
-	unsigned long expires = jiffies + HZ;
-	unsigned int port, card;
-
-	for (card = 0; card < NR_CARDS; card++) {
-		cinfo = &cy_card[card];
-
-		if (!cy_is_Z(cinfo))
-			continue;
-		if (!cyz_is_loaded(cinfo))
-			continue;
-
-	/* Skip first polling cycle to avoid racing conditions with the FW */
-		if (!cinfo->intr_enabled) {
-			cinfo->intr_enabled = 1;
-			continue;
-		}
-
-		cyz_handle_cmd(cinfo);
-
-		for (port = 0; port < cinfo->nports; port++) {
-			info = &cinfo->ports[port];
-
-			if (!info->throttle)
-				cyz_handle_rx(info);
-			cyz_handle_tx(info);
-		}
-		/* poll every 'cyz_polling_cycle' period */
-		expires = jiffies + cyz_polling_cycle;
-	}
-	mod_timer(&cyz_timerlist, expires);
-}				/* cyz_poll */
-
-#endif				/* CONFIG_CYZ_INTR */
-
-/********** End of block of Cyclades-Z specific code *********/
-/***********************************************************/
-
-/* This is called whenever a port becomes active;
-   interrupts are enabled and DTR & RTS are turned on.
- */
-static int cy_startup(struct cyclades_port *info, struct tty_struct *tty)
-{
-	struct cyclades_card *card;
-	unsigned long flags;
-	int retval = 0;
-	int channel;
-	unsigned long page;
-
-	card = info->card;
-	channel = info->line - card->first_line;
-
-	page = get_zeroed_page(GFP_KERNEL);
-	if (!page)
-		return -ENOMEM;
-
-	spin_lock_irqsave(&card->card_lock, flags);
-
-	if (tty_port_initialized(&info->port))
-		goto errout;
-
-	if (!info->type) {
-		set_bit(TTY_IO_ERROR, &tty->flags);
-		goto errout;
-	}
-
-	if (info->port.xmit_buf)
-		free_page(page);
-	else
-		info->port.xmit_buf = (unsigned char *)page;
-
-	spin_unlock_irqrestore(&card->card_lock, flags);
-
-	cy_set_line_char(info, tty);
-
-	if (!cy_is_Z(card)) {
-		channel &= 0x03;
-
-		spin_lock_irqsave(&card->card_lock, flags);
-
-		cyy_writeb(info, CyCAR, channel);
-
-		cyy_writeb(info, CyRTPR,
-			(info->default_timeout ? info->default_timeout : 0x02));
-		/* 10ms rx timeout */
-
-		cyy_issue_cmd(info, CyCHAN_CTL | CyENB_RCVR | CyENB_XMTR);
-
-		cyy_change_rts_dtr(info, TIOCM_RTS | TIOCM_DTR, 0);
-
-		cyy_writeb(info, CySRER, cyy_readb(info, CySRER) | CyRxData);
-	} else {
-		struct CH_CTRL __iomem *ch_ctrl = info->u.cyz.ch_ctrl;
-
-		if (!cyz_is_loaded(card))
-			return -ENODEV;
-
-#ifdef CY_DEBUG_OPEN
-		printk(KERN_DEBUG "cyc startup Z card %d, channel %d, "
-			"base_addr %p\n", card, channel, card->base_addr);
-#endif
-		spin_lock_irqsave(&card->card_lock, flags);
-
-		cy_writel(&ch_ctrl->op_mode, C_CH_ENABLE);
-#ifdef Z_WAKE
-#ifdef CONFIG_CYZ_INTR
-		cy_writel(&ch_ctrl->intr_enable,
-			  C_IN_TXBEMPTY | C_IN_TXLOWWM | C_IN_RXHIWM |
-			  C_IN_RXNNDT | C_IN_IOCTLW | C_IN_MDCD);
-#else
-		cy_writel(&ch_ctrl->intr_enable,
-			  C_IN_IOCTLW | C_IN_MDCD);
-#endif				/* CONFIG_CYZ_INTR */
-#else
-#ifdef CONFIG_CYZ_INTR
-		cy_writel(&ch_ctrl->intr_enable,
-			  C_IN_TXBEMPTY | C_IN_TXLOWWM | C_IN_RXHIWM |
-			  C_IN_RXNNDT | C_IN_MDCD);
-#else
-		cy_writel(&ch_ctrl->intr_enable, C_IN_MDCD);
-#endif				/* CONFIG_CYZ_INTR */
-#endif				/* Z_WAKE */
-
-		retval = cyz_issue_cmd(card, channel, C_CM_IOCTL, 0L);
-		if (retval != 0) {
-			printk(KERN_ERR "cyc:startup(1) retval on ttyC%d was "
-				"%x\n", info->line, retval);
-		}
-
-		/* Flush RX buffers before raising DTR and RTS */
-		retval = cyz_issue_cmd(card, channel, C_CM_FLUSH_RX, 0L);
-		if (retval != 0) {
-			printk(KERN_ERR "cyc:startup(2) retval on ttyC%d was "
-				"%x\n", info->line, retval);
-		}
-
-		/* set timeout !!! */
-		/* set RTS and DTR !!! */
-		tty_port_raise_dtr_rts(&info->port);
-
-		/* enable send, recv, modem !!! */
-	}
-
-	tty_port_set_initialized(&info->port, 1);
-
-	clear_bit(TTY_IO_ERROR, &tty->flags);
-	info->xmit_cnt = info->xmit_head = info->xmit_tail = 0;
-	info->breakon = info->breakoff = 0;
-	memset((char *)&info->idle_stats, 0, sizeof(info->idle_stats));
-	info->idle_stats.in_use =
-	info->idle_stats.recv_idle =
-	info->idle_stats.xmit_idle = jiffies;
-
-	spin_unlock_irqrestore(&card->card_lock, flags);
-
-#ifdef CY_DEBUG_OPEN
-	printk(KERN_DEBUG "cyc startup done\n");
-#endif
-	return 0;
-
-errout:
-	spin_unlock_irqrestore(&card->card_lock, flags);
-	free_page(page);
-	return retval;
-}				/* startup */
-
-static void start_xmit(struct cyclades_port *info)
-{
-	struct cyclades_card *card = info->card;
-	unsigned long flags;
-	int channel = info->line - card->first_line;
-
-	if (!cy_is_Z(card)) {
-		spin_lock_irqsave(&card->card_lock, flags);
-		cyy_writeb(info, CyCAR, channel & 0x03);
-		cyy_writeb(info, CySRER, cyy_readb(info, CySRER) | CyTxRdy);
-		spin_unlock_irqrestore(&card->card_lock, flags);
-	} else {
-#ifdef CONFIG_CYZ_INTR
-		int retval;
-
-		spin_lock_irqsave(&card->card_lock, flags);
-		retval = cyz_issue_cmd(card, channel, C_CM_INTBACK, 0L);
-		if (retval != 0) {
-			printk(KERN_ERR "cyc:start_xmit retval on ttyC%d was "
-				"%x\n", info->line, retval);
-		}
-		spin_unlock_irqrestore(&card->card_lock, flags);
-#else				/* CONFIG_CYZ_INTR */
-		/* Don't have to do anything at this time */
-#endif				/* CONFIG_CYZ_INTR */
-	}
-}				/* start_xmit */
-
-/*
- * This routine shuts down a serial port; interrupts are disabled,
- * and DTR is dropped if the hangup on close termio flag is on.
- */
-static void cy_shutdown(struct cyclades_port *info, struct tty_struct *tty)
-{
-	struct cyclades_card *card;
-	unsigned long flags;
-
-	if (!tty_port_initialized(&info->port))
-		return;
-
-	card = info->card;
-	if (!cy_is_Z(card)) {
-		spin_lock_irqsave(&card->card_lock, flags);
-
-		/* Clear delta_msr_wait queue to avoid mem leaks. */
-		wake_up_interruptible(&info->port.delta_msr_wait);
-
-		if (info->port.xmit_buf) {
-			unsigned char *temp;
-			temp = info->port.xmit_buf;
-			info->port.xmit_buf = NULL;
-			free_page((unsigned long)temp);
-		}
-		if (C_HUPCL(tty))
-			cyy_change_rts_dtr(info, 0, TIOCM_RTS | TIOCM_DTR);
-
-		cyy_issue_cmd(info, CyCHAN_CTL | CyDIS_RCVR);
-		/* it may be appropriate to clear _XMIT at
-		   some later date (after testing)!!! */
-
-		set_bit(TTY_IO_ERROR, &tty->flags);
-		tty_port_set_initialized(&info->port, 0);
-		spin_unlock_irqrestore(&card->card_lock, flags);
-	} else {
-#ifdef CY_DEBUG_OPEN
-		int channel = info->line - card->first_line;
-		printk(KERN_DEBUG "cyc shutdown Z card %d, channel %d, "
-			"base_addr %p\n", card, channel, card->base_addr);
-#endif
-
-		if (!cyz_is_loaded(card))
-			return;
-
-		spin_lock_irqsave(&card->card_lock, flags);
-
-		if (info->port.xmit_buf) {
-			unsigned char *temp;
-			temp = info->port.xmit_buf;
-			info->port.xmit_buf = NULL;
-			free_page((unsigned long)temp);
-		}
-
-		if (C_HUPCL(tty))
-			tty_port_lower_dtr_rts(&info->port);
-
-		set_bit(TTY_IO_ERROR, &tty->flags);
-		tty_port_set_initialized(&info->port, 0);
-
-		spin_unlock_irqrestore(&card->card_lock, flags);
-	}
-
-#ifdef CY_DEBUG_OPEN
-	printk(KERN_DEBUG "cyc shutdown done\n");
-#endif
-}				/* shutdown */
-
-/*
- * ------------------------------------------------------------
- * cy_open() and friends
- * ------------------------------------------------------------
- */
-
-/*
- * This routine is called whenever a serial port is opened.  It
- * performs the serial-specific initialization for the tty structure.
- */
-static int cy_open(struct tty_struct *tty, struct file *filp)
-{
-	struct cyclades_port *info;
-	unsigned int i, line = tty->index;
-	int retval;
-
-	for (i = 0; i < NR_CARDS; i++)
-		if (line < cy_card[i].first_line + cy_card[i].nports &&
-				line >= cy_card[i].first_line)
-			break;
-	if (i >= NR_CARDS)
-		return -ENODEV;
-	info = &cy_card[i].ports[line - cy_card[i].first_line];
-	if (info->line < 0)
-		return -ENODEV;
-
-	/* If the card's firmware hasn't been loaded,
-	   treat it as absent from the system.  This
-	   will make the user pay attention.
-	 */
-	if (cy_is_Z(info->card)) {
-		struct cyclades_card *cinfo = info->card;
-		struct FIRM_ID __iomem *firm_id = cinfo->base_addr + ID_ADDRESS;
-
-		if (!cyz_is_loaded(cinfo)) {
-			if (cinfo->hw_ver == ZE_V1 && cyz_fpga_loaded(cinfo) &&
-					readl(&firm_id->signature) ==
-					ZFIRM_HLT) {
-				printk(KERN_ERR "cyc:Cyclades-Z Error: you "
-					"need an external power supply for "
-					"this number of ports.\nFirmware "
-					"halted.\n");
-			} else {
-				printk(KERN_ERR "cyc:Cyclades-Z firmware not "
-					"yet loaded\n");
-			}
-			return -ENODEV;
-		}
-#ifdef CONFIG_CYZ_INTR
-		else {
-		/* In case this Z board is operating in interrupt mode, its
-		   interrupts should be enabled as soon as the first open
-		   happens to one of its ports. */
-			if (!cinfo->intr_enabled) {
-				u16 intr;
-
-				/* Enable interrupts on the PLX chip */
-				intr = readw(&cinfo->ctl_addr.p9060->
-						intr_ctrl_stat) | 0x0900;
-				cy_writew(&cinfo->ctl_addr.p9060->
-						intr_ctrl_stat, intr);
-				/* Enable interrupts on the FW */
-				retval = cyz_issue_cmd(cinfo, 0,
-						C_CM_IRQ_ENBL, 0L);
-				if (retval != 0) {
-					printk(KERN_ERR "cyc:IRQ enable retval "
-						"was %x\n", retval);
-				}
-				cinfo->intr_enabled = 1;
-			}
-		}
-#endif				/* CONFIG_CYZ_INTR */
-		/* Make sure this Z port really exists in hardware */
-		if (info->line > (cinfo->first_line + cinfo->nports - 1))
-			return -ENODEV;
-	}
-#ifdef CY_DEBUG_OTHER
-	printk(KERN_DEBUG "cyc:cy_open ttyC%d\n", info->line);
-#endif
-	tty->driver_data = info;
-	if (serial_paranoia_check(info, tty->name, "cy_open"))
-		return -ENODEV;
-
-#ifdef CY_DEBUG_OPEN
-	printk(KERN_DEBUG "cyc:cy_open ttyC%d, count = %d\n", info->line,
-			info->port.count);
-#endif
-	info->port.count++;
-#ifdef CY_DEBUG_COUNT
-	printk(KERN_DEBUG "cyc:cy_open (%d): incrementing count to %d\n",
-		current->pid, info->port.count);
-#endif
-
-	/*
-	 * Start up serial port
-	 */
-	retval = cy_startup(info, tty);
-	if (retval)
-		return retval;
-
-	retval = tty_port_block_til_ready(&info->port, tty, filp);
-	if (retval) {
-#ifdef CY_DEBUG_OPEN
-		printk(KERN_DEBUG "cyc:cy_open returning after block_til_ready "
-			"with %d\n", retval);
-#endif
-		return retval;
-	}
-
-	info->throttle = 0;
-	tty_port_tty_set(&info->port, tty);
-
-#ifdef CY_DEBUG_OPEN
-	printk(KERN_DEBUG "cyc:cy_open done\n");
-#endif
-	return 0;
-}				/* cy_open */
-
-/*
- * cy_wait_until_sent() --- wait until the transmitter is empty
- */
-static void cy_wait_until_sent(struct tty_struct *tty, int timeout)
-{
-	struct cyclades_card *card;
-	struct cyclades_port *info = tty->driver_data;
-	unsigned long orig_jiffies;
-	int char_time;
-
-	if (serial_paranoia_check(info, tty->name, "cy_wait_until_sent"))
-		return;
-
-	if (info->xmit_fifo_size == 0)
-		return;		/* Just in case.... */
-
-	orig_jiffies = jiffies;
-	/*
-	 * Set the check interval to be 1/5 of the estimated time to
-	 * send a single character, and make it at least 1.  The check
-	 * interval should also be less than the timeout.
-	 *
-	 * Note: we have to use pretty tight timings here to satisfy
-	 * the NIST-PCTS.
-	 */
-	char_time = (info->timeout - HZ / 50) / info->xmit_fifo_size;
-	char_time = char_time / 5;
-	if (char_time <= 0)
-		char_time = 1;
-	if (timeout < 0)
-		timeout = 0;
-	if (timeout)
-		char_time = min(char_time, timeout);
-	/*
-	 * If the transmitter hasn't cleared in twice the approximate
-	 * amount of time to send the entire FIFO, it probably won't
-	 * ever clear.  This assumes the UART isn't doing flow
-	 * control, which is currently the case.  Hence, if it ever
-	 * takes longer than info->timeout, this is probably due to a
-	 * UART bug of some kind.  So, we clamp the timeout parameter at
-	 * 2*info->timeout.
-	 */
-	if (!timeout || timeout > 2 * info->timeout)
-		timeout = 2 * info->timeout;
-
-	card = info->card;
-	if (!cy_is_Z(card)) {
-		while (cyy_readb(info, CySRER) & CyTxRdy) {
-			if (msleep_interruptible(jiffies_to_msecs(char_time)))
-				break;
-			if (timeout && time_after(jiffies, orig_jiffies +
-					timeout))
-				break;
-		}
-	}
-	/* Run one more char cycle */
-	msleep_interruptible(jiffies_to_msecs(char_time * 5));
-}
-
-static void cy_flush_buffer(struct tty_struct *tty)
-{
-	struct cyclades_port *info = tty->driver_data;
-	struct cyclades_card *card;
-	int channel, retval;
-	unsigned long flags;
-
-#ifdef CY_DEBUG_IO
-	printk(KERN_DEBUG "cyc:cy_flush_buffer ttyC%d\n", info->line);
-#endif
-
-	if (serial_paranoia_check(info, tty->name, "cy_flush_buffer"))
-		return;
-
-	card = info->card;
-	channel = info->line - card->first_line;
-
-	spin_lock_irqsave(&card->card_lock, flags);
-	info->xmit_cnt = info->xmit_head = info->xmit_tail = 0;
-	spin_unlock_irqrestore(&card->card_lock, flags);
-
-	if (cy_is_Z(card)) {	/* If it is a Z card, flush the on-board
-					   buffers as well */
-		spin_lock_irqsave(&card->card_lock, flags);
-		retval = cyz_issue_cmd(card, channel, C_CM_FLUSH_TX, 0L);
-		if (retval != 0) {
-			printk(KERN_ERR "cyc: flush_buffer retval on ttyC%d "
-				"was %x\n", info->line, retval);
-		}
-		spin_unlock_irqrestore(&card->card_lock, flags);
-	}
-	tty_wakeup(tty);
-}				/* cy_flush_buffer */
-
-
-static void cy_do_close(struct tty_port *port)
-{
-	struct cyclades_port *info = container_of(port, struct cyclades_port,
-								port);
-	struct cyclades_card *card;
-	unsigned long flags;
-	int channel;
-
-	card = info->card;
-	channel = info->line - card->first_line;
-	spin_lock_irqsave(&card->card_lock, flags);
-
-	if (!cy_is_Z(card)) {
-		/* Stop accepting input */
-		cyy_writeb(info, CyCAR, channel & 0x03);
-		cyy_writeb(info, CySRER, cyy_readb(info, CySRER) & ~CyRxData);
-		if (tty_port_initialized(&info->port)) {
-			/* Waiting for on-board buffers to be empty before
-			   closing the port */
-			spin_unlock_irqrestore(&card->card_lock, flags);
-			cy_wait_until_sent(port->tty, info->timeout);
-			spin_lock_irqsave(&card->card_lock, flags);
-		}
-	} else {
-#ifdef Z_WAKE
-		/* Waiting for on-board buffers to be empty before closing
-		   the port */
-		struct CH_CTRL __iomem *ch_ctrl = info->u.cyz.ch_ctrl;
-		int retval;
-
-		if (readl(&ch_ctrl->flow_status) != C_FS_TXIDLE) {
-			retval = cyz_issue_cmd(card, channel, C_CM_IOCTLW, 0L);
-			if (retval != 0) {
-				printk(KERN_DEBUG "cyc:cy_close retval on "
-					"ttyC%d was %x\n", info->line, retval);
-			}
-			spin_unlock_irqrestore(&card->card_lock, flags);
-			wait_for_completion_interruptible(&info->shutdown_wait);
-			spin_lock_irqsave(&card->card_lock, flags);
-		}
-#endif
-	}
-	spin_unlock_irqrestore(&card->card_lock, flags);
-	cy_shutdown(info, port->tty);
-}
-
-/*
- * This routine is called when a particular tty device is closed.
- */
-static void cy_close(struct tty_struct *tty, struct file *filp)
-{
-	struct cyclades_port *info = tty->driver_data;
-	if (!info || serial_paranoia_check(info, tty->name, "cy_close"))
-		return;
-	tty_port_close(&info->port, tty, filp);
-}				/* cy_close */
-
-/* This routine gets called when tty_write has put something into
- * the write_queue.  The characters may come from user space or
- * kernel space.
- *
- * This routine will return the number of characters actually
- * accepted for writing.
- *
- * If the port is not already transmitting stuff, start it off by
- * enabling interrupts.  The interrupt service routine will then
- * ensure that the characters are sent.
- * If the port is already active, there is no need to kick it.
- *
- */
-static int cy_write(struct tty_struct *tty, const unsigned char *buf, int count)
-{
-	struct cyclades_port *info = tty->driver_data;
-	unsigned long flags;
-	int c, ret = 0;
-
-#ifdef CY_DEBUG_IO
-	printk(KERN_DEBUG "cyc:cy_write ttyC%d\n", info->line);
-#endif
-
-	if (serial_paranoia_check(info, tty->name, "cy_write"))
-		return 0;
-
-	if (!info->port.xmit_buf)
-		return 0;
-
-	spin_lock_irqsave(&info->card->card_lock, flags);
-	while (1) {
-		c = min(count, (int)(SERIAL_XMIT_SIZE - info->xmit_cnt - 1));
-		c = min(c, (int)(SERIAL_XMIT_SIZE - info->xmit_head));
-
-		if (c <= 0)
-			break;
-
-		memcpy(info->port.xmit_buf + info->xmit_head, buf, c);
-		info->xmit_head = (info->xmit_head + c) &
-			(SERIAL_XMIT_SIZE - 1);
-		info->xmit_cnt += c;
-		buf += c;
-		count -= c;
-		ret += c;
-	}
-	spin_unlock_irqrestore(&info->card->card_lock, flags);
-
-	info->idle_stats.xmit_bytes += ret;
-	info->idle_stats.xmit_idle = jiffies;
-
-	if (info->xmit_cnt && !tty->stopped && !tty->hw_stopped)
-		start_xmit(info);
-
-	return ret;
-}				/* cy_write */
-
-/*
- * This routine is called by the kernel to write a single
- * character to the tty device.  If the kernel uses this routine,
- * it must call the flush_chars() routine (if defined) when it is
- * done stuffing characters into the driver.  If there is no room
- * in the queue, the character is ignored.
- */
-static int cy_put_char(struct tty_struct *tty, unsigned char ch)
-{
-	struct cyclades_port *info = tty->driver_data;
-	unsigned long flags;
-
-#ifdef CY_DEBUG_IO
-	printk(KERN_DEBUG "cyc:cy_put_char ttyC%d\n", info->line);
-#endif
-
-	if (serial_paranoia_check(info, tty->name, "cy_put_char"))
-		return 0;
-
-	if (!info->port.xmit_buf)
-		return 0;
-
-	spin_lock_irqsave(&info->card->card_lock, flags);
-	if (info->xmit_cnt >= (int)(SERIAL_XMIT_SIZE - 1)) {
-		spin_unlock_irqrestore(&info->card->card_lock, flags);
-		return 0;
-	}
-
-	info->port.xmit_buf[info->xmit_head++] = ch;
-	info->xmit_head &= SERIAL_XMIT_SIZE - 1;
-	info->xmit_cnt++;
-	info->idle_stats.xmit_bytes++;
-	info->idle_stats.xmit_idle = jiffies;
-	spin_unlock_irqrestore(&info->card->card_lock, flags);
-	return 1;
-}				/* cy_put_char */
-
-/*
- * This routine is called by the kernel after it has written a
- * series of characters to the tty device using put_char().
- */
-static void cy_flush_chars(struct tty_struct *tty)
-{
-	struct cyclades_port *info = tty->driver_data;
-
-#ifdef CY_DEBUG_IO
-	printk(KERN_DEBUG "cyc:cy_flush_chars ttyC%d\n", info->line);
-#endif
-
-	if (serial_paranoia_check(info, tty->name, "cy_flush_chars"))
-		return;
-
-	if (info->xmit_cnt <= 0 || tty->stopped || tty->hw_stopped ||
-			!info->port.xmit_buf)
-		return;
-
-	start_xmit(info);
-}				/* cy_flush_chars */
-
-/*
- * This routine returns the numbers of characters the tty driver
- * will accept for queuing to be written.  This number is subject
- * to change as output buffers get emptied, or if the output flow
- * control is activated.
- */
-static int cy_write_room(struct tty_struct *tty)
-{
-	struct cyclades_port *info = tty->driver_data;
-	int ret;
-
-#ifdef CY_DEBUG_IO
-	printk(KERN_DEBUG "cyc:cy_write_room ttyC%d\n", info->line);
-#endif
-
-	if (serial_paranoia_check(info, tty->name, "cy_write_room"))
-		return 0;
-	ret = SERIAL_XMIT_SIZE - info->xmit_cnt - 1;
-	if (ret < 0)
-		ret = 0;
-	return ret;
-}				/* cy_write_room */
-
-static int cy_chars_in_buffer(struct tty_struct *tty)
-{
-	struct cyclades_port *info = tty->driver_data;
-
-	if (serial_paranoia_check(info, tty->name, "cy_chars_in_buffer"))
-		return 0;
-
-#ifdef Z_EXT_CHARS_IN_BUFFER
-	if (!cy_is_Z(info->card)) {
-#endif				/* Z_EXT_CHARS_IN_BUFFER */
-#ifdef CY_DEBUG_IO
-		printk(KERN_DEBUG "cyc:cy_chars_in_buffer ttyC%d %d\n",
-			info->line, info->xmit_cnt);
-#endif
-		return info->xmit_cnt;
-#ifdef Z_EXT_CHARS_IN_BUFFER
-	} else {
-		struct BUF_CTRL __iomem *buf_ctrl = info->u.cyz.buf_ctrl;
-		int char_count;
-		__u32 tx_put, tx_get, tx_bufsize;
-
-		tx_get = readl(&buf_ctrl->tx_get);
-		tx_put = readl(&buf_ctrl->tx_put);
-		tx_bufsize = readl(&buf_ctrl->tx_bufsize);
-		if (tx_put >= tx_get)
-			char_count = tx_put - tx_get;
-		else
-			char_count = tx_put - tx_get + tx_bufsize;
-#ifdef CY_DEBUG_IO
-		printk(KERN_DEBUG "cyc:cy_chars_in_buffer ttyC%d %d\n",
-			info->line, info->xmit_cnt + char_count);
-#endif
-		return info->xmit_cnt + char_count;
-	}
-#endif				/* Z_EXT_CHARS_IN_BUFFER */
-}				/* cy_chars_in_buffer */
-
-/*
- * ------------------------------------------------------------
- * cy_ioctl() and friends
- * ------------------------------------------------------------
- */
-
-static void cyy_baud_calc(struct cyclades_port *info, __u32 baud)
-{
-	int co, co_val, bpr;
-	__u32 cy_clock = ((info->chip_rev >= CD1400_REV_J) ? 60000000 :
-			25000000);
-
-	if (baud == 0) {
-		info->tbpr = info->tco = info->rbpr = info->rco = 0;
-		return;
-	}
-
-	/* determine which prescaler to use */
-	for (co = 4, co_val = 2048; co; co--, co_val >>= 2) {
-		if (cy_clock / co_val / baud > 63)
-			break;
-	}
-
-	bpr = (cy_clock / co_val * 2 / baud + 1) / 2;
-	if (bpr > 255)
-		bpr = 255;
-
-	info->tbpr = info->rbpr = bpr;
-	info->tco = info->rco = co;
-}
-
-/*
- * This routine finds or computes the various line characteristics.
- * It used to be called config_setup
- */
-static void cy_set_line_char(struct cyclades_port *info, struct tty_struct *tty)
-{
-	struct cyclades_card *card;
-	unsigned long flags;
-	int channel;
-	unsigned cflag, iflag;
-	int baud, baud_rate = 0;
-	int i;
-
-	if (info->line == -1)
-		return;
-
-	cflag = tty->termios.c_cflag;
-	iflag = tty->termios.c_iflag;
-
-	card = info->card;
-	channel = info->line - card->first_line;
-
-	if (!cy_is_Z(card)) {
-		u32 cflags;
-
-		/* baud rate */
-		baud = tty_get_baud_rate(tty);
-		if (baud == 38400 && (info->port.flags & ASYNC_SPD_MASK) ==
-				ASYNC_SPD_CUST) {
-			if (info->custom_divisor)
-				baud_rate = info->baud / info->custom_divisor;
-			else
-				baud_rate = info->baud;
-		} else if (baud > CD1400_MAX_SPEED) {
-			baud = CD1400_MAX_SPEED;
-		}
-		/* find the baud index */
-		for (i = 0; i < 20; i++) {
-			if (baud == baud_table[i])
-				break;
-		}
-		if (i == 20)
-			i = 19;	/* CD1400_MAX_SPEED */
-
-		if (baud == 38400 && (info->port.flags & ASYNC_SPD_MASK) ==
-				ASYNC_SPD_CUST) {
-			cyy_baud_calc(info, baud_rate);
-		} else {
-			if (info->chip_rev >= CD1400_REV_J) {
-				/* It is a CD1400 rev. J or later */
-				info->tbpr = baud_bpr_60[i];	/* Tx BPR */
-				info->tco = baud_co_60[i];	/* Tx CO */
-				info->rbpr = baud_bpr_60[i];	/* Rx BPR */
-				info->rco = baud_co_60[i];	/* Rx CO */
-			} else {
-				info->tbpr = baud_bpr_25[i];	/* Tx BPR */
-				info->tco = baud_co_25[i];	/* Tx CO */
-				info->rbpr = baud_bpr_25[i];	/* Rx BPR */
-				info->rco = baud_co_25[i];	/* Rx CO */
-			}
-		}
-		if (baud_table[i] == 134) {
-			/* get it right for 134.5 baud */
-			info->timeout = (info->xmit_fifo_size * HZ * 30 / 269) +
-					2;
-		} else if (baud == 38400 && (info->port.flags & ASYNC_SPD_MASK) ==
-				ASYNC_SPD_CUST) {
-			info->timeout = (info->xmit_fifo_size * HZ * 15 /
-					baud_rate) + 2;
-		} else if (baud_table[i]) {
-			info->timeout = (info->xmit_fifo_size * HZ * 15 /
-					baud_table[i]) + 2;
-			/* this needs to be propagated into the card info */
-		} else {
-			info->timeout = 0;
-		}
-		/* By tradition (is it a standard?) a baud rate of zero
-		   implies the line should be/has been closed.  A bit
-		   later in this routine such a test is performed. */
-
-		/* byte size and parity */
-		info->cor5 = 0;
-		info->cor4 = 0;
-		/* receive threshold */
-		info->cor3 = (info->default_threshold ?
-				info->default_threshold : baud_cor3[i]);
-		info->cor2 = CyETC;
-		switch (cflag & CSIZE) {
-		case CS5:
-			info->cor1 = Cy_5_BITS;
-			break;
-		case CS6:
-			info->cor1 = Cy_6_BITS;
-			break;
-		case CS7:
-			info->cor1 = Cy_7_BITS;
-			break;
-		case CS8:
-			info->cor1 = Cy_8_BITS;
-			break;
-		}
-		if (cflag & CSTOPB)
-			info->cor1 |= Cy_2_STOP;
-
-		if (cflag & PARENB) {
-			if (cflag & PARODD)
-				info->cor1 |= CyPARITY_O;
-			else
-				info->cor1 |= CyPARITY_E;
-		} else
-			info->cor1 |= CyPARITY_NONE;
-
-		/* CTS flow control flag */
-		tty_port_set_cts_flow(&info->port, cflag & CRTSCTS);
-		if (cflag & CRTSCTS)
-			info->cor2 |= CyCtsAE;
-		else
-			info->cor2 &= ~CyCtsAE;
-		tty_port_set_check_carrier(&info->port, ~cflag & CLOCAL);
-
-	 /***********************************************
-	    The hardware option, CyRtsAO, presents RTS when
-	    the chip has characters to send.  Since most modems
-	    use RTS as reverse (inbound) flow control, this
-	    option is not used.  If inbound flow control is
-	    necessary, DTR can be programmed to provide the
-	    appropriate signals for use with a non-standard
-	    cable.  Contact Marcio Saito for details.
-	 ***********************************************/
-
-		channel &= 0x03;
-
-		spin_lock_irqsave(&card->card_lock, flags);
-		cyy_writeb(info, CyCAR, channel);
-
-		/* tx and rx baud rate */
-
-		cyy_writeb(info, CyTCOR, info->tco);
-		cyy_writeb(info, CyTBPR, info->tbpr);
-		cyy_writeb(info, CyRCOR, info->rco);
-		cyy_writeb(info, CyRBPR, info->rbpr);
-
-		/* set line characteristics  according configuration */
-
-		cyy_writeb(info, CySCHR1, START_CHAR(tty));
-		cyy_writeb(info, CySCHR2, STOP_CHAR(tty));
-		cyy_writeb(info, CyCOR1, info->cor1);
-		cyy_writeb(info, CyCOR2, info->cor2);
-		cyy_writeb(info, CyCOR3, info->cor3);
-		cyy_writeb(info, CyCOR4, info->cor4);
-		cyy_writeb(info, CyCOR5, info->cor5);
-
-		cyy_issue_cmd(info, CyCOR_CHANGE | CyCOR1ch | CyCOR2ch |
-				CyCOR3ch);
-
-		/* !!! Is this needed? */
-		cyy_writeb(info, CyCAR, channel);
-		cyy_writeb(info, CyRTPR,
-			(info->default_timeout ? info->default_timeout : 0x02));
-		/* 10ms rx timeout */
-
-		cflags = CyCTS;
-		if (!C_CLOCAL(tty))
-			cflags |= CyDSR | CyRI | CyDCD;
-		/* without modem intr */
-		cyy_writeb(info, CySRER, cyy_readb(info, CySRER) | CyMdmCh);
-		/* act on 1->0 modem transitions */
-		if ((cflag & CRTSCTS) && info->rflow)
-			cyy_writeb(info, CyMCOR1, cflags | rflow_thr[i]);
-		else
-			cyy_writeb(info, CyMCOR1, cflags);
-		/* act on 0->1 modem transitions */
-		cyy_writeb(info, CyMCOR2, cflags);
-
-		if (i == 0)	/* baud rate is zero, turn off line */
-			cyy_change_rts_dtr(info, 0, TIOCM_DTR);
-		else
-			cyy_change_rts_dtr(info, TIOCM_DTR, 0);
-
-		clear_bit(TTY_IO_ERROR, &tty->flags);
-		spin_unlock_irqrestore(&card->card_lock, flags);
-
-	} else {
-		struct CH_CTRL __iomem *ch_ctrl = info->u.cyz.ch_ctrl;
-		__u32 sw_flow;
-		int retval;
-
-		if (!cyz_is_loaded(card))
-			return;
-
-		/* baud rate */
-		baud = tty_get_baud_rate(tty);
-		if (baud == 38400 && (info->port.flags & ASYNC_SPD_MASK) ==
-				ASYNC_SPD_CUST) {
-			if (info->custom_divisor)
-				baud_rate = info->baud / info->custom_divisor;
-			else
-				baud_rate = info->baud;
-		} else if (baud > CYZ_MAX_SPEED) {
-			baud = CYZ_MAX_SPEED;
-		}
-		cy_writel(&ch_ctrl->comm_baud, baud);
-
-		if (baud == 134) {
-			/* get it right for 134.5 baud */
-			info->timeout = (info->xmit_fifo_size * HZ * 30 / 269) +
-					2;
-		} else if (baud == 38400 && (info->port.flags & ASYNC_SPD_MASK) ==
-				ASYNC_SPD_CUST) {
-			info->timeout = (info->xmit_fifo_size * HZ * 15 /
-					baud_rate) + 2;
-		} else if (baud) {
-			info->timeout = (info->xmit_fifo_size * HZ * 15 /
-					baud) + 2;
-			/* this needs to be propagated into the card info */
-		} else {
-			info->timeout = 0;
-		}
-
-		/* byte size and parity */
-		switch (cflag & CSIZE) {
-		case CS5:
-			cy_writel(&ch_ctrl->comm_data_l, C_DL_CS5);
-			break;
-		case CS6:
-			cy_writel(&ch_ctrl->comm_data_l, C_DL_CS6);
-			break;
-		case CS7:
-			cy_writel(&ch_ctrl->comm_data_l, C_DL_CS7);
-			break;
-		case CS8:
-			cy_writel(&ch_ctrl->comm_data_l, C_DL_CS8);
-			break;
-		}
-		if (cflag & CSTOPB) {
-			cy_writel(&ch_ctrl->comm_data_l,
-				  readl(&ch_ctrl->comm_data_l) | C_DL_2STOP);
-		} else {
-			cy_writel(&ch_ctrl->comm_data_l,
-				  readl(&ch_ctrl->comm_data_l) | C_DL_1STOP);
-		}
-		if (cflag & PARENB) {
-			if (cflag & PARODD)
-				cy_writel(&ch_ctrl->comm_parity, C_PR_ODD);
-			else
-				cy_writel(&ch_ctrl->comm_parity, C_PR_EVEN);
-		} else
-			cy_writel(&ch_ctrl->comm_parity, C_PR_NONE);
-
-		/* CTS flow control flag */
-		if (cflag & CRTSCTS) {
-			cy_writel(&ch_ctrl->hw_flow,
-				readl(&ch_ctrl->hw_flow) | C_RS_CTS | C_RS_RTS);
-		} else {
-			cy_writel(&ch_ctrl->hw_flow, readl(&ch_ctrl->hw_flow) &
-					~(C_RS_CTS | C_RS_RTS));
-		}
-		/* As the HW flow control is done in firmware, the driver
-		   doesn't need to care about it */
-		tty_port_set_cts_flow(&info->port, 0);
-
-		/* XON/XOFF/XANY flow control flags */
-		sw_flow = 0;
-		if (iflag & IXON) {
-			sw_flow |= C_FL_OXX;
-			if (iflag & IXANY)
-				sw_flow |= C_FL_OIXANY;
-		}
-		cy_writel(&ch_ctrl->sw_flow, sw_flow);
-
-		retval = cyz_issue_cmd(card, channel, C_CM_IOCTL, 0L);
-		if (retval != 0) {
-			printk(KERN_ERR "cyc:set_line_char retval on ttyC%d "
-				"was %x\n", info->line, retval);
-		}
-
-		/* CD sensitivity */
-		tty_port_set_check_carrier(&info->port, ~cflag & CLOCAL);
-
-		if (baud == 0) {	/* baud rate is zero, turn off line */
-			cy_writel(&ch_ctrl->rs_control,
-				  readl(&ch_ctrl->rs_control) & ~C_RS_DTR);
-#ifdef CY_DEBUG_DTR
-			printk(KERN_DEBUG "cyc:set_line_char dropping Z DTR\n");
-#endif
-		} else {
-			cy_writel(&ch_ctrl->rs_control,
-				  readl(&ch_ctrl->rs_control) | C_RS_DTR);
-#ifdef CY_DEBUG_DTR
-			printk(KERN_DEBUG "cyc:set_line_char raising Z DTR\n");
-#endif
-		}
-
-		retval = cyz_issue_cmd(card, channel, C_CM_IOCTLM, 0L);
-		if (retval != 0) {
-			printk(KERN_ERR "cyc:set_line_char(2) retval on ttyC%d "
-				"was %x\n", info->line, retval);
-		}
-
-		clear_bit(TTY_IO_ERROR, &tty->flags);
-	}
-}				/* set_line_char */
-
-static int cy_get_serial_info(struct tty_struct *tty,
-				struct serial_struct *ss)
-{
-	struct cyclades_port *info = tty->driver_data;
-	struct cyclades_card *cinfo = info->card;
-
-	if (serial_paranoia_check(info, tty->name, "cy_ioctl"))
-		return -ENODEV;
-	ss->type = info->type;
-	ss->line = info->line;
-	ss->port = (info->card - cy_card) * 0x100 + info->line -
-			cinfo->first_line;
-	ss->irq = cinfo->irq;
-	ss->flags = info->port.flags;
-	ss->close_delay = info->port.close_delay;
-	ss->closing_wait = info->port.closing_wait;
-	ss->baud_base = info->baud;
-	ss->custom_divisor = info->custom_divisor;
-	return 0;
-}
-
-static int cy_set_serial_info(struct tty_struct *tty,
-				struct serial_struct *ss)
-{
-	struct cyclades_port *info = tty->driver_data;
-	int old_flags;
-	int ret;
-
-	if (serial_paranoia_check(info, tty->name, "cy_ioctl"))
-		return -ENODEV;
-
-	mutex_lock(&info->port.mutex);
-
-	old_flags = info->port.flags;
-
-	if (!capable(CAP_SYS_ADMIN)) {
-		if (ss->close_delay != info->port.close_delay ||
-				ss->baud_base != info->baud ||
-				(ss->flags & ASYNC_FLAGS &
-					~ASYNC_USR_MASK) !=
-				(info->port.flags & ASYNC_FLAGS & ~ASYNC_USR_MASK))
-		{
-			mutex_unlock(&info->port.mutex);
-			return -EPERM;
-		}
-		info->port.flags = (info->port.flags & ~ASYNC_USR_MASK) |
-				(ss->flags & ASYNC_USR_MASK);
-		info->baud = ss->baud_base;
-		info->custom_divisor = ss->custom_divisor;
-		goto check_and_exit;
-	}
-
-	/*
-	 * OK, past this point, all the error checking has been done.
-	 * At this point, we start making changes.....
-	 */
-
-	info->baud = ss->baud_base;
-	info->custom_divisor = ss->custom_divisor;
-	info->port.flags = (info->port.flags & ~ASYNC_FLAGS) |
-			(ss->flags & ASYNC_FLAGS);
-	info->port.close_delay = ss->close_delay * HZ / 100;
-	info->port.closing_wait = ss->closing_wait * HZ / 100;
-
-check_and_exit:
-	if (tty_port_initialized(&info->port)) {
-		if ((ss->flags ^ old_flags) & ASYNC_SPD_MASK) {
-			/* warn about deprecation unless clearing */
-			if (ss->flags & ASYNC_SPD_MASK)
-				dev_warn_ratelimited(tty->dev, "use of SPD flags is deprecated\n");
-		}
-		cy_set_line_char(info, tty);
-		ret = 0;
-	} else {
-		ret = cy_startup(info, tty);
-	}
-	mutex_unlock(&info->port.mutex);
-	return ret;
-}				/* set_serial_info */
-
-/*
- * get_lsr_info - get line status register info
- *
- * Purpose: Let user call ioctl() to get info when the UART physically
- *	    is emptied.  On bus types like RS485, the transmitter must
- *	    release the bus after transmitting. This must be done when
- *	    the transmit shift register is empty, not be done when the
- *	    transmit holding register is empty.  This functionality
- *	    allows an RS485 driver to be written in user space.
- */
-static int get_lsr_info(struct cyclades_port *info, unsigned int __user *value)
-{
-	struct cyclades_card *card = info->card;
-	unsigned int result;
-	unsigned long flags;
-	u8 status;
-
-	if (!cy_is_Z(card)) {
-		spin_lock_irqsave(&card->card_lock, flags);
-		status = cyy_readb(info, CySRER) & (CyTxRdy | CyTxMpty);
-		spin_unlock_irqrestore(&card->card_lock, flags);
-		result = (status ? 0 : TIOCSER_TEMT);
-	} else {
-		/* Not supported yet */
-		return -EINVAL;
-	}
-	return put_user(result, value);
-}
-
-static int cy_tiocmget(struct tty_struct *tty)
-{
-	struct cyclades_port *info = tty->driver_data;
-	struct cyclades_card *card;
-	int result;
-
-	if (serial_paranoia_check(info, tty->name, __func__))
-		return -ENODEV;
-
-	card = info->card;
-
-	if (!cy_is_Z(card)) {
-		unsigned long flags;
-		int channel = info->line - card->first_line;
-		u8 status;
-
-		spin_lock_irqsave(&card->card_lock, flags);
-		cyy_writeb(info, CyCAR, channel & 0x03);
-		status = cyy_readb(info, CyMSVR1);
-		status |= cyy_readb(info, CyMSVR2);
-		spin_unlock_irqrestore(&card->card_lock, flags);
-
-		if (info->rtsdtr_inv) {
-			result = ((status & CyRTS) ? TIOCM_DTR : 0) |
-				((status & CyDTR) ? TIOCM_RTS : 0);
-		} else {
-			result = ((status & CyRTS) ? TIOCM_RTS : 0) |
-				((status & CyDTR) ? TIOCM_DTR : 0);
-		}
-		result |= ((status & CyDCD) ? TIOCM_CAR : 0) |
-			((status & CyRI) ? TIOCM_RNG : 0) |
-			((status & CyDSR) ? TIOCM_DSR : 0) |
-			((status & CyCTS) ? TIOCM_CTS : 0);
-	} else {
-		u32 lstatus;
-
-		if (!cyz_is_loaded(card)) {
-			result = -ENODEV;
-			goto end;
-		}
-
-		lstatus = readl(&info->u.cyz.ch_ctrl->rs_status);
-		result = ((lstatus & C_RS_RTS) ? TIOCM_RTS : 0) |
-			((lstatus & C_RS_DTR) ? TIOCM_DTR : 0) |
-			((lstatus & C_RS_DCD) ? TIOCM_CAR : 0) |
-			((lstatus & C_RS_RI) ? TIOCM_RNG : 0) |
-			((lstatus & C_RS_DSR) ? TIOCM_DSR : 0) |
-			((lstatus & C_RS_CTS) ? TIOCM_CTS : 0);
-	}
-end:
-	return result;
-}				/* cy_tiomget */
-
-static int
-cy_tiocmset(struct tty_struct *tty,
-		unsigned int set, unsigned int clear)
-{
-	struct cyclades_port *info = tty->driver_data;
-	struct cyclades_card *card;
-	unsigned long flags;
-
-	if (serial_paranoia_check(info, tty->name, __func__))
-		return -ENODEV;
-
-	card = info->card;
-	if (!cy_is_Z(card)) {
-		spin_lock_irqsave(&card->card_lock, flags);
-		cyy_change_rts_dtr(info, set, clear);
-		spin_unlock_irqrestore(&card->card_lock, flags);
-	} else {
-		struct CH_CTRL __iomem *ch_ctrl = info->u.cyz.ch_ctrl;
-		int retval, channel = info->line - card->first_line;
-		u32 rs;
-
-		if (!cyz_is_loaded(card))
-			return -ENODEV;
-
-		spin_lock_irqsave(&card->card_lock, flags);
-		rs = readl(&ch_ctrl->rs_control);
-		if (set & TIOCM_RTS)
-			rs |= C_RS_RTS;
-		if (clear & TIOCM_RTS)
-			rs &= ~C_RS_RTS;
-		if (set & TIOCM_DTR) {
-			rs |= C_RS_DTR;
-#ifdef CY_DEBUG_DTR
-			printk(KERN_DEBUG "cyc:set_modem_info raising Z DTR\n");
-#endif
-		}
-		if (clear & TIOCM_DTR) {
-			rs &= ~C_RS_DTR;
-#ifdef CY_DEBUG_DTR
-			printk(KERN_DEBUG "cyc:set_modem_info clearing "
-				"Z DTR\n");
-#endif
-		}
-		cy_writel(&ch_ctrl->rs_control, rs);
-		retval = cyz_issue_cmd(card, channel, C_CM_IOCTLM, 0L);
-		spin_unlock_irqrestore(&card->card_lock, flags);
-		if (retval != 0) {
-			printk(KERN_ERR "cyc:set_modem_info retval on ttyC%d "
-				"was %x\n", info->line, retval);
-		}
-	}
-	return 0;
-}
-
-/*
- * cy_break() --- routine which turns the break handling on or off
- */
-static int cy_break(struct tty_struct *tty, int break_state)
-{
-	struct cyclades_port *info = tty->driver_data;
-	struct cyclades_card *card;
-	unsigned long flags;
-	int retval = 0;
-
-	if (serial_paranoia_check(info, tty->name, "cy_break"))
-		return -EINVAL;
-
-	card = info->card;
-
-	spin_lock_irqsave(&card->card_lock, flags);
-	if (!cy_is_Z(card)) {
-		/* Let the transmit ISR take care of this (since it
-		   requires stuffing characters into the output stream).
-		 */
-		if (break_state == -1) {
-			if (!info->breakon) {
-				info->breakon = 1;
-				if (!info->xmit_cnt) {
-					spin_unlock_irqrestore(&card->card_lock, flags);
-					start_xmit(info);
-					spin_lock_irqsave(&card->card_lock, flags);
-				}
-			}
-		} else {
-			if (!info->breakoff) {
-				info->breakoff = 1;
-				if (!info->xmit_cnt) {
-					spin_unlock_irqrestore(&card->card_lock, flags);
-					start_xmit(info);
-					spin_lock_irqsave(&card->card_lock, flags);
-				}
-			}
-		}
-	} else {
-		if (break_state == -1) {
-			retval = cyz_issue_cmd(card,
-				info->line - card->first_line,
-				C_CM_SET_BREAK, 0L);
-			if (retval != 0) {
-				printk(KERN_ERR "cyc:cy_break (set) retval on "
-					"ttyC%d was %x\n", info->line, retval);
-			}
-		} else {
-			retval = cyz_issue_cmd(card,
-				info->line - card->first_line,
-				C_CM_CLR_BREAK, 0L);
-			if (retval != 0) {
-				printk(KERN_DEBUG "cyc:cy_break (clr) retval "
-					"on ttyC%d was %x\n", info->line,
-					retval);
-			}
-		}
-	}
-	spin_unlock_irqrestore(&card->card_lock, flags);
-	return retval;
-}				/* cy_break */
-
-static int set_threshold(struct cyclades_port *info, unsigned long value)
-{
-	struct cyclades_card *card = info->card;
-	unsigned long flags;
-
-	if (!cy_is_Z(card)) {
-		info->cor3 &= ~CyREC_FIFO;
-		info->cor3 |= value & CyREC_FIFO;
-
-		spin_lock_irqsave(&card->card_lock, flags);
-		cyy_writeb(info, CyCOR3, info->cor3);
-		cyy_issue_cmd(info, CyCOR_CHANGE | CyCOR3ch);
-		spin_unlock_irqrestore(&card->card_lock, flags);
-	}
-	return 0;
-}				/* set_threshold */
-
-static int get_threshold(struct cyclades_port *info,
-						unsigned long __user *value)
-{
-	struct cyclades_card *card = info->card;
-
-	if (!cy_is_Z(card)) {
-		u8 tmp = cyy_readb(info, CyCOR3) & CyREC_FIFO;
-		return put_user(tmp, value);
-	}
-	return 0;
-}				/* get_threshold */
-
-static int set_timeout(struct cyclades_port *info, unsigned long value)
-{
-	struct cyclades_card *card = info->card;
-	unsigned long flags;
-
-	if (!cy_is_Z(card)) {
-		spin_lock_irqsave(&card->card_lock, flags);
-		cyy_writeb(info, CyRTPR, value & 0xff);
-		spin_unlock_irqrestore(&card->card_lock, flags);
-	}
-	return 0;
-}				/* set_timeout */
-
-static int get_timeout(struct cyclades_port *info,
-						unsigned long __user *value)
-{
-	struct cyclades_card *card = info->card;
-
-	if (!cy_is_Z(card)) {
-		u8 tmp = cyy_readb(info, CyRTPR);
-		return put_user(tmp, value);
-	}
-	return 0;
-}				/* get_timeout */
-
-static int cy_cflags_changed(struct cyclades_port *info, unsigned long arg,
-		struct cyclades_icount *cprev)
-{
-	struct cyclades_icount cnow;
-	unsigned long flags;
-	int ret;
-
-	spin_lock_irqsave(&info->card->card_lock, flags);
-	cnow = info->icount;	/* atomic copy */
-	spin_unlock_irqrestore(&info->card->card_lock, flags);
-
-	ret =	((arg & TIOCM_RNG) && (cnow.rng != cprev->rng)) ||
-		((arg & TIOCM_DSR) && (cnow.dsr != cprev->dsr)) ||
-		((arg & TIOCM_CD)  && (cnow.dcd != cprev->dcd)) ||
-		((arg & TIOCM_CTS) && (cnow.cts != cprev->cts));
-
-	*cprev = cnow;
-
-	return ret;
-}
-
-/*
- * This routine allows the tty driver to implement device-
- * specific ioctl's.  If the ioctl number passed in cmd is
- * not recognized by the driver, it should return ENOIOCTLCMD.
- */
-static int
-cy_ioctl(struct tty_struct *tty,
-	 unsigned int cmd, unsigned long arg)
-{
-	struct cyclades_port *info = tty->driver_data;
-	struct cyclades_icount cnow;	/* kernel counter temps */
-	int ret_val = 0;
-	unsigned long flags;
-	void __user *argp = (void __user *)arg;
-
-	if (serial_paranoia_check(info, tty->name, "cy_ioctl"))
-		return -ENODEV;
-
-#ifdef CY_DEBUG_OTHER
-	printk(KERN_DEBUG "cyc:cy_ioctl ttyC%d, cmd = %x arg = %lx\n",
-		info->line, cmd, arg);
-#endif
-
-	switch (cmd) {
-	case CYGETMON:
-		if (copy_to_user(argp, &info->mon, sizeof(info->mon))) {
-			ret_val = -EFAULT;
-			break;
-		}
-		memset(&info->mon, 0, sizeof(info->mon));
-		break;
-	case CYGETTHRESH:
-		ret_val = get_threshold(info, argp);
-		break;
-	case CYSETTHRESH:
-		ret_val = set_threshold(info, arg);
-		break;
-	case CYGETDEFTHRESH:
-		ret_val = put_user(info->default_threshold,
-				(unsigned long __user *)argp);
-		break;
-	case CYSETDEFTHRESH:
-		info->default_threshold = arg & 0x0f;
-		break;
-	case CYGETTIMEOUT:
-		ret_val = get_timeout(info, argp);
-		break;
-	case CYSETTIMEOUT:
-		ret_val = set_timeout(info, arg);
-		break;
-	case CYGETDEFTIMEOUT:
-		ret_val = put_user(info->default_timeout,
-				(unsigned long __user *)argp);
-		break;
-	case CYSETDEFTIMEOUT:
-		info->default_timeout = arg & 0xff;
-		break;
-	case CYSETRFLOW:
-		info->rflow = (int)arg;
-		break;
-	case CYGETRFLOW:
-		ret_val = info->rflow;
-		break;
-	case CYSETRTSDTR_INV:
-		info->rtsdtr_inv = (int)arg;
-		break;
-	case CYGETRTSDTR_INV:
-		ret_val = info->rtsdtr_inv;
-		break;
-	case CYGETCD1400VER:
-		ret_val = info->chip_rev;
-		break;
-#ifndef CONFIG_CYZ_INTR
-	case CYZSETPOLLCYCLE:
-		if (arg > LONG_MAX / HZ)
-			return -ENODEV;
-		cyz_polling_cycle = (arg * HZ) / 1000;
-		break;
-	case CYZGETPOLLCYCLE:
-		ret_val = (cyz_polling_cycle * 1000) / HZ;
-		break;
-#endif				/* CONFIG_CYZ_INTR */
-	case CYSETWAIT:
-		info->port.closing_wait = (unsigned short)arg * HZ / 100;
-		break;
-	case CYGETWAIT:
-		ret_val = info->port.closing_wait / (HZ / 100);
-		break;
-	case TIOCSERGETLSR:	/* Get line status register */
-		ret_val = get_lsr_info(info, argp);
-		break;
-		/*
-		 * Wait for any of the 4 modem inputs (DCD,RI,DSR,CTS) to change
-		 * - mask passed in arg for lines of interest
-		 *   (use |'ed TIOCM_RNG/DSR/CD/CTS for masking)
-		 * Caller should use TIOCGICOUNT to see which one it was
-		 */
-	case TIOCMIWAIT:
-		spin_lock_irqsave(&info->card->card_lock, flags);
-		/* note the counters on entry */
-		cnow = info->icount;
-		spin_unlock_irqrestore(&info->card->card_lock, flags);
-		ret_val = wait_event_interruptible(info->port.delta_msr_wait,
-				cy_cflags_changed(info, arg, &cnow));
-		break;
-
-		/*
-		 * Get counter of input serial line interrupts (DCD,RI,DSR,CTS)
-		 * Return: write counters to the user passed counter struct
-		 * NB: both 1->0 and 0->1 transitions are counted except for
-		 *     RI where only 0->1 is counted.
-		 */
-	default:
-		ret_val = -ENOIOCTLCMD;
-	}
-
-#ifdef CY_DEBUG_OTHER
-	printk(KERN_DEBUG "cyc:cy_ioctl done\n");
-#endif
-	return ret_val;
-}				/* cy_ioctl */
-
-static int cy_get_icount(struct tty_struct *tty,
-				struct serial_icounter_struct *sic)
-{
-	struct cyclades_port *info = tty->driver_data;
-	struct cyclades_icount cnow;	/* Used to snapshot */
-	unsigned long flags;
-
-	spin_lock_irqsave(&info->card->card_lock, flags);
-	cnow = info->icount;
-	spin_unlock_irqrestore(&info->card->card_lock, flags);
-
-	sic->cts = cnow.cts;
-	sic->dsr = cnow.dsr;
-	sic->rng = cnow.rng;
-	sic->dcd = cnow.dcd;
-	sic->rx = cnow.rx;
-	sic->tx = cnow.tx;
-	sic->frame = cnow.frame;
-	sic->overrun = cnow.overrun;
-	sic->parity = cnow.parity;
-	sic->brk = cnow.brk;
-	sic->buf_overrun = cnow.buf_overrun;
-	return 0;
-}
-
-/*
- * This routine allows the tty driver to be notified when
- * device's termios settings have changed.  Note that a
- * well-designed tty driver should be prepared to accept the case
- * where old == NULL, and try to do something rational.
- */
-static void cy_set_termios(struct tty_struct *tty, struct ktermios *old_termios)
-{
-	struct cyclades_port *info = tty->driver_data;
-
-#ifdef CY_DEBUG_OTHER
-	printk(KERN_DEBUG "cyc:cy_set_termios ttyC%d\n", info->line);
-#endif
-
-	cy_set_line_char(info, tty);
-
-	if ((old_termios->c_cflag & CRTSCTS) && !C_CRTSCTS(tty)) {
-		tty->hw_stopped = 0;
-		cy_start(tty);
-	}
-#if 0
-	/*
-	 * No need to wake up processes in open wait, since they
-	 * sample the CLOCAL flag once, and don't recheck it.
-	 * XXX  It's not clear whether the current behavior is correct
-	 * or not.  Hence, this may change.....
-	 */
-	if (!(old_termios->c_cflag & CLOCAL) && C_CLOCAL(tty))
-		wake_up_interruptible(&info->port.open_wait);
-#endif
-}				/* cy_set_termios */
-
-/* This function is used to send a high-priority XON/XOFF character to
-   the device.
-*/
-static void cy_send_xchar(struct tty_struct *tty, char ch)
-{
-	struct cyclades_port *info = tty->driver_data;
-	struct cyclades_card *card;
-	int channel;
-
-	if (serial_paranoia_check(info, tty->name, "cy_send_xchar"))
-		return;
-
-	info->x_char = ch;
-
-	if (ch)
-		cy_start(tty);
-
-	card = info->card;
-	channel = info->line - card->first_line;
-
-	if (cy_is_Z(card)) {
-		if (ch == STOP_CHAR(tty))
-			cyz_issue_cmd(card, channel, C_CM_SENDXOFF, 0L);
-		else if (ch == START_CHAR(tty))
-			cyz_issue_cmd(card, channel, C_CM_SENDXON, 0L);
-	}
-}
-
-/* This routine is called by the upper-layer tty layer to signal
-   that incoming characters should be throttled because the input
-   buffers are close to full.
- */
-static void cy_throttle(struct tty_struct *tty)
-{
-	struct cyclades_port *info = tty->driver_data;
-	struct cyclades_card *card;
-	unsigned long flags;
-
-#ifdef CY_DEBUG_THROTTLE
-	printk(KERN_DEBUG "cyc:throttle %s ...ttyC%d\n", tty_name(tty),
-			 info->line);
-#endif
-
-	if (serial_paranoia_check(info, tty->name, "cy_throttle"))
-		return;
-
-	card = info->card;
-
-	if (I_IXOFF(tty)) {
-		if (!cy_is_Z(card))
-			cy_send_xchar(tty, STOP_CHAR(tty));
-		else
-			info->throttle = 1;
-	}
-
-	if (C_CRTSCTS(tty)) {
-		if (!cy_is_Z(card)) {
-			spin_lock_irqsave(&card->card_lock, flags);
-			cyy_change_rts_dtr(info, 0, TIOCM_RTS);
-			spin_unlock_irqrestore(&card->card_lock, flags);
-		} else {
-			info->throttle = 1;
-		}
-	}
-}				/* cy_throttle */
-
-/*
- * This routine notifies the tty driver that it should signal
- * that characters can now be sent to the tty without fear of
- * overrunning the input buffers of the line disciplines.
- */
-static void cy_unthrottle(struct tty_struct *tty)
-{
-	struct cyclades_port *info = tty->driver_data;
-	struct cyclades_card *card;
-	unsigned long flags;
-
-#ifdef CY_DEBUG_THROTTLE
-	printk(KERN_DEBUG "cyc:unthrottle %s ...ttyC%d\n",
-		tty_name(tty), info->line);
-#endif
-
-	if (serial_paranoia_check(info, tty->name, "cy_unthrottle"))
-		return;
-
-	if (I_IXOFF(tty)) {
-		if (info->x_char)
-			info->x_char = 0;
-		else
-			cy_send_xchar(tty, START_CHAR(tty));
-	}
-
-	if (C_CRTSCTS(tty)) {
-		card = info->card;
-		if (!cy_is_Z(card)) {
-			spin_lock_irqsave(&card->card_lock, flags);
-			cyy_change_rts_dtr(info, TIOCM_RTS, 0);
-			spin_unlock_irqrestore(&card->card_lock, flags);
-		} else {
-			info->throttle = 0;
-		}
-	}
-}				/* cy_unthrottle */
-
-/* cy_start and cy_stop provide software output flow control as a
-   function of XON/XOFF, software CTS, and other such stuff.
-*/
-static void cy_stop(struct tty_struct *tty)
-{
-	struct cyclades_card *cinfo;
-	struct cyclades_port *info = tty->driver_data;
-	int channel;
-	unsigned long flags;
-
-#ifdef CY_DEBUG_OTHER
-	printk(KERN_DEBUG "cyc:cy_stop ttyC%d\n", info->line);
-#endif
-
-	if (serial_paranoia_check(info, tty->name, "cy_stop"))
-		return;
-
-	cinfo = info->card;
-	channel = info->line - cinfo->first_line;
-	if (!cy_is_Z(cinfo)) {
-		spin_lock_irqsave(&cinfo->card_lock, flags);
-		cyy_writeb(info, CyCAR, channel & 0x03);
-		cyy_writeb(info, CySRER, cyy_readb(info, CySRER) & ~CyTxRdy);
-		spin_unlock_irqrestore(&cinfo->card_lock, flags);
-	}
-}				/* cy_stop */
-
-static void cy_start(struct tty_struct *tty)
-{
-	struct cyclades_card *cinfo;
-	struct cyclades_port *info = tty->driver_data;
-	int channel;
-	unsigned long flags;
-
-#ifdef CY_DEBUG_OTHER
-	printk(KERN_DEBUG "cyc:cy_start ttyC%d\n", info->line);
-#endif
-
-	if (serial_paranoia_check(info, tty->name, "cy_start"))
-		return;
-
-	cinfo = info->card;
-	channel = info->line - cinfo->first_line;
-	if (!cy_is_Z(cinfo)) {
-		spin_lock_irqsave(&cinfo->card_lock, flags);
-		cyy_writeb(info, CyCAR, channel & 0x03);
-		cyy_writeb(info, CySRER, cyy_readb(info, CySRER) | CyTxRdy);
-		spin_unlock_irqrestore(&cinfo->card_lock, flags);
-	}
-}				/* cy_start */
-
-/*
- * cy_hangup() --- called by tty_hangup() when a hangup is signaled.
- */
-static void cy_hangup(struct tty_struct *tty)
-{
-	struct cyclades_port *info = tty->driver_data;
-
-#ifdef CY_DEBUG_OTHER
-	printk(KERN_DEBUG "cyc:cy_hangup ttyC%d\n", info->line);
-#endif
-
-	if (serial_paranoia_check(info, tty->name, "cy_hangup"))
-		return;
-
-	cy_flush_buffer(tty);
-	cy_shutdown(info, tty);
-	tty_port_hangup(&info->port);
-}				/* cy_hangup */
-
-static int cyy_carrier_raised(struct tty_port *port)
-{
-	struct cyclades_port *info = container_of(port, struct cyclades_port,
-			port);
-	struct cyclades_card *cinfo = info->card;
-	unsigned long flags;
-	int channel = info->line - cinfo->first_line;
-	u32 cd;
-
-	spin_lock_irqsave(&cinfo->card_lock, flags);
-	cyy_writeb(info, CyCAR, channel & 0x03);
-	cd = cyy_readb(info, CyMSVR1) & CyDCD;
-	spin_unlock_irqrestore(&cinfo->card_lock, flags);
-
-	return cd;
-}
-
-static void cyy_dtr_rts(struct tty_port *port, int raise)
-{
-	struct cyclades_port *info = container_of(port, struct cyclades_port,
-			port);
-	struct cyclades_card *cinfo = info->card;
-	unsigned long flags;
-
-	spin_lock_irqsave(&cinfo->card_lock, flags);
-	cyy_change_rts_dtr(info, raise ? TIOCM_RTS | TIOCM_DTR : 0,
-			raise ? 0 : TIOCM_RTS | TIOCM_DTR);
-	spin_unlock_irqrestore(&cinfo->card_lock, flags);
-}
-
-static int cyz_carrier_raised(struct tty_port *port)
-{
-	struct cyclades_port *info = container_of(port, struct cyclades_port,
-			port);
-
-	return readl(&info->u.cyz.ch_ctrl->rs_status) & C_RS_DCD;
-}
-
-static void cyz_dtr_rts(struct tty_port *port, int raise)
-{
-	struct cyclades_port *info = container_of(port, struct cyclades_port,
-			port);
-	struct cyclades_card *cinfo = info->card;
-	struct CH_CTRL __iomem *ch_ctrl = info->u.cyz.ch_ctrl;
-	int ret, channel = info->line - cinfo->first_line;
-	u32 rs;
-
-	rs = readl(&ch_ctrl->rs_control);
-	if (raise)
-		rs |= C_RS_RTS | C_RS_DTR;
-	else
-		rs &= ~(C_RS_RTS | C_RS_DTR);
-	cy_writel(&ch_ctrl->rs_control, rs);
-	ret = cyz_issue_cmd(cinfo, channel, C_CM_IOCTLM, 0L);
-	if (ret != 0)
-		printk(KERN_ERR "%s: retval on ttyC%d was %x\n",
-				__func__, info->line, ret);
-#ifdef CY_DEBUG_DTR
-	printk(KERN_DEBUG "%s: raising Z DTR\n", __func__);
-#endif
-}
-
-static const struct tty_port_operations cyy_port_ops = {
-	.carrier_raised = cyy_carrier_raised,
-	.dtr_rts = cyy_dtr_rts,
-	.shutdown = cy_do_close,
-};
-
-static const struct tty_port_operations cyz_port_ops = {
-	.carrier_raised = cyz_carrier_raised,
-	.dtr_rts = cyz_dtr_rts,
-	.shutdown = cy_do_close,
-};
-
-/*
- * ---------------------------------------------------------------------
- * cy_init() and friends
- *
- * cy_init() is called at boot-time to initialize the serial driver.
- * ---------------------------------------------------------------------
- */
-
-static int cy_init_card(struct cyclades_card *cinfo)
-{
-	struct cyclades_port *info;
-	unsigned int channel, port;
-
-	spin_lock_init(&cinfo->card_lock);
-	cinfo->intr_enabled = 0;
-
-	cinfo->ports = kcalloc(cinfo->nports, sizeof(*cinfo->ports),
-			GFP_KERNEL);
-	if (cinfo->ports == NULL) {
-		printk(KERN_ERR "Cyclades: cannot allocate ports\n");
-		return -ENOMEM;
-	}
-
-	for (channel = 0, port = cinfo->first_line; channel < cinfo->nports;
-			channel++, port++) {
-		info = &cinfo->ports[channel];
-		tty_port_init(&info->port);
-		info->magic = CYCLADES_MAGIC;
-		info->card = cinfo;
-		info->line = port;
-
-		info->port.closing_wait = CLOSING_WAIT_DELAY;
-		info->port.close_delay = 5 * HZ / 10;
-		init_completion(&info->shutdown_wait);
-
-		if (cy_is_Z(cinfo)) {
-			struct FIRM_ID *firm_id = cinfo->base_addr + ID_ADDRESS;
-			struct ZFW_CTRL *zfw_ctrl;
-
-			info->port.ops = &cyz_port_ops;
-			info->type = PORT_STARTECH;
-
-			zfw_ctrl = cinfo->base_addr +
-				(readl(&firm_id->zfwctrl_addr) & 0xfffff);
-			info->u.cyz.ch_ctrl = &zfw_ctrl->ch_ctrl[channel];
-			info->u.cyz.buf_ctrl = &zfw_ctrl->buf_ctrl[channel];
-
-			if (cinfo->hw_ver == ZO_V1)
-				info->xmit_fifo_size = CYZ_FIFO_SIZE;
-			else
-				info->xmit_fifo_size = 4 * CYZ_FIFO_SIZE;
-#ifdef CONFIG_CYZ_INTR
-			timer_setup(&info->rx_full_timer, cyz_rx_restart, 0);
-#endif
-		} else {
-			unsigned short chip_number;
-			int index = cinfo->bus_index;
-
-			info->port.ops = &cyy_port_ops;
-			info->type = PORT_CIRRUS;
-			info->xmit_fifo_size = CyMAX_CHAR_FIFO;
-			info->cor1 = CyPARITY_NONE | Cy_1_STOP | Cy_8_BITS;
-			info->cor2 = CyETC;
-			info->cor3 = 0x08;	/* _very_ small rcv threshold */
-
-			chip_number = channel / CyPORTS_PER_CHIP;
-			info->u.cyy.base_addr = cinfo->base_addr +
-				(cy_chip_offset[chip_number] << index);
-			info->chip_rev = cyy_readb(info, CyGFRCR);
-
-			if (info->chip_rev >= CD1400_REV_J) {
-				/* It is a CD1400 rev. J or later */
-				info->tbpr = baud_bpr_60[13];	/* Tx BPR */
-				info->tco = baud_co_60[13];	/* Tx CO */
-				info->rbpr = baud_bpr_60[13];	/* Rx BPR */
-				info->rco = baud_co_60[13];	/* Rx CO */
-				info->rtsdtr_inv = 1;
-			} else {
-				info->tbpr = baud_bpr_25[13];	/* Tx BPR */
-				info->tco = baud_co_25[13];	/* Tx CO */
-				info->rbpr = baud_bpr_25[13];	/* Rx BPR */
-				info->rco = baud_co_25[13];	/* Rx CO */
-				info->rtsdtr_inv = 0;
-			}
-			info->read_status_mask = CyTIMEOUT | CySPECHAR |
-				CyBREAK | CyPARITY | CyFRAME | CyOVERRUN;
-		}
-
-	}
-
-#ifndef CONFIG_CYZ_INTR
-	if (cy_is_Z(cinfo) && !timer_pending(&cyz_timerlist)) {
-		mod_timer(&cyz_timerlist, jiffies + 1);
-#ifdef CY_PCI_DEBUG
-		printk(KERN_DEBUG "Cyclades-Z polling initialized\n");
-#endif
-	}
-#endif
-	return 0;
-}
-
-/* initialize chips on Cyclom-Y card -- return number of valid
-   chips (which is number of ports/4) */
-static unsigned short cyy_init_card(void __iomem *true_base_addr,
-		int index)
-{
-	unsigned int chip_number;
-	void __iomem *base_addr;
-
-	cy_writeb(true_base_addr + (Cy_HwReset << index), 0);
-	/* Cy_HwReset is 0x1400 */
-	cy_writeb(true_base_addr + (Cy_ClrIntr << index), 0);
-	/* Cy_ClrIntr is 0x1800 */
-	udelay(500L);
-
-	for (chip_number = 0; chip_number < CyMAX_CHIPS_PER_CARD;
-							chip_number++) {
-		base_addr =
-		    true_base_addr + (cy_chip_offset[chip_number] << index);
-		mdelay(1);
-		if (readb(base_addr + (CyCCR << index)) != 0x00) {
-			/*************
-			printk(" chip #%d at %#6lx is never idle (CCR != 0)\n",
-			chip_number, (unsigned long)base_addr);
-			*************/
-			return chip_number;
-		}
-
-		cy_writeb(base_addr + (CyGFRCR << index), 0);
-		udelay(10L);
-
-		/* The Cyclom-16Y does not decode address bit 9 and therefore
-		   cannot distinguish between references to chip 0 and a non-
-		   existent chip 4.  If the preceding clearing of the supposed
-		   chip 4 GFRCR register appears at chip 0, there is no chip 4
-		   and this must be a Cyclom-16Y, not a Cyclom-32Ye.
-		 */
-		if (chip_number == 4 && readb(true_base_addr +
-				(cy_chip_offset[0] << index) +
-				(CyGFRCR << index)) == 0) {
-			return chip_number;
-		}
-
-		cy_writeb(base_addr + (CyCCR << index), CyCHIP_RESET);
-		mdelay(1);
-
-		if (readb(base_addr + (CyGFRCR << index)) == 0x00) {
-			/*
-			   printk(" chip #%d at %#6lx is not responding ",
-			   chip_number, (unsigned long)base_addr);
-			   printk("(GFRCR stayed 0)\n",
-			 */
-			return chip_number;
-		}
-		if ((0xf0 & (readb(base_addr + (CyGFRCR << index)))) !=
-				0x40) {
-			/*
-			printk(" chip #%d at %#6lx is not valid (GFRCR == "
-					"%#2x)\n",
-					chip_number, (unsigned long)base_addr,
-					base_addr[CyGFRCR<<index]);
-			 */
-			return chip_number;
-		}
-		cy_writeb(base_addr + (CyGCR << index), CyCH0_SERIAL);
-		if (readb(base_addr + (CyGFRCR << index)) >= CD1400_REV_J) {
-			/* It is a CD1400 rev. J or later */
-			/* Impossible to reach 5ms with this chip.
-			   Changed to 2ms instead (f = 500 Hz). */
-			cy_writeb(base_addr + (CyPPR << index), CyCLOCK_60_2MS);
-		} else {
-			/* f = 200 Hz */
-			cy_writeb(base_addr + (CyPPR << index), CyCLOCK_25_5MS);
-		}
-
-		/*
-		   printk(" chip #%d at %#6lx is rev 0x%2x\n",
-		   chip_number, (unsigned long)base_addr,
-		   readb(base_addr+(CyGFRCR<<index)));
-		 */
-	}
-	return chip_number;
-}				/* cyy_init_card */
-
-/*
- * ---------------------------------------------------------------------
- * cy_detect_isa() - Probe for Cyclom-Y/ISA boards.
- * sets global variables and return the number of ISA boards found.
- * ---------------------------------------------------------------------
- */
-static int __init cy_detect_isa(void)
-{
-#ifdef CONFIG_ISA
-	struct cyclades_card *card;
-	unsigned short cy_isa_irq, nboard;
-	void __iomem *cy_isa_address;
-	unsigned short i, j, k, cy_isa_nchan;
-	int isparam = 0;
-
-	nboard = 0;
-
-	/* Check for module parameters */
-	for (i = 0; i < NR_CARDS; i++) {
-		if (maddr[i] || i) {
-			isparam = 1;
-			cy_isa_addresses[i] = maddr[i];
-		}
-		if (!maddr[i])
-			break;
-	}
-
-	/* scan the address table probing for Cyclom-Y/ISA boards */
-	for (i = 0; i < NR_ISA_ADDRS; i++) {
-		unsigned int isa_address = cy_isa_addresses[i];
-		if (isa_address == 0x0000)
-			return nboard;
-
-		/* probe for CD1400... */
-		cy_isa_address = ioremap(isa_address, CyISA_Ywin);
-		if (cy_isa_address == NULL) {
-			printk(KERN_ERR "Cyclom-Y/ISA: can't remap base "
-					"address\n");
-			continue;
-		}
-		cy_isa_nchan = CyPORTS_PER_CHIP *
-			cyy_init_card(cy_isa_address, 0);
-		if (cy_isa_nchan == 0) {
-			iounmap(cy_isa_address);
-			continue;
-		}
-
-		if (isparam && i < NR_CARDS && irq[i])
-			cy_isa_irq = irq[i];
-		else
-			/* find out the board's irq by probing */
-			cy_isa_irq = detect_isa_irq(cy_isa_address);
-		if (cy_isa_irq == 0) {
-			printk(KERN_ERR "Cyclom-Y/ISA found at 0x%lx, but the "
-				"IRQ could not be detected.\n",
-				(unsigned long)cy_isa_address);
-			iounmap(cy_isa_address);
-			continue;
-		}
-
-		if ((cy_next_channel + cy_isa_nchan) > NR_PORTS) {
-			printk(KERN_ERR "Cyclom-Y/ISA found at 0x%lx, but no "
-				"more channels are available. Change NR_PORTS "
-				"in cyclades.c and recompile kernel.\n",
-				(unsigned long)cy_isa_address);
-			iounmap(cy_isa_address);
-			return nboard;
-		}
-		/* fill the next cy_card structure available */
-		for (j = 0; j < NR_CARDS; j++) {
-			card = &cy_card[j];
-			if (card->base_addr == NULL)
-				break;
-		}
-		if (j == NR_CARDS) {	/* no more cy_cards available */
-			printk(KERN_ERR "Cyclom-Y/ISA found at 0x%lx, but no "
-				"more cards can be used. Change NR_CARDS in "
-				"cyclades.c and recompile kernel.\n",
-				(unsigned long)cy_isa_address);
-			iounmap(cy_isa_address);
-			return nboard;
-		}
-
-		/* allocate IRQ */
-		if (request_irq(cy_isa_irq, cyy_interrupt,
-				0, "Cyclom-Y", card)) {
-			printk(KERN_ERR "Cyclom-Y/ISA found at 0x%lx, but "
-				"could not allocate IRQ#%d.\n",
-				(unsigned long)cy_isa_address, cy_isa_irq);
-			iounmap(cy_isa_address);
-			return nboard;
-		}
-
-		/* set cy_card */
-		card->base_addr = cy_isa_address;
-		card->ctl_addr.p9050 = NULL;
-		card->irq = (int)cy_isa_irq;
-		card->bus_index = 0;
-		card->first_line = cy_next_channel;
-		card->num_chips = cy_isa_nchan / CyPORTS_PER_CHIP;
-		card->nports = cy_isa_nchan;
-		if (cy_init_card(card)) {
-			card->base_addr = NULL;
-			free_irq(cy_isa_irq, card);
-			iounmap(cy_isa_address);
-			continue;
-		}
-		nboard++;
-
-		printk(KERN_INFO "Cyclom-Y/ISA #%d: 0x%lx-0x%lx, IRQ%d found: "
-			"%d channels starting from port %d\n",
-			j + 1, (unsigned long)cy_isa_address,
-			(unsigned long)(cy_isa_address + (CyISA_Ywin - 1)),
-			cy_isa_irq, cy_isa_nchan, cy_next_channel);
-
-		for (k = 0, j = cy_next_channel;
-				j < cy_next_channel + cy_isa_nchan; j++, k++)
-			tty_port_register_device(&card->ports[k].port,
-					cy_serial_driver, j, NULL);
-		cy_next_channel += cy_isa_nchan;
-	}
-	return nboard;
-#else
-	return 0;
-#endif				/* CONFIG_ISA */
-}				/* cy_detect_isa */
-
-#ifdef CONFIG_PCI
-static inline int cyc_isfwstr(const char *str, unsigned int size)
-{
-	unsigned int a;
-
-	for (a = 0; a < size && *str; a++, str++)
-		if (*str & 0x80)
-			return -EINVAL;
-
-	for (; a < size; a++, str++)
-		if (*str)
-			return -EINVAL;
-
-	return 0;
-}
-
-static inline void cyz_fpga_copy(void __iomem *fpga, const u8 *data,
-		unsigned int size)
-{
-	for (; size > 0; size--) {
-		cy_writel(fpga, *data++);
-		udelay(10);
-	}
-}
-
-static void plx_init(struct pci_dev *pdev, int irq,
-		struct RUNTIME_9060 __iomem *addr)
-{
-	/* Reset PLX */
-	cy_writel(&addr->init_ctrl, readl(&addr->init_ctrl) | 0x40000000);
-	udelay(100L);
-	cy_writel(&addr->init_ctrl, readl(&addr->init_ctrl) & ~0x40000000);
-
-	/* Reload Config. Registers from EEPROM */
-	cy_writel(&addr->init_ctrl, readl(&addr->init_ctrl) | 0x20000000);
-	udelay(100L);
-	cy_writel(&addr->init_ctrl, readl(&addr->init_ctrl) & ~0x20000000);
-
-	/* For some yet unknown reason, once the PLX9060 reloads the EEPROM,
-	 * the IRQ is lost and, thus, we have to re-write it to the PCI config.
-	 * registers. This will remain here until we find a permanent fix.
-	 */
-	pci_write_config_byte(pdev, PCI_INTERRUPT_LINE, irq);
-}
-
-static int __cyz_load_fw(const struct firmware *fw,
-		const char *name, const u32 mailbox, void __iomem *base,
-		void __iomem *fpga)
-{
-	const void *ptr = fw->data;
-	const struct zfile_header *h = ptr;
-	const struct zfile_config *c, *cs;
-	const struct zfile_block *b, *bs;
-	unsigned int a, tmp, len = fw->size;
-#define BAD_FW KERN_ERR "Bad firmware: "
-	if (len < sizeof(*h)) {
-		printk(BAD_FW "too short: %u<%zu\n", len, sizeof(*h));
-		return -EINVAL;
-	}
-
-	cs = ptr + h->config_offset;
-	bs = ptr + h->block_offset;
-
-	if ((void *)(cs + h->n_config) > ptr + len ||
-			(void *)(bs + h->n_blocks) > ptr + len) {
-		printk(BAD_FW "too short");
-		return  -EINVAL;
-	}
-
-	if (cyc_isfwstr(h->name, sizeof(h->name)) ||
-			cyc_isfwstr(h->date, sizeof(h->date))) {
-		printk(BAD_FW "bad formatted header string\n");
-		return -EINVAL;
-	}
-
-	if (strncmp(name, h->name, sizeof(h->name))) {
-		printk(BAD_FW "bad name '%s' (expected '%s')\n", h->name, name);
-		return -EINVAL;
-	}
-
-	tmp = 0;
-	for (c = cs; c < cs + h->n_config; c++) {
-		for (a = 0; a < c->n_blocks; a++)
-			if (c->block_list[a] > h->n_blocks) {
-				printk(BAD_FW "bad block ref number in cfgs\n");
-				return -EINVAL;
-			}
-		if (c->mailbox == mailbox && c->function == 0) /* 0 is normal */
-			tmp++;
-	}
-	if (!tmp) {
-		printk(BAD_FW "nothing appropriate\n");
-		return -EINVAL;
-	}
-
-	for (b = bs; b < bs + h->n_blocks; b++)
-		if (b->file_offset + b->size > len) {
-			printk(BAD_FW "bad block data offset\n");
-			return -EINVAL;
-		}
-
-	/* everything is OK, let's seek'n'load it */
-	for (c = cs; c < cs + h->n_config; c++)
-		if (c->mailbox == mailbox && c->function == 0)
-			break;
-
-	for (a = 0; a < c->n_blocks; a++) {
-		b = &bs[c->block_list[a]];
-		if (b->type == ZBLOCK_FPGA) {
-			if (fpga != NULL)
-				cyz_fpga_copy(fpga, ptr + b->file_offset,
-						b->size);
-		} else {
-			if (base != NULL)
-				memcpy_toio(base + b->ram_offset,
-					       ptr + b->file_offset, b->size);
-		}
-	}
-#undef BAD_FW
-	return 0;
-}
-
-static int cyz_load_fw(struct pci_dev *pdev, void __iomem *base_addr,
-		struct RUNTIME_9060 __iomem *ctl_addr, int irq)
-{
-	const struct firmware *fw;
-	struct FIRM_ID __iomem *fid = base_addr + ID_ADDRESS;
-	struct CUSTOM_REG __iomem *cust = base_addr;
-	struct ZFW_CTRL __iomem *pt_zfwctrl;
-	void __iomem *tmp;
-	u32 mailbox, status, nchan;
-	unsigned int i;
-	int retval;
-
-	retval = request_firmware(&fw, "cyzfirm.bin", &pdev->dev);
-	if (retval) {
-		dev_err(&pdev->dev, "can't get firmware\n");
-		goto err;
-	}
-
-	/* Check whether the firmware is already loaded and running. If
-	   positive, skip this board */
-	if (__cyz_fpga_loaded(ctl_addr) && readl(&fid->signature) == ZFIRM_ID) {
-		u32 cntval = readl(base_addr + 0x190);
-
-		udelay(100);
-		if (cntval != readl(base_addr + 0x190)) {
-			/* FW counter is working, FW is running */
-			dev_dbg(&pdev->dev, "Cyclades-Z FW already loaded. "
-					"Skipping board.\n");
-			retval = 0;
-			goto err_rel;
-		}
-	}
-
-	/* start boot */
-	cy_writel(&ctl_addr->intr_ctrl_stat, readl(&ctl_addr->intr_ctrl_stat) &
-			~0x00030800UL);
-
-	mailbox = readl(&ctl_addr->mail_box_0);
-
-	if (mailbox == 0 || __cyz_fpga_loaded(ctl_addr)) {
-		/* stops CPU and set window to beginning of RAM */
-		cy_writel(&ctl_addr->loc_addr_base, WIN_CREG);
-		cy_writel(&cust->cpu_stop, 0);
-		cy_writel(&ctl_addr->loc_addr_base, WIN_RAM);
-		udelay(100);
-	}
-
-	plx_init(pdev, irq, ctl_addr);
-
-	if (mailbox != 0) {
-		/* load FPGA */
-		retval = __cyz_load_fw(fw, "Cyclom-Z", mailbox, NULL,
-				base_addr);
-		if (retval)
-			goto err_rel;
-		if (!__cyz_fpga_loaded(ctl_addr)) {
-			dev_err(&pdev->dev, "fw upload successful, but fw is "
-					"not loaded\n");
-			goto err_rel;
-		}
-	}
-
-	/* stops CPU and set window to beginning of RAM */
-	cy_writel(&ctl_addr->loc_addr_base, WIN_CREG);
-	cy_writel(&cust->cpu_stop, 0);
-	cy_writel(&ctl_addr->loc_addr_base, WIN_RAM);
-	udelay(100);
-
-	/* clear memory */
-	for (tmp = base_addr; tmp < base_addr + RAM_SIZE; tmp++)
-		cy_writeb(tmp, 255);
-	if (mailbox != 0) {
-		/* set window to last 512K of RAM */
-		cy_writel(&ctl_addr->loc_addr_base, WIN_RAM + RAM_SIZE);
-		for (tmp = base_addr; tmp < base_addr + RAM_SIZE; tmp++)
-			cy_writeb(tmp, 255);
-		/* set window to beginning of RAM */
-		cy_writel(&ctl_addr->loc_addr_base, WIN_RAM);
-	}
-
-	retval = __cyz_load_fw(fw, "Cyclom-Z", mailbox, base_addr, NULL);
-	release_firmware(fw);
-	if (retval)
-		goto err;
-
-	/* finish boot and start boards */
-	cy_writel(&ctl_addr->loc_addr_base, WIN_CREG);
-	cy_writel(&cust->cpu_start, 0);
-	cy_writel(&ctl_addr->loc_addr_base, WIN_RAM);
-	i = 0;
-	while ((status = readl(&fid->signature)) != ZFIRM_ID && i++ < 40)
-		msleep(100);
-	if (status != ZFIRM_ID) {
-		if (status == ZFIRM_HLT) {
-			dev_err(&pdev->dev, "you need an external power supply "
-				"for this number of ports. Firmware halted and "
-				"board reset.\n");
-			retval = -EIO;
-			goto err;
-		}
-		dev_warn(&pdev->dev, "fid->signature = 0x%x... Waiting "
-				"some more time\n", status);
-		while ((status = readl(&fid->signature)) != ZFIRM_ID &&
-				i++ < 200)
-			msleep(100);
-		if (status != ZFIRM_ID) {
-			dev_err(&pdev->dev, "Board not started in 20 seconds! "
-					"Giving up. (fid->signature = 0x%x)\n",
-					status);
-			dev_info(&pdev->dev, "*** Warning ***: if you are "
-				"upgrading the FW, please power cycle the "
-				"system before loading the new FW to the "
-				"Cyclades-Z.\n");
-
-			if (__cyz_fpga_loaded(ctl_addr))
-				plx_init(pdev, irq, ctl_addr);
-
-			retval = -EIO;
-			goto err;
-		}
-		dev_dbg(&pdev->dev, "Firmware started after %d seconds.\n",
-				i / 10);
-	}
-	pt_zfwctrl = base_addr + readl(&fid->zfwctrl_addr);
-
-	dev_dbg(&pdev->dev, "fid=> %p, zfwctrl_addr=> %x, npt_zfwctrl=> %p\n",
-			base_addr + ID_ADDRESS, readl(&fid->zfwctrl_addr),
-			base_addr + readl(&fid->zfwctrl_addr));
-
-	nchan = readl(&pt_zfwctrl->board_ctrl.n_channel);
-	dev_info(&pdev->dev, "Cyclades-Z FW loaded: version = %x, ports = %u\n",
-		readl(&pt_zfwctrl->board_ctrl.fw_version), nchan);
-
-	if (nchan == 0) {
-		dev_warn(&pdev->dev, "no Cyclades-Z ports were found. Please "
-			"check the connection between the Z host card and the "
-			"serial expanders.\n");
-
-		if (__cyz_fpga_loaded(ctl_addr))
-			plx_init(pdev, irq, ctl_addr);
-
-		dev_info(&pdev->dev, "Null number of ports detected. Board "
-				"reset.\n");
-		retval = 0;
-		goto err;
-	}
-
-	cy_writel(&pt_zfwctrl->board_ctrl.op_system, C_OS_LINUX);
-	cy_writel(&pt_zfwctrl->board_ctrl.dr_version, DRIVER_VERSION);
-
-	/*
-	   Early firmware failed to start looking for commands.
-	   This enables firmware interrupts for those commands.
-	 */
-	cy_writel(&ctl_addr->intr_ctrl_stat, readl(&ctl_addr->intr_ctrl_stat) |
-			(1 << 17));
-	cy_writel(&ctl_addr->intr_ctrl_stat, readl(&ctl_addr->intr_ctrl_stat) |
-			0x00030800UL);
-
-	return nchan;
-err_rel:
-	release_firmware(fw);
-err:
-	return retval;
-}
-
-static int cy_pci_probe(struct pci_dev *pdev,
-		const struct pci_device_id *ent)
-{
-	struct cyclades_card *card;
-	void __iomem *addr0 = NULL, *addr2 = NULL;
-	char *card_name = NULL;
-	u32 mailbox;
-	unsigned int device_id, nchan = 0, card_no, i, j;
-	unsigned char plx_ver;
-	int retval, irq;
-
-	retval = pci_enable_device(pdev);
-	if (retval) {
-		dev_err(&pdev->dev, "cannot enable device\n");
-		goto err;
-	}
-
-	/* read PCI configuration area */
-	irq = pdev->irq;
-	device_id = pdev->device & ~PCI_DEVICE_ID_MASK;
-
-#if defined(__alpha__)
-	if (device_id == PCI_DEVICE_ID_CYCLOM_Y_Lo) {	/* below 1M? */
-		dev_err(&pdev->dev, "Cyclom-Y/PCI not supported for low "
-			"addresses on Alpha systems.\n");
-		retval = -EIO;
-		goto err_dis;
-	}
-#endif
-	if (device_id == PCI_DEVICE_ID_CYCLOM_Z_Lo) {
-		dev_err(&pdev->dev, "Cyclades-Z/PCI not supported for low "
-			"addresses\n");
-		retval = -EIO;
-		goto err_dis;
-	}
-
-	if (pci_resource_flags(pdev, 2) & IORESOURCE_IO) {
-		dev_warn(&pdev->dev, "PCI I/O bit incorrectly set. Ignoring "
-				"it...\n");
-		pdev->resource[2].flags &= ~IORESOURCE_IO;
-	}
-
-	retval = pci_request_regions(pdev, "cyclades");
-	if (retval) {
-		dev_err(&pdev->dev, "failed to reserve resources\n");
-		goto err_dis;
-	}
-
-	retval = -EIO;
-	if (device_id == PCI_DEVICE_ID_CYCLOM_Y_Lo ||
-			device_id == PCI_DEVICE_ID_CYCLOM_Y_Hi) {
-		card_name = "Cyclom-Y";
-
-		addr0 = ioremap(pci_resource_start(pdev, 0),
-				CyPCI_Yctl);
-		if (addr0 == NULL) {
-			dev_err(&pdev->dev, "can't remap ctl region\n");
-			goto err_reg;
-		}
-		addr2 = ioremap(pci_resource_start(pdev, 2),
-				CyPCI_Ywin);
-		if (addr2 == NULL) {
-			dev_err(&pdev->dev, "can't remap base region\n");
-			goto err_unmap;
-		}
-
-		nchan = CyPORTS_PER_CHIP * cyy_init_card(addr2, 1);
-		if (nchan == 0) {
-			dev_err(&pdev->dev, "Cyclom-Y PCI host card with no "
-					"Serial-Modules\n");
-			goto err_unmap;
-		}
-	} else if (device_id == PCI_DEVICE_ID_CYCLOM_Z_Hi) {
-		struct RUNTIME_9060 __iomem *ctl_addr;
-
-		ctl_addr = addr0 = ioremap(pci_resource_start(pdev, 0),
-				CyPCI_Zctl);
-		if (addr0 == NULL) {
-			dev_err(&pdev->dev, "can't remap ctl region\n");
-			goto err_reg;
-		}
-
-		/* Disable interrupts on the PLX before resetting it */
-		cy_writew(&ctl_addr->intr_ctrl_stat,
-				readw(&ctl_addr->intr_ctrl_stat) & ~0x0900);
-
-		plx_init(pdev, irq, addr0);
-
-		mailbox = readl(&ctl_addr->mail_box_0);
-
-		addr2 = ioremap(pci_resource_start(pdev, 2),
-				mailbox == ZE_V1 ? CyPCI_Ze_win : CyPCI_Zwin);
-		if (addr2 == NULL) {
-			dev_err(&pdev->dev, "can't remap base region\n");
-			goto err_unmap;
-		}
-
-		if (mailbox == ZE_V1) {
-			card_name = "Cyclades-Ze";
-		} else {
-			card_name = "Cyclades-8Zo";
-#ifdef CY_PCI_DEBUG
-			if (mailbox == ZO_V1) {
-				cy_writel(&ctl_addr->loc_addr_base, WIN_CREG);
-				dev_info(&pdev->dev, "Cyclades-8Zo/PCI: FPGA "
-					"id %lx, ver %lx\n", (ulong)(0xff &
-					readl(&((struct CUSTOM_REG *)addr2)->
-						fpga_id)), (ulong)(0xff &
-					readl(&((struct CUSTOM_REG *)addr2)->
-						fpga_version)));
-				cy_writel(&ctl_addr->loc_addr_base, WIN_RAM);
-			} else {
-				dev_info(&pdev->dev, "Cyclades-Z/PCI: New "
-					"Cyclades-Z board.  FPGA not loaded\n");
-			}
-#endif
-			/* The following clears the firmware id word.  This
-			   ensures that the driver will not attempt to talk to
-			   the board until it has been properly initialized.
-			 */
-			if ((mailbox == ZO_V1) || (mailbox == ZO_V2))
-				cy_writel(addr2 + ID_ADDRESS, 0L);
-		}
-
-		retval = cyz_load_fw(pdev, addr2, addr0, irq);
-		if (retval <= 0)
-			goto err_unmap;
-		nchan = retval;
-	}
-
-	if ((cy_next_channel + nchan) > NR_PORTS) {
-		dev_err(&pdev->dev, "Cyclades-8Zo/PCI found, but no "
-			"channels are available. Change NR_PORTS in "
-			"cyclades.c and recompile kernel.\n");
-		goto err_unmap;
-	}
-	/* fill the next cy_card structure available */
-	for (card_no = 0; card_no < NR_CARDS; card_no++) {
-		card = &cy_card[card_no];
-		if (card->base_addr == NULL)
-			break;
-	}
-	if (card_no == NR_CARDS) {	/* no more cy_cards available */
-		dev_err(&pdev->dev, "Cyclades-8Zo/PCI found, but no "
-			"more cards can be used. Change NR_CARDS in "
-			"cyclades.c and recompile kernel.\n");
-		goto err_unmap;
-	}
-
-	if (device_id == PCI_DEVICE_ID_CYCLOM_Y_Lo ||
-			device_id == PCI_DEVICE_ID_CYCLOM_Y_Hi) {
-		/* allocate IRQ */
-		retval = request_irq(irq, cyy_interrupt,
-				IRQF_SHARED, "Cyclom-Y", card);
-		if (retval) {
-			dev_err(&pdev->dev, "could not allocate IRQ\n");
-			goto err_unmap;
-		}
-		card->num_chips = nchan / CyPORTS_PER_CHIP;
-	} else {
-		struct FIRM_ID __iomem *firm_id = addr2 + ID_ADDRESS;
-		struct ZFW_CTRL __iomem *zfw_ctrl;
-
-		zfw_ctrl = addr2 + (readl(&firm_id->zfwctrl_addr) & 0xfffff);
-
-		card->hw_ver = mailbox;
-		card->num_chips = (unsigned int)-1;
-		card->board_ctrl = &zfw_ctrl->board_ctrl;
-#ifdef CONFIG_CYZ_INTR
-		/* allocate IRQ only if board has an IRQ */
-		if (irq != 0 && irq != 255) {
-			retval = request_irq(irq, cyz_interrupt,
-					IRQF_SHARED, "Cyclades-Z", card);
-			if (retval) {
-				dev_err(&pdev->dev, "could not allocate IRQ\n");
-				goto err_unmap;
-			}
-		}
-#endif				/* CONFIG_CYZ_INTR */
-	}
-
-	/* set cy_card */
-	card->base_addr = addr2;
-	card->ctl_addr.p9050 = addr0;
-	card->irq = irq;
-	card->bus_index = 1;
-	card->first_line = cy_next_channel;
-	card->nports = nchan;
-	retval = cy_init_card(card);
-	if (retval)
-		goto err_null;
-
-	pci_set_drvdata(pdev, card);
-
-	if (device_id == PCI_DEVICE_ID_CYCLOM_Y_Lo ||
-			device_id == PCI_DEVICE_ID_CYCLOM_Y_Hi) {
-		/* enable interrupts in the PCI interface */
-		plx_ver = readb(addr2 + CyPLX_VER) & 0x0f;
-		switch (plx_ver) {
-		case PLX_9050:
-			cy_writeb(addr0 + 0x4c, 0x43);
-			break;
-
-		case PLX_9060:
-		case PLX_9080:
-		default:	/* Old boards, use PLX_9060 */
-		{
-			struct RUNTIME_9060 __iomem *ctl_addr = addr0;
-			plx_init(pdev, irq, ctl_addr);
-			cy_writew(&ctl_addr->intr_ctrl_stat,
-				readw(&ctl_addr->intr_ctrl_stat) | 0x0900);
-			break;
-		}
-		}
-	}
-
-	dev_info(&pdev->dev, "%s/PCI #%d found: %d channels starting from "
-		"port %d.\n", card_name, card_no + 1, nchan, cy_next_channel);
-	for (j = 0, i = cy_next_channel; i < cy_next_channel + nchan; i++, j++)
-		tty_port_register_device(&card->ports[j].port,
-				cy_serial_driver, i, &pdev->dev);
-	cy_next_channel += nchan;
-
-	return 0;
-err_null:
-	card->base_addr = NULL;
-	free_irq(irq, card);
-err_unmap:
-	iounmap(addr0);
-	if (addr2)
-		iounmap(addr2);
-err_reg:
-	pci_release_regions(pdev);
-err_dis:
-	pci_disable_device(pdev);
-err:
-	return retval;
-}
-
-static void cy_pci_remove(struct pci_dev *pdev)
-{
-	struct cyclades_card *cinfo = pci_get_drvdata(pdev);
-	unsigned int i, channel;
-
-	/* non-Z with old PLX */
-	if (!cy_is_Z(cinfo) && (readb(cinfo->base_addr + CyPLX_VER) & 0x0f) ==
-			PLX_9050)
-		cy_writeb(cinfo->ctl_addr.p9050 + 0x4c, 0);
-	else
-#ifndef CONFIG_CYZ_INTR
-		if (!cy_is_Z(cinfo))
-#endif
-		cy_writew(&cinfo->ctl_addr.p9060->intr_ctrl_stat,
-			readw(&cinfo->ctl_addr.p9060->intr_ctrl_stat) &
-			~0x0900);
-
-	iounmap(cinfo->base_addr);
-	if (cinfo->ctl_addr.p9050)
-		iounmap(cinfo->ctl_addr.p9050);
-	if (cinfo->irq
-#ifndef CONFIG_CYZ_INTR
-		&& !cy_is_Z(cinfo)
-#endif /* CONFIG_CYZ_INTR */
-		)
-		free_irq(cinfo->irq, cinfo);
-	pci_release_regions(pdev);
-
-	cinfo->base_addr = NULL;
-	for (channel = 0, i = cinfo->first_line; i < cinfo->first_line +
-			cinfo->nports; i++, channel++) {
-		tty_unregister_device(cy_serial_driver, i);
-		tty_port_destroy(&cinfo->ports[channel].port);
-	}
-	cinfo->nports = 0;
-	kfree(cinfo->ports);
-}
-
-static struct pci_driver cy_pci_driver = {
-	.name = "cyclades",
-	.id_table = cy_pci_dev_id,
-	.probe = cy_pci_probe,
-	.remove = cy_pci_remove
-};
-#endif
-
-static int cyclades_proc_show(struct seq_file *m, void *v)
-{
-	struct cyclades_port *info;
-	unsigned int i, j;
-	__u32 cur_jifs = jiffies;
-
-	seq_puts(m, "Dev TimeOpen   BytesOut  IdleOut    BytesIn   "
-			"IdleIn  Overruns  Ldisc\n");
-
-	/* Output one line for each known port */
-	for (i = 0; i < NR_CARDS; i++)
-		for (j = 0; j < cy_card[i].nports; j++) {
-			info = &cy_card[i].ports[j];
-
-			if (info->port.count) {
-				/* XXX is the ldisc num worth this? */
-				struct tty_struct *tty;
-				struct tty_ldisc *ld;
-				int num = 0;
-				tty = tty_port_tty_get(&info->port);
-				if (tty) {
-					ld = tty_ldisc_ref(tty);
-					if (ld) {
-						num = ld->ops->num;
-						tty_ldisc_deref(ld);
-					}
-					tty_kref_put(tty);
-				}
-				seq_printf(m, "%3d %8lu %10lu %8lu "
-					"%10lu %8lu %9lu %6d\n", info->line,
-					(cur_jifs - info->idle_stats.in_use) /
-					HZ, info->idle_stats.xmit_bytes,
-					(cur_jifs - info->idle_stats.xmit_idle)/
-					HZ, info->idle_stats.recv_bytes,
-					(cur_jifs - info->idle_stats.recv_idle)/
-					HZ, info->idle_stats.overruns,
-					num);
-			} else
-				seq_printf(m, "%3d %8lu %10lu %8lu "
-					"%10lu %8lu %9lu %6ld\n",
-					info->line, 0L, 0L, 0L, 0L, 0L, 0L, 0L);
-		}
-	return 0;
-}
-
-/* The serial driver boot-time initialization code!
-    Hardware I/O ports are mapped to character special devices on a
-    first found, first allocated manner.  That is, this code searches
-    for Cyclom cards in the system.  As each is found, it is probed
-    to discover how many chips (and thus how many ports) are present.
-    These ports are mapped to the tty ports 32 and upward in monotonic
-    fashion.  If an 8-port card is replaced with a 16-port card, the
-    port mapping on a following card will shift.
-
-    This approach is different from what is used in the other serial
-    device driver because the Cyclom is more properly a multiplexer,
-    not just an aggregation of serial ports on one card.
-
-    If there are more cards with more ports than have been
-    statically allocated above, a warning is printed and the
-    extra ports are ignored.
- */
-
-static const struct tty_operations cy_ops = {
-	.open = cy_open,
-	.close = cy_close,
-	.write = cy_write,
-	.put_char = cy_put_char,
-	.flush_chars = cy_flush_chars,
-	.write_room = cy_write_room,
-	.chars_in_buffer = cy_chars_in_buffer,
-	.flush_buffer = cy_flush_buffer,
-	.ioctl = cy_ioctl,
-	.throttle = cy_throttle,
-	.unthrottle = cy_unthrottle,
-	.set_termios = cy_set_termios,
-	.stop = cy_stop,
-	.start = cy_start,
-	.hangup = cy_hangup,
-	.break_ctl = cy_break,
-	.wait_until_sent = cy_wait_until_sent,
-	.tiocmget = cy_tiocmget,
-	.tiocmset = cy_tiocmset,
-	.get_icount = cy_get_icount,
-	.set_serial = cy_set_serial_info,
-	.get_serial = cy_get_serial_info,
-	.proc_show = cyclades_proc_show,
-};
-
-static int __init cy_init(void)
-{
-	unsigned int nboards;
-	int retval = -ENOMEM;
-
-	cy_serial_driver = alloc_tty_driver(NR_PORTS);
-	if (!cy_serial_driver)
-		goto err;
-
-	printk(KERN_INFO "Cyclades driver " CY_VERSION "\n");
-
-	/* Initialize the tty_driver structure */
-
-	cy_serial_driver->driver_name = "cyclades";
-	cy_serial_driver->name = "ttyC";
-	cy_serial_driver->major = CYCLADES_MAJOR;
-	cy_serial_driver->minor_start = 0;
-	cy_serial_driver->type = TTY_DRIVER_TYPE_SERIAL;
-	cy_serial_driver->subtype = SERIAL_TYPE_NORMAL;
-	cy_serial_driver->init_termios = tty_std_termios;
-	cy_serial_driver->init_termios.c_cflag =
-	    B9600 | CS8 | CREAD | HUPCL | CLOCAL;
-	cy_serial_driver->flags = TTY_DRIVER_REAL_RAW | TTY_DRIVER_DYNAMIC_DEV;
-	tty_set_operations(cy_serial_driver, &cy_ops);
-
-	retval = tty_register_driver(cy_serial_driver);
-	if (retval) {
-		printk(KERN_ERR "Couldn't register Cyclades serial driver\n");
-		goto err_frtty;
-	}
-
-	/* the code below is responsible to find the boards. Each different
-	   type of board has its own detection routine. If a board is found,
-	   the next cy_card structure available is set by the detection
-	   routine. These functions are responsible for checking the
-	   availability of cy_card and cy_port data structures and updating
-	   the cy_next_channel. */
-
-	/* look for isa boards */
-	nboards = cy_detect_isa();
-
-#ifdef CONFIG_PCI
-	/* look for pci boards */
-	retval = pci_register_driver(&cy_pci_driver);
-	if (retval && !nboards) {
-		tty_unregister_driver(cy_serial_driver);
-		goto err_frtty;
-	}
-#endif
-
-	return 0;
-err_frtty:
-	put_tty_driver(cy_serial_driver);
-err:
-	return retval;
-}				/* cy_init */
-
-static void __exit cy_cleanup_module(void)
-{
-	struct cyclades_card *card;
-	unsigned int i, e1;
-
-#ifndef CONFIG_CYZ_INTR
-	del_timer_sync(&cyz_timerlist);
-#endif /* CONFIG_CYZ_INTR */
-
-	e1 = tty_unregister_driver(cy_serial_driver);
-	if (e1)
-		printk(KERN_ERR "failed to unregister Cyclades serial "
-				"driver(%d)\n", e1);
-
-#ifdef CONFIG_PCI
-	pci_unregister_driver(&cy_pci_driver);
-#endif
-
-	for (i = 0; i < NR_CARDS; i++) {
-		card = &cy_card[i];
-		if (card->base_addr) {
-			/* clear interrupt */
-			cy_writeb(card->base_addr + Cy_ClrIntr, 0);
-			iounmap(card->base_addr);
-			if (card->ctl_addr.p9050)
-				iounmap(card->ctl_addr.p9050);
-			if (card->irq
-#ifndef CONFIG_CYZ_INTR
-				&& !cy_is_Z(card)
-#endif /* CONFIG_CYZ_INTR */
-				)
-				free_irq(card->irq, card);
-			for (e1 = card->first_line; e1 < card->first_line +
-					card->nports; e1++)
-				tty_unregister_device(cy_serial_driver, e1);
-			kfree(card->ports);
-		}
-	}
-
-	put_tty_driver(cy_serial_driver);
-} /* cy_cleanup_module */
-
-module_init(cy_init);
-module_exit(cy_cleanup_module);
-
-MODULE_LICENSE("GPL");
-MODULE_VERSION(CY_VERSION);
-MODULE_ALIAS_CHARDEV_MAJOR(CYCLADES_MAJOR);
-MODULE_FIRMWARE("cyzfirm.bin");
diff --git a/drivers/tty/serial/8250/Kconfig b/drivers/tty/serial/8250/Kconfig
index 603137da4736..7ec05fdb1fc3 100644
--- a/drivers/tty/serial/8250/Kconfig
+++ b/drivers/tty/serial/8250/Kconfig
@@ -15,8 +15,7 @@ config SERIAL_8250
 	  here are those that are setting up dedicated Ethernet WWW/FTP
 	  servers, or users that have one of the various bus mice instead of a
 	  serial mouse and don't intend to use their machine's standard serial
-	  port for anything.  (Note that the Cyclades multi serial port driver
-	  does not need this driver built in for it to work.)
+	  port for anything.
 
 	  To compile this driver as a module, choose M here: the
 	  module will be called 8250.
@@ -226,7 +225,7 @@ config SERIAL_8250_MANY_PORTS
 	  serial port hardware which acts similar to standard serial port
 	  hardware. If you only use the standard COM 1/2/3/4 ports, you can
 	  say N here to save some memory. You can also say Y if you have an
-	  "intelligent" multiport card such as Cyclades, Digiboards, etc.
+	  "intelligent" multiport card such as Digiboards, etc.
 
 #
 # Multi-port serial cards
diff --git a/include/linux/cyclades.h b/include/linux/cyclades.h
deleted file mode 100644
index 05ee0f19448a..000000000000
--- a/include/linux/cyclades.h
+++ /dev/null
@@ -1,364 +0,0 @@
-/* SPDX-License-Identifier: GPL-2.0 */
-/* $Revision: 3.0 $$Date: 1998/11/02 14:20:59 $
- * linux/include/linux/cyclades.h
- *
- * This file was initially written by
- * Randolph Bentson <bentson@grieg.seaslug.org> and is maintained by
- * Ivan Passos <ivan@cyclades.com>.
- *
- * This file contains the general definitions for the cyclades.c driver
- *$Log: cyclades.h,v $
- *Revision 3.1  2002/01/29 11:36:16  henrique
- *added throttle field on struct cyclades_port to indicate whether the
- *port is throttled or not
- *
- *Revision 3.1  2000/04/19 18:52:52  ivan
- *converted address fields to unsigned long and added fields for physical
- *addresses on cyclades_card structure;
- *
- *Revision 3.0  1998/11/02 14:20:59  ivan
- *added nports field on cyclades_card structure;
- *
- *Revision 2.5  1998/08/03 16:57:01  ivan
- *added cyclades_idle_stats structure;
- * 
- *Revision 2.4  1998/06/01 12:09:53  ivan
- *removed closing_wait2 from cyclades_port structure;
- *
- *Revision 2.3  1998/03/16 18:01:12  ivan
- *changes in the cyclades_port structure to get it closer to the 
- *standard serial port structure;
- *added constants for new ioctls;
- *
- *Revision 2.2  1998/02/17 16:50:00  ivan
- *changes in the cyclades_port structure (addition of shutdown_wait and 
- *chip_rev variables);
- *added constants for new ioctls and for CD1400 rev. numbers.
- *
- *Revision 2.1	1997/10/24 16:03:00  ivan
- *added rflow (which allows enabling the CD1400 special flow control 
- *feature) and rtsdtr_inv (which allows DTR/RTS pin inversion) to 
- *cyclades_port structure;
- *added Alpha support
- *
- *Revision 2.0  1997/06/30 10:30:00  ivan
- *added some new doorbell command constants related to IOCTLW and
- *UART error signaling
- *
- *Revision 1.8  1997/06/03 15:30:00  ivan
- *added constant ZFIRM_HLT
- *added constant CyPCI_Ze_win ( = 2 * Cy_PCI_Zwin)
- *
- *Revision 1.7  1997/03/26 10:30:00  daniel
- *new entries at the end of cyclades_port struct to reallocate
- *variables illegally allocated within card memory.
- *
- *Revision 1.6  1996/09/09 18:35:30  bentson
- *fold in changes for Cyclom-Z -- including structures for
- *communicating with board as well modest changes to original
- *structures to support new features.
- *
- *Revision 1.5  1995/11/13 21:13:31  bentson
- *changes suggested by Michael Chastain <mec@duracef.shout.net>
- *to support use of this file in non-kernel applications
- *
- *
- */
-#ifndef _LINUX_CYCLADES_H
-#define _LINUX_CYCLADES_H
-
-#include <uapi/linux/cyclades.h>
-
-
-/* Per card data structure */
-struct cyclades_card {
-	void __iomem *base_addr;
-	union {
-		void __iomem *p9050;
-		struct RUNTIME_9060 __iomem *p9060;
-	} ctl_addr;
-	struct BOARD_CTRL __iomem *board_ctrl;	/* cyz specific */
-	int irq;
-	unsigned int num_chips;	/* 0 if card absent, -1 if Z/PCI, else Y */
-	unsigned int first_line;	/* minor number of first channel on card */
-	unsigned int nports;	/* Number of ports in the card */
-	int bus_index;		/* address shift - 0 for ISA, 1 for PCI */
-	int intr_enabled;		/* FW Interrupt flag - 0 disabled, 1 enabled */
-	u32 hw_ver;
-	spinlock_t card_lock;
-	struct cyclades_port *ports;
-};
-
-/***************************************
- * Memory access functions/macros      *
- * (required to support Alpha systems) *
- ***************************************/
-
-#define cy_writeb(port,val)     do { writeb((val), (port)); mb(); } while (0)
-#define cy_writew(port,val)     do { writew((val), (port)); mb(); } while (0)
-#define cy_writel(port,val)     do { writel((val), (port)); mb(); } while (0)
-
-/*
- * Statistics counters
- */
-struct cyclades_icount {
-	__u32	cts, dsr, rng, dcd, tx, rx;
-	__u32	frame, parity, overrun, brk;
-	__u32	buf_overrun;
-};
-
-/*
- * This is our internal structure for each serial port's state.
- * 
- * Many fields are paralleled by the structure used by the serial_struct
- * structure.
- *
- * For definitions of the flags field, see tty.h
- */
-
-struct cyclades_port {
-	int                     magic;
-	struct tty_port		port;
-	struct cyclades_card	*card;
-	union {
-		struct {
-			void __iomem *base_addr;
-		} cyy;
-		struct {
-			struct CH_CTRL __iomem	*ch_ctrl;
-			struct BUF_CTRL __iomem	*buf_ctrl;
-		} cyz;
-	} u;
-	int			line;
-	int			flags; 		/* defined in tty.h */
-	int                     type;		/* UART type */
-	int			read_status_mask;
-	int			ignore_status_mask;
-	int			timeout;
-	int			xmit_fifo_size;
-	int                     cor1,cor2,cor3,cor4,cor5;
-	int                     tbpr,tco,rbpr,rco;
-	int			baud;
-	int			rflow;
-	int			rtsdtr_inv;
-	int			chip_rev;
-	int			custom_divisor;
-	u8			x_char; /* to be pushed out ASAP */
-	int                     breakon;
-	int                     breakoff;
-	int			xmit_head;
-	int			xmit_tail;
-	int			xmit_cnt;
-        int                     default_threshold;
-        int                     default_timeout;
-	unsigned long		rflush_count;
-	struct cyclades_monitor	mon;
-	struct cyclades_idle_stats	idle_stats;
-	struct cyclades_icount	icount;
-	struct completion       shutdown_wait;
-	int throttle;
-#ifdef CONFIG_CYZ_INTR
-	struct timer_list	rx_full_timer;
-#endif
-};
-
-#define	CLOSING_WAIT_DELAY	30*HZ
-#define CY_CLOSING_WAIT_NONE	ASYNC_CLOSING_WAIT_NONE
-#define CY_CLOSING_WAIT_INF	ASYNC_CLOSING_WAIT_INF
-
-
-#define CyMAX_CHIPS_PER_CARD	8
-#define CyMAX_CHAR_FIFO		12
-#define CyPORTS_PER_CHIP	4
-#define	CD1400_MAX_SPEED	115200
-
-#define	CyISA_Ywin	0x2000
-
-#define CyPCI_Ywin 	0x4000
-#define CyPCI_Yctl 	0x80
-#define CyPCI_Zctl 	CTRL_WINDOW_SIZE
-#define CyPCI_Zwin 	0x80000
-#define CyPCI_Ze_win 	(2 * CyPCI_Zwin)
-
-#define PCI_DEVICE_ID_MASK	0x06
-
-/**** CD1400 registers ****/
-
-#define CD1400_REV_G	0x46
-#define CD1400_REV_J	0x48
-
-#define CyRegSize  	0x0400
-#define Cy_HwReset 	0x1400
-#define Cy_ClrIntr 	0x1800
-#define Cy_EpldRev 	0x1e00
-
-/* Global Registers */
-
-#define CyGFRCR		(0x40*2)
-#define      CyRevE		(44)
-#define CyCAR		(0x68*2)
-#define      CyCHAN_0		(0x00)
-#define      CyCHAN_1		(0x01)
-#define      CyCHAN_2		(0x02)
-#define      CyCHAN_3		(0x03)
-#define CyGCR		(0x4B*2)
-#define      CyCH0_SERIAL	(0x00)
-#define      CyCH0_PARALLEL	(0x80)
-#define CySVRR		(0x67*2)
-#define      CySRModem		(0x04)
-#define      CySRTransmit	(0x02)
-#define      CySRReceive	(0x01)
-#define CyRICR		(0x44*2)
-#define CyTICR		(0x45*2)
-#define CyMICR		(0x46*2)
-#define      CyICR0		(0x00)
-#define      CyICR1		(0x01)
-#define      CyICR2		(0x02)
-#define      CyICR3		(0x03)
-#define CyRIR		(0x6B*2)
-#define CyTIR		(0x6A*2)
-#define CyMIR		(0x69*2)
-#define      CyIRDirEq		(0x80)
-#define      CyIRBusy		(0x40)
-#define      CyIRUnfair		(0x20)
-#define      CyIRContext	(0x1C)
-#define      CyIRChannel	(0x03)
-#define CyPPR 		(0x7E*2)
-#define      CyCLOCK_20_1MS	(0x27)
-#define      CyCLOCK_25_1MS	(0x31)
-#define      CyCLOCK_25_5MS	(0xf4)
-#define      CyCLOCK_60_1MS	(0x75)
-#define      CyCLOCK_60_2MS	(0xea)
-
-/* Virtual Registers */
-
-#define CyRIVR		(0x43*2)
-#define CyTIVR		(0x42*2)
-#define CyMIVR		(0x41*2)
-#define      CyIVRMask (0x07)
-#define      CyIVRRxEx (0x07)
-#define      CyIVRRxOK (0x03)
-#define      CyIVRTxOK (0x02)
-#define      CyIVRMdmOK (0x01)
-#define CyTDR		(0x63*2)
-#define CyRDSR		(0x62*2)
-#define      CyTIMEOUT		(0x80)
-#define      CySPECHAR		(0x70)
-#define      CyBREAK		(0x08)
-#define      CyPARITY		(0x04)
-#define      CyFRAME		(0x02)
-#define      CyOVERRUN		(0x01)
-#define CyMISR		(0x4C*2)
-/* see CyMCOR_ and CyMSVR_ for bits*/
-#define CyEOSRR		(0x60*2)
-
-/* Channel Registers */
-
-#define CyLIVR		(0x18*2)
-#define      CyMscsr		(0x01)
-#define      CyTdsr		(0x02)
-#define      CyRgdsr		(0x03)
-#define      CyRedsr		(0x07)
-#define CyCCR		(0x05*2)
-/* Format 1 */
-#define      CyCHAN_RESET	(0x80)
-#define      CyCHIP_RESET	(0x81)
-#define      CyFlushTransFIFO	(0x82)
-/* Format 2 */
-#define      CyCOR_CHANGE	(0x40)
-#define      CyCOR1ch		(0x02)
-#define      CyCOR2ch		(0x04)
-#define      CyCOR3ch		(0x08)
-/* Format 3 */
-#define      CySEND_SPEC_1	(0x21)
-#define      CySEND_SPEC_2	(0x22)
-#define      CySEND_SPEC_3	(0x23)
-#define      CySEND_SPEC_4	(0x24)
-/* Format 4 */
-#define      CyCHAN_CTL		(0x10)
-#define      CyDIS_RCVR		(0x01)
-#define      CyENB_RCVR		(0x02)
-#define      CyDIS_XMTR		(0x04)
-#define      CyENB_XMTR		(0x08)
-#define CySRER		(0x06*2)
-#define      CyMdmCh		(0x80)
-#define      CyRxData		(0x10)
-#define      CyTxRdy		(0x04)
-#define      CyTxMpty		(0x02)
-#define      CyNNDT		(0x01)
-#define CyCOR1		(0x08*2)
-#define      CyPARITY_NONE	(0x00)
-#define      CyPARITY_0		(0x20)
-#define      CyPARITY_1		(0xA0)
-#define      CyPARITY_E		(0x40)
-#define      CyPARITY_O		(0xC0)
-#define      Cy_1_STOP		(0x00)
-#define      Cy_1_5_STOP	(0x04)
-#define      Cy_2_STOP		(0x08)
-#define      Cy_5_BITS		(0x00)
-#define      Cy_6_BITS		(0x01)
-#define      Cy_7_BITS		(0x02)
-#define      Cy_8_BITS		(0x03)
-#define CyCOR2		(0x09*2)
-#define      CyIXM		(0x80)
-#define      CyTxIBE		(0x40)
-#define      CyETC		(0x20)
-#define      CyAUTO_TXFL	(0x60)
-#define      CyLLM		(0x10)
-#define      CyRLM		(0x08)
-#define      CyRtsAO		(0x04)
-#define      CyCtsAE		(0x02)
-#define      CyDsrAE		(0x01)
-#define CyCOR3		(0x0A*2)
-#define      CySPL_CH_DRANGE	(0x80)  /* special character detect range */
-#define      CySPL_CH_DET1	(0x40)  /* enable special character detection
-                                                               on SCHR4-SCHR3 */
-#define      CyFL_CTRL_TRNSP	(0x20)  /* Flow Control Transparency */
-#define      CySPL_CH_DET2	(0x10)  /* Enable special character detection
-                                                               on SCHR2-SCHR1 */
-#define      CyREC_FIFO		(0x0F)  /* Receive FIFO threshold */
-#define CyCOR4		(0x1E*2)
-#define CyCOR5		(0x1F*2)
-#define CyCCSR		(0x0B*2)
-#define      CyRxEN		(0x80)
-#define      CyRxFloff		(0x40)
-#define      CyRxFlon		(0x20)
-#define      CyTxEN		(0x08)
-#define      CyTxFloff		(0x04)
-#define      CyTxFlon		(0x02)
-#define CyRDCR		(0x0E*2)
-#define CySCHR1		(0x1A*2)
-#define CySCHR2 	(0x1B*2)
-#define CySCHR3		(0x1C*2)
-#define CySCHR4		(0x1D*2)
-#define CySCRL		(0x22*2)
-#define CySCRH		(0x23*2)
-#define CyLNC		(0x24*2)
-#define CyMCOR1 	(0x15*2)
-#define CyMCOR2		(0x16*2)
-#define CyRTPR		(0x21*2)
-#define CyMSVR1		(0x6C*2)
-#define CyMSVR2		(0x6D*2)
-#define      CyANY_DELTA	(0xF0)
-#define      CyDSR		(0x80)
-#define      CyCTS		(0x40)
-#define      CyRI		(0x20)
-#define      CyDCD		(0x10)
-#define      CyDTR              (0x02)
-#define      CyRTS              (0x01)
-#define CyPVSR		(0x6F*2)
-#define CyRBPR		(0x78*2)
-#define CyRCOR		(0x7C*2)
-#define CyTBPR		(0x72*2)
-#define CyTCOR		(0x76*2)
-
-/* Custom Registers */
-
-#define	CyPLX_VER	(0x3400)
-#define	PLX_9050	0x0b
-#define	PLX_9060	0x0c
-#define	PLX_9080	0x0d
-
-/***************************************************************************/
-
-#endif /* _LINUX_CYCLADES_H */
diff --git a/include/linux/pci_ids.h b/include/linux/pci_ids.h
index 8a18517696c1..056d2074f07a 100644
--- a/include/linux/pci_ids.h
+++ b/include/linux/pci_ids.h
@@ -1711,14 +1711,6 @@
 #define PCI_DEVICE_ID_CRP16INTF		0x0903
 
 #define PCI_VENDOR_ID_CYCLADES		0x120e
-#define PCI_DEVICE_ID_CYCLOM_Y_Lo	0x0100
-#define PCI_DEVICE_ID_CYCLOM_Y_Hi	0x0101
-#define PCI_DEVICE_ID_CYCLOM_4Y_Lo	0x0102
-#define PCI_DEVICE_ID_CYCLOM_4Y_Hi	0x0103
-#define PCI_DEVICE_ID_CYCLOM_8Y_Lo	0x0104
-#define PCI_DEVICE_ID_CYCLOM_8Y_Hi	0x0105
-#define PCI_DEVICE_ID_CYCLOM_Z_Lo	0x0200
-#define PCI_DEVICE_ID_CYCLOM_Z_Hi	0x0201
 #define PCI_DEVICE_ID_PC300_RX_2	0x0300
 #define PCI_DEVICE_ID_PC300_RX_1	0x0301
 #define PCI_DEVICE_ID_PC300_TE_2	0x0310
diff --git a/include/uapi/linux/cyclades.h b/include/uapi/linux/cyclades.h
deleted file mode 100644
index fc0add2194a9..000000000000
--- a/include/uapi/linux/cyclades.h
+++ /dev/null
@@ -1,494 +0,0 @@
-/* SPDX-License-Identifier: GPL-2.0 WITH Linux-syscall-note */
-/* $Revision: 3.0 $$Date: 1998/11/02 14:20:59 $
- * linux/include/linux/cyclades.h
- *
- * This file was initially written by
- * Randolph Bentson <bentson@grieg.seaslug.org> and is maintained by
- * Ivan Passos <ivan@cyclades.com>.
- *
- * This file contains the general definitions for the cyclades.c driver
- *$Log: cyclades.h,v $
- *Revision 3.1  2002/01/29 11:36:16  henrique
- *added throttle field on struct cyclades_port to indicate whether the
- *port is throttled or not
- *
- *Revision 3.1  2000/04/19 18:52:52  ivan
- *converted address fields to unsigned long and added fields for physical
- *addresses on cyclades_card structure;
- *
- *Revision 3.0  1998/11/02 14:20:59  ivan
- *added nports field on cyclades_card structure;
- *
- *Revision 2.5  1998/08/03 16:57:01  ivan
- *added cyclades_idle_stats structure;
- * 
- *Revision 2.4  1998/06/01 12:09:53  ivan
- *removed closing_wait2 from cyclades_port structure;
- *
- *Revision 2.3  1998/03/16 18:01:12  ivan
- *changes in the cyclades_port structure to get it closer to the 
- *standard serial port structure;
- *added constants for new ioctls;
- *
- *Revision 2.2  1998/02/17 16:50:00  ivan
- *changes in the cyclades_port structure (addition of shutdown_wait and 
- *chip_rev variables);
- *added constants for new ioctls and for CD1400 rev. numbers.
- *
- *Revision 2.1	1997/10/24 16:03:00  ivan
- *added rflow (which allows enabling the CD1400 special flow control 
- *feature) and rtsdtr_inv (which allows DTR/RTS pin inversion) to 
- *cyclades_port structure;
- *added Alpha support
- *
- *Revision 2.0  1997/06/30 10:30:00  ivan
- *added some new doorbell command constants related to IOCTLW and
- *UART error signaling
- *
- *Revision 1.8  1997/06/03 15:30:00  ivan
- *added constant ZFIRM_HLT
- *added constant CyPCI_Ze_win ( = 2 * Cy_PCI_Zwin)
- *
- *Revision 1.7  1997/03/26 10:30:00  daniel
- *new entries at the end of cyclades_port struct to reallocate
- *variables illegally allocated within card memory.
- *
- *Revision 1.6  1996/09/09 18:35:30  bentson
- *fold in changes for Cyclom-Z -- including structures for
- *communicating with board as well modest changes to original
- *structures to support new features.
- *
- *Revision 1.5  1995/11/13 21:13:31  bentson
- *changes suggested by Michael Chastain <mec@duracef.shout.net>
- *to support use of this file in non-kernel applications
- *
- *
- */
-
-#ifndef _UAPI_LINUX_CYCLADES_H
-#define _UAPI_LINUX_CYCLADES_H
-
-#include <linux/types.h>
-
-struct cyclades_monitor {
-        unsigned long           int_count;
-        unsigned long           char_count;
-        unsigned long           char_max;
-        unsigned long           char_last;
-};
-
-/*
- * These stats all reflect activity since the device was last initialized.
- * (i.e., since the port was opened with no other processes already having it
- * open)
- */
-struct cyclades_idle_stats {
-    __kernel_old_time_t in_use;	/* Time device has been in use (secs) */
-    __kernel_old_time_t recv_idle; /* Time since last char received (secs) */
-    __kernel_old_time_t xmit_idle; /* Time since last char transmitted (secs) */
-    unsigned long  recv_bytes;	/* Bytes received */
-    unsigned long  xmit_bytes;	/* Bytes transmitted */
-    unsigned long  overruns;	/* Input overruns */
-    unsigned long  frame_errs;	/* Input framing errors */
-    unsigned long  parity_errs;	/* Input parity errors */
-};
-
-#define CYCLADES_MAGIC  0x4359
-
-#define CYGETMON                0x435901
-#define CYGETTHRESH             0x435902
-#define CYSETTHRESH             0x435903
-#define CYGETDEFTHRESH          0x435904
-#define CYSETDEFTHRESH          0x435905
-#define CYGETTIMEOUT            0x435906
-#define CYSETTIMEOUT            0x435907
-#define CYGETDEFTIMEOUT         0x435908
-#define CYSETDEFTIMEOUT         0x435909
-#define CYSETRFLOW		0x43590a
-#define CYGETRFLOW		0x43590b
-#define CYSETRTSDTR_INV		0x43590c
-#define CYGETRTSDTR_INV		0x43590d
-#define CYZSETPOLLCYCLE		0x43590e
-#define CYZGETPOLLCYCLE		0x43590f
-#define CYGETCD1400VER		0x435910
-#define	CYSETWAIT		0x435912
-#define	CYGETWAIT		0x435913
-
-/*************** CYCLOM-Z ADDITIONS ***************/
-
-#define CZIOC           ('M' << 8)
-#define CZ_NBOARDS      (CZIOC|0xfa)
-#define CZ_BOOT_START   (CZIOC|0xfb)
-#define CZ_BOOT_DATA    (CZIOC|0xfc)
-#define CZ_BOOT_END     (CZIOC|0xfd)
-#define CZ_TEST         (CZIOC|0xfe)
-
-#define CZ_DEF_POLL	(HZ/25)
-
-#define MAX_BOARD       4       /* Max number of boards */
-#define MAX_DEV         256     /* Max number of ports total */
-#define	CYZ_MAX_SPEED	921600
-
-#define	CYZ_FIFO_SIZE	16
-
-#define CYZ_BOOT_NWORDS 0x100
-struct CYZ_BOOT_CTRL {
-        unsigned short  nboard;
-        int             status[MAX_BOARD];
-        int             nchannel[MAX_BOARD];
-        int             fw_rev[MAX_BOARD];
-        unsigned long   offset;
-        unsigned long   data[CYZ_BOOT_NWORDS];
-};
-
-
-#ifndef DP_WINDOW_SIZE
-/*
- *	Memory Window Sizes
- */
-
-#define	DP_WINDOW_SIZE		(0x00080000)	/* window size 512 Kb */
-#define	ZE_DP_WINDOW_SIZE	(0x00100000)	/* window size 1 Mb (Ze and
-						  8Zo V.2 */
-#define	CTRL_WINDOW_SIZE	(0x00000080)	/* runtime regs 128 bytes */
-
-/*
- *	CUSTOM_REG - Cyclom-Z/PCI Custom Registers Set. The driver
- *	normally will access only interested on the fpga_id, fpga_version,
- *	start_cpu and stop_cpu.
- */
-
-struct	CUSTOM_REG {
-	__u32	fpga_id;		/* FPGA Identification Register */
-	__u32	fpga_version;		/* FPGA Version Number Register */
-	__u32	cpu_start;		/* CPU start Register (write) */
-	__u32	cpu_stop;		/* CPU stop Register (write) */
-	__u32	misc_reg;		/* Miscellaneous Register */
-	__u32	idt_mode;		/* IDT mode Register */
-	__u32	uart_irq_status;	/* UART IRQ status Register */
-	__u32	clear_timer0_irq;	/* Clear timer interrupt Register */
-	__u32	clear_timer1_irq;	/* Clear timer interrupt Register */
-	__u32	clear_timer2_irq;	/* Clear timer interrupt Register */
-	__u32	test_register;		/* Test Register */
-	__u32	test_count;		/* Test Count Register */
-	__u32	timer_select;		/* Timer select register */
-	__u32	pr_uart_irq_status;	/* Prioritized UART IRQ stat Reg */
-	__u32	ram_wait_state;		/* RAM wait-state Register */
-	__u32	uart_wait_state;	/* UART wait-state Register */
-	__u32	timer_wait_state;	/* timer wait-state Register */
-	__u32	ack_wait_state;		/* ACK wait State Register */
-};
-
-/*
- *	RUNTIME_9060 - PLX PCI9060ES local configuration and shared runtime
- *	registers. This structure can be used to access the 9060 registers
- *	(memory mapped).
- */
-
-struct RUNTIME_9060 {
-	__u32	loc_addr_range;	/* 00h - Local Address Range */
-	__u32	loc_addr_base;	/* 04h - Local Address Base */
-	__u32	loc_arbitr;	/* 08h - Local Arbitration */
-	__u32	endian_descr;	/* 0Ch - Big/Little Endian Descriptor */
-	__u32	loc_rom_range;	/* 10h - Local ROM Range */
-	__u32	loc_rom_base;	/* 14h - Local ROM Base */
-	__u32	loc_bus_descr;	/* 18h - Local Bus descriptor */
-	__u32	loc_range_mst;	/* 1Ch - Local Range for Master to PCI */
-	__u32	loc_base_mst;	/* 20h - Local Base for Master PCI */
-	__u32	loc_range_io;	/* 24h - Local Range for Master IO */
-	__u32	pci_base_mst;	/* 28h - PCI Base for Master PCI */
-	__u32	pci_conf_io;	/* 2Ch - PCI configuration for Master IO */
-	__u32	filler1;	/* 30h */
-	__u32	filler2;	/* 34h */
-	__u32	filler3;	/* 38h */
-	__u32	filler4;	/* 3Ch */
-	__u32	mail_box_0;	/* 40h - Mail Box 0 */
-	__u32	mail_box_1;	/* 44h - Mail Box 1 */
-	__u32	mail_box_2;	/* 48h - Mail Box 2 */
-	__u32	mail_box_3;	/* 4Ch - Mail Box 3 */
-	__u32	filler5;	/* 50h */
-	__u32	filler6;	/* 54h */
-	__u32	filler7;	/* 58h */
-	__u32	filler8;	/* 5Ch */
-	__u32	pci_doorbell;	/* 60h - PCI to Local Doorbell */
-	__u32	loc_doorbell;	/* 64h - Local to PCI Doorbell */
-	__u32	intr_ctrl_stat;	/* 68h - Interrupt Control/Status */
-	__u32	init_ctrl;	/* 6Ch - EEPROM control, Init Control, etc */
-};
-
-/* Values for the Local Base Address re-map register */
-
-#define	WIN_RAM		0x00000001L	/* set the sliding window to RAM */
-#define	WIN_CREG	0x14000001L	/* set the window to custom Registers */
-
-/* Values timer select registers */
-
-#define	TIMER_BY_1M	0x00		/* clock divided by 1M */
-#define	TIMER_BY_256K	0x01		/* clock divided by 256k */
-#define	TIMER_BY_128K	0x02		/* clock divided by 128k */
-#define	TIMER_BY_32K	0x03		/* clock divided by 32k */
-
-/****************** ****************** *******************/
-#endif
-
-#ifndef ZFIRM_ID
-/* #include "zfwint.h" */
-/****************** ****************** *******************/
-/*
- *	This file contains the definitions for interfacing with the
- *	Cyclom-Z ZFIRM Firmware.
- */
-
-/* General Constant definitions */
-
-#define	MAX_CHAN	64		/* max number of channels per board */
-
-/* firmware id structure (set after boot) */
-
-#define ID_ADDRESS	0x00000180L	/* signature/pointer address */
-#define	ZFIRM_ID	0x5557465AL	/* ZFIRM/U signature */
-#define	ZFIRM_HLT	0x59505B5CL	/* ZFIRM needs external power supply */
-#define	ZFIRM_RST	0x56040674L	/* RST signal (due to FW reset) */
-
-#define	ZF_TINACT_DEF	1000		/* default inactivity timeout 
-					   (1000 ms) */
-#define	ZF_TINACT	ZF_TINACT_DEF
-
-struct	FIRM_ID {
-	__u32	signature;		/* ZFIRM/U signature */
-	__u32	zfwctrl_addr;		/* pointer to ZFW_CTRL structure */
-};
-
-/* Op. System id */
-
-#define	C_OS_LINUX	0x00000030	/* generic Linux system */
-
-/* channel op_mode */
-
-#define	C_CH_DISABLE	0x00000000	/* channel is disabled */
-#define	C_CH_TXENABLE	0x00000001	/* channel Tx enabled */
-#define	C_CH_RXENABLE	0x00000002	/* channel Rx enabled */
-#define	C_CH_ENABLE	0x00000003	/* channel Tx/Rx enabled */
-#define	C_CH_LOOPBACK	0x00000004	/* Loopback mode */
-
-/* comm_parity - parity */
-
-#define	C_PR_NONE	0x00000000	/* None */
-#define	C_PR_ODD	0x00000001	/* Odd */
-#define C_PR_EVEN	0x00000002	/* Even */
-#define C_PR_MARK	0x00000004	/* Mark */
-#define C_PR_SPACE	0x00000008	/* Space */
-#define C_PR_PARITY	0x000000ff
-
-#define	C_PR_DISCARD	0x00000100	/* discard char with frame/par error */
-#define C_PR_IGNORE	0x00000200	/* ignore frame/par error */
-
-/* comm_data_l - data length and stop bits */
-
-#define C_DL_CS5	0x00000001
-#define C_DL_CS6	0x00000002
-#define C_DL_CS7	0x00000004
-#define C_DL_CS8	0x00000008
-#define	C_DL_CS		0x0000000f
-#define C_DL_1STOP	0x00000010
-#define C_DL_15STOP	0x00000020
-#define C_DL_2STOP	0x00000040
-#define	C_DL_STOP	0x000000f0
-
-/* interrupt enabling/status */
-
-#define	C_IN_DISABLE	0x00000000	/* zero, disable interrupts */
-#define	C_IN_TXBEMPTY	0x00000001	/* tx buffer empty */
-#define	C_IN_TXLOWWM	0x00000002	/* tx buffer below LWM */
-#define	C_IN_RXHIWM	0x00000010	/* rx buffer above HWM */
-#define	C_IN_RXNNDT	0x00000020	/* rx no new data timeout */
-#define	C_IN_MDCD	0x00000100	/* modem DCD change */
-#define	C_IN_MDSR	0x00000200	/* modem DSR change */
-#define	C_IN_MRI	0x00000400	/* modem RI change */
-#define	C_IN_MCTS	0x00000800	/* modem CTS change */
-#define	C_IN_RXBRK	0x00001000	/* Break received */
-#define	C_IN_PR_ERROR	0x00002000	/* parity error */
-#define	C_IN_FR_ERROR	0x00004000	/* frame error */
-#define C_IN_OVR_ERROR  0x00008000      /* overrun error */
-#define C_IN_RXOFL	0x00010000      /* RX buffer overflow */
-#define C_IN_IOCTLW	0x00020000      /* I/O control w/ wait */
-#define C_IN_MRTS	0x00040000	/* modem RTS drop */
-#define C_IN_ICHAR	0x00080000
- 
-/* flow control */
-
-#define	C_FL_OXX	0x00000001	/* output Xon/Xoff flow control */
-#define	C_FL_IXX	0x00000002	/* output Xon/Xoff flow control */
-#define C_FL_OIXANY	0x00000004	/* output Xon/Xoff (any xon) */
-#define	C_FL_SWFLOW	0x0000000f
-
-/* flow status */
-
-#define	C_FS_TXIDLE	0x00000000	/* no Tx data in the buffer or UART */
-#define	C_FS_SENDING	0x00000001	/* UART is sending data */
-#define	C_FS_SWFLOW	0x00000002	/* Tx is stopped by received Xoff */
-
-/* rs_control/rs_status RS-232 signals */
-
-#define C_RS_PARAM	0x80000000	/* Indicates presence of parameter in 
-					   IOCTLM command */
-#define	C_RS_RTS	0x00000001	/* RTS */
-#define	C_RS_DTR	0x00000004	/* DTR */
-#define	C_RS_DCD	0x00000100	/* CD */
-#define	C_RS_DSR	0x00000200	/* DSR */
-#define	C_RS_RI		0x00000400	/* RI */
-#define	C_RS_CTS	0x00000800	/* CTS */
-
-/* commands Host <-> Board */
-
-#define	C_CM_RESET	0x01		/* reset/flush buffers */
-#define	C_CM_IOCTL	0x02		/* re-read CH_CTRL */
-#define	C_CM_IOCTLW	0x03		/* re-read CH_CTRL, intr when done */
-#define	C_CM_IOCTLM	0x04		/* RS-232 outputs change */
-#define	C_CM_SENDXOFF	0x10		/* send Xoff */
-#define	C_CM_SENDXON	0x11		/* send Xon */
-#define C_CM_CLFLOW	0x12		/* Clear flow control (resume) */
-#define	C_CM_SENDBRK	0x41		/* send break */
-#define	C_CM_INTBACK	0x42		/* Interrupt back */
-#define	C_CM_SET_BREAK	0x43		/* Tx break on */
-#define	C_CM_CLR_BREAK	0x44		/* Tx break off */
-#define	C_CM_CMD_DONE	0x45		/* Previous command done */
-#define C_CM_INTBACK2	0x46		/* Alternate Interrupt back */
-#define	C_CM_TINACT	0x51		/* set inactivity detection */
-#define	C_CM_IRQ_ENBL	0x52		/* enable generation of interrupts */
-#define	C_CM_IRQ_DSBL	0x53		/* disable generation of interrupts */
-#define	C_CM_ACK_ENBL	0x54		/* enable acknowledged interrupt mode */
-#define	C_CM_ACK_DSBL	0x55		/* disable acknowledged intr mode */
-#define	C_CM_FLUSH_RX	0x56		/* flushes Rx buffer */
-#define	C_CM_FLUSH_TX	0x57		/* flushes Tx buffer */
-#define C_CM_Q_ENABLE	0x58		/* enables queue access from the 
-					   driver */
-#define C_CM_Q_DISABLE  0x59            /* disables queue access from the 
-					   driver */
-
-#define	C_CM_TXBEMPTY	0x60		/* Tx buffer is empty */
-#define	C_CM_TXLOWWM	0x61		/* Tx buffer low water mark */
-#define	C_CM_RXHIWM	0x62		/* Rx buffer high water mark */
-#define	C_CM_RXNNDT	0x63		/* rx no new data timeout */
-#define	C_CM_TXFEMPTY	0x64
-#define	C_CM_ICHAR	0x65
-#define	C_CM_MDCD	0x70		/* modem DCD change */
-#define	C_CM_MDSR	0x71		/* modem DSR change */
-#define	C_CM_MRI	0x72		/* modem RI change */
-#define	C_CM_MCTS	0x73		/* modem CTS change */
-#define C_CM_MRTS	0x74		/* modem RTS drop */
-#define	C_CM_RXBRK	0x84		/* Break received */
-#define	C_CM_PR_ERROR	0x85		/* Parity error */
-#define	C_CM_FR_ERROR	0x86		/* Frame error */
-#define C_CM_OVR_ERROR  0x87            /* Overrun error */
-#define C_CM_RXOFL	0x88            /* RX buffer overflow */
-#define	C_CM_CMDERROR	0x90		/* command error */
-#define	C_CM_FATAL	0x91		/* fatal error */
-#define	C_CM_HW_RESET	0x92		/* reset board */
-
-/*
- *	CH_CTRL - This per port structure contains all parameters
- *	that control an specific port. It can be seen as the
- *	configuration registers of a "super-serial-controller".
- */
-
-struct CH_CTRL {
-	__u32	op_mode;	/* operation mode */
-	__u32	intr_enable;	/* interrupt masking */
-	__u32	sw_flow;	/* SW flow control */
-	__u32	flow_status;	/* output flow status */
-	__u32	comm_baud;	/* baud rate  - numerically specified */
-	__u32	comm_parity;	/* parity */
-	__u32	comm_data_l;	/* data length/stop */
-	__u32	comm_flags;	/* other flags */
-	__u32	hw_flow;	/* HW flow control */
-	__u32	rs_control;	/* RS-232 outputs */
-	__u32	rs_status;	/* RS-232 inputs */
-	__u32	flow_xon;	/* xon char */
-	__u32	flow_xoff;	/* xoff char */
-	__u32	hw_overflow;	/* hw overflow counter */
-	__u32	sw_overflow;	/* sw overflow counter */
-	__u32	comm_error;	/* frame/parity error counter */
-	__u32 ichar;
-	__u32 filler[7];
-};
-
-
-/*
- *	BUF_CTRL - This per channel structure contains
- *	all Tx and Rx buffer control for a given channel.
- */
-
-struct	BUF_CTRL	{
-	__u32	flag_dma;	/* buffers are in Host memory */
-	__u32	tx_bufaddr;	/* address of the tx buffer */
-	__u32	tx_bufsize;	/* tx buffer size */
-	__u32	tx_threshold;	/* tx low water mark */
-	__u32	tx_get;		/* tail index tx buf */
-	__u32	tx_put;		/* head index tx buf */
-	__u32	rx_bufaddr;	/* address of the rx buffer */
-	__u32	rx_bufsize;	/* rx buffer size */
-	__u32	rx_threshold;	/* rx high water mark */
-	__u32	rx_get;		/* tail index rx buf */
-	__u32	rx_put;		/* head index rx buf */
-	__u32	filler[5];	/* filler to align structures */
-};
-
-/*
- *	BOARD_CTRL - This per board structure contains all global 
- *	control fields related to the board.
- */
-
-struct BOARD_CTRL {
-
-	/* static info provided by the on-board CPU */
-	__u32	n_channel;	/* number of channels */
-	__u32	fw_version;	/* firmware version */
-
-	/* static info provided by the driver */
-	__u32	op_system;	/* op_system id */
-	__u32	dr_version;	/* driver version */
-
-	/* board control area */
-	__u32	inactivity;	/* inactivity control */
-
-	/* host to FW commands */
-	__u32	hcmd_channel;	/* channel number */
-	__u32	hcmd_param;	/* pointer to parameters */
-
-	/* FW to Host commands */
-	__u32	fwcmd_channel;	/* channel number */
-	__u32	fwcmd_param;	/* pointer to parameters */
-	__u32	zf_int_queue_addr; /* offset for INT_QUEUE structure */
-
-	/* filler so the structures are aligned */
-	__u32	filler[6];
-};
-
-/* Host Interrupt Queue */
-
-#define QUEUE_SIZE	(10*MAX_CHAN)
-
-struct	INT_QUEUE {
-	unsigned char	intr_code[QUEUE_SIZE];
-	unsigned long	channel[QUEUE_SIZE];
-	unsigned long	param[QUEUE_SIZE];
-	unsigned long	put;
-	unsigned long	get;
-};
-
-/*
- *	ZFW_CTRL - This is the data structure that includes all other
- *	data structures used by the Firmware.
- */
- 
-struct ZFW_CTRL {
-	struct BOARD_CTRL	board_ctrl;
-	struct CH_CTRL		ch_ctrl[MAX_CHAN];
-	struct BUF_CTRL		buf_ctrl[MAX_CHAN];
-};
-
-/****************** ****************** *******************/
-#endif
-
-#endif /* _UAPI_LINUX_CYCLADES_H */
diff --git a/include/uapi/linux/major.h b/include/uapi/linux/major.h
index 7e5fa8e15c43..4e5f2b3a3d54 100644
--- a/include/uapi/linux/major.h
+++ b/include/uapi/linux/major.h
@@ -34,8 +34,6 @@
 #define GOLDSTAR_CDROM_MAJOR	16
 #define OPTICS_CDROM_MAJOR	17
 #define SANYO_CDROM_MAJOR	18
-#define CYCLADES_MAJOR		19
-#define CYCLADESAUX_MAJOR	20
 #define MITSUMI_X_CDROM_MAJOR	20
 #define MFM_ACORN_MAJOR		21	/* ARM Linux /dev/mfm */
 #define SCSI_GENERIC_MAJOR	21
diff --git a/include/uapi/linux/serial.h b/include/uapi/linux/serial.h
index 93eb3c496ff1..fa6b16e5fdd8 100644
--- a/include/uapi/linux/serial.h
+++ b/include/uapi/linux/serial.h
@@ -52,11 +52,11 @@ struct serial_struct {
 #define PORT_16450	2
 #define PORT_16550	3
 #define PORT_16550A	4
-#define PORT_CIRRUS     5	/* usurped by cyclades.c */
+#define PORT_CIRRUS     5
 #define PORT_16650	6
 #define PORT_16650V2	7
 #define PORT_16750	8
-#define PORT_STARTECH	9	/* usurped by cyclades.c */
+#define PORT_STARTECH	9
 #define PORT_16C950	10	/* Oxford Semiconductor */
 #define PORT_16654	11
 #define PORT_16850	12
-- 
cgit v1.2.3-71-gd317


From e56763ee50a3f28d2f70355ca43fb78d8539a183 Mon Sep 17 00:00:00 2001
From: "Maciej W. Rozycki" <macro@orcam.me.uk>
Date: Wed, 10 Mar 2021 13:02:42 +0100
Subject: FDDI: if_fddi.h: Update my e-mail address

Following the recent update to MAINTAINERS update my e-mail address.

Signed-off-by: Maciej W. Rozycki <macro@orcam.me.uk>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 include/uapi/linux/if_fddi.h | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

(limited to 'include/uapi/linux')

diff --git a/include/uapi/linux/if_fddi.h b/include/uapi/linux/if_fddi.h
index 7239aa9c0766..8df2d9934bcd 100644
--- a/include/uapi/linux/if_fddi.h
+++ b/include/uapi/linux/if_fddi.h
@@ -9,7 +9,7 @@
  * Version:	@(#)if_fddi.h	1.0.3	Oct  6 2018
  *
  * Author:	Lawrence V. Stefani, <stefani@yahoo.com>
- * Maintainer:	Maciej W. Rozycki, <macro@linux-mips.org>
+ * Maintainer:	Maciej W. Rozycki, <macro@orcam.me.uk>
  *
  *		if_fddi.h is based on previous if_ether.h and if_tr.h work by
  *			Fred N. van Kempen, <waltje@uWalt.NL.Mugnet.ORG>
-- 
cgit v1.2.3-71-gd317


From bcbe55dc7b41c333970bde85eddfc5ac23e87433 Mon Sep 17 00:00:00 2001
From: Andrzej Pietrasiewicz <andrzej.p@collabora.com>
Date: Thu, 4 Feb 2021 17:16:59 +0100
Subject: media: uapi: Correct doc comment in H264 uAPI

struct v4l2_ctrl_h264_pps members obviously match picture parameter syntax,
not sequence parameter syntax.

Signed-off-by: Andrzej Pietrasiewicz <andrzej.p@collabora.com>
Signed-off-by: Hans Verkuil <hverkuil-cisco@xs4all.nl>
Signed-off-by: Mauro Carvalho Chehab <mchehab+huawei@kernel.org>
---
 include/uapi/linux/v4l2-controls.h | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

(limited to 'include/uapi/linux')

diff --git a/include/uapi/linux/v4l2-controls.h b/include/uapi/linux/v4l2-controls.h
index 039c0d7add1b..61441c250239 100644
--- a/include/uapi/linux/v4l2-controls.h
+++ b/include/uapi/linux/v4l2-controls.h
@@ -1329,7 +1329,7 @@ struct v4l2_ctrl_h264_sps {
  * struct v4l2_ctrl_h264_pps - H264 picture parameter set
  *
  * Except where noted, all the members on this picture parameter set
- * structure match the sequence parameter set syntax as specified
+ * structure match the picture parameter set syntax as specified
  * by the H264 specification.
  *
  * In particular, V4L2_H264_PPS_FLAG_SCALING_MATRIX_PRESENT flag
-- 
cgit v1.2.3-71-gd317


From 312e3f8aefb5dc9c2f052ba0ee35a2fd6baa5bcd Mon Sep 17 00:00:00 2001
From: Colin Ian King <colin.king@canonical.com>
Date: Thu, 11 Mar 2021 09:30:54 +0000
Subject: thermal: Fix spelling mistake "disabed" -> "disabled"

There is a spelling mistake in a comment, fix it.

Signed-off-by: Colin Ian King <colin.king@canonical.com>
Signed-off-by: Daniel Lezcano <daniel.lezcano@linaro.org>
Link: https://lore.kernel.org/r/20210311093054.5338-1-colin.king@canonical.com
---
 include/uapi/linux/thermal.h | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

(limited to 'include/uapi/linux')

diff --git a/include/uapi/linux/thermal.h b/include/uapi/linux/thermal.h
index c105054cbb57..9aa2fedfa309 100644
--- a/include/uapi/linux/thermal.h
+++ b/include/uapi/linux/thermal.h
@@ -60,7 +60,7 @@ enum thermal_genl_event {
 	THERMAL_GENL_EVENT_UNSPEC,
 	THERMAL_GENL_EVENT_TZ_CREATE,		/* Thermal zone creation */
 	THERMAL_GENL_EVENT_TZ_DELETE,		/* Thermal zone deletion */
-	THERMAL_GENL_EVENT_TZ_DISABLE,		/* Thermal zone disabed */
+	THERMAL_GENL_EVENT_TZ_DISABLE,		/* Thermal zone disabled */
 	THERMAL_GENL_EVENT_TZ_ENABLE,		/* Thermal zone enabled */
 	THERMAL_GENL_EVENT_TZ_TRIP_UP,		/* Trip point crossed the way up */
 	THERMAL_GENL_EVENT_TZ_TRIP_DOWN,	/* Trip point crossed the way down */
-- 
cgit v1.2.3-71-gd317


From f73f7f4da581875f9b1f2fb8ebd1ab15ed634488 Mon Sep 17 00:00:00 2001
From: Alexandru Ardelean <alexandru.ardelean@analog.com>
Date: Mon, 15 Feb 2021 12:40:39 +0200
Subject: iio: buffer: add ioctl() to support opening extra buffers for IIO
 device

With this change, an ioctl() call is added to open a character device for a
buffer. The ioctl() number is 'i' 0x91, which follows the
IIO_GET_EVENT_FD_IOCTL ioctl.

The ioctl() will return an FD for the requested buffer index. The indexes
are the same from the /sys/iio/devices/iio:deviceX/bufferY (i.e. the Y
variable).

Since there doesn't seem to be a sane way to return the FD for buffer0 to
be the same FD for the /dev/iio:deviceX, this ioctl() will return another
FD for buffer0 (or the first buffer). This duplicate FD will be able to
access the same buffer object (for buffer0) as accessing directly the
/dev/iio:deviceX chardev.

Also, there is no IIO_BUFFER_GET_BUFFER_COUNT ioctl() implemented, as the
index for each buffer (and the count) can be deduced from the
'/sys/bus/iio/devices/iio:deviceX/bufferY' folders (i.e the number of
bufferY folders).

Used following C code to test this:
-------------------------------------------------------------------

 #include <stdio.h>
 #include <stdlib.h>
 #include <unistd.h>
 #include <sys/ioctl.h>
 #include <fcntl.h"
 #include <errno.h>

 #define IIO_BUFFER_GET_FD_IOCTL      _IOWR('i', 0x91, int)

int main(int argc, char *argv[])
{
        int fd;
        int fd1;
        int ret;

        if ((fd = open("/dev/iio:device0", O_RDWR))<0) {
                fprintf(stderr, "Error open() %d errno %d\n",fd, errno);
                return -1;
        }

        fprintf(stderr, "Using FD %d\n", fd);

        fd1 = atoi(argv[1]);

        ret = ioctl(fd, IIO_BUFFER_GET_FD_IOCTL, &fd1);
        if (ret < 0) {
                fprintf(stderr, "Error for buffer %d ioctl() %d errno %d\n", fd1, ret, errno);
                close(fd);
                return -1;
        }

        fprintf(stderr, "Got FD %d\n", fd1);

        close(fd1);
        close(fd);

        return 0;
}
-------------------------------------------------------------------

Results are:
-------------------------------------------------------------------
 # ./test 0
 Using FD 3
 Got FD 4

 # ./test 1
 Using FD 3
 Got FD 4

 # ./test 2
 Using FD 3
 Got FD 4

 # ./test 3
 Using FD 3
 Got FD 4

 # ls /sys/bus/iio/devices/iio\:device0
 buffer  buffer0  buffer1  buffer2  buffer3  dev
 in_voltage_sampling_frequency  in_voltage_scale
 in_voltage_scale_available
 name  of_node  power  scan_elements  subsystem  uevent
-------------------------------------------------------------------

iio:device0 has some fake kfifo buffers attached to an IIO device.

Signed-off-by: Alexandru Ardelean <alexandru.ardelean@analog.com>
Link: https://lore.kernel.org/r/20210215104043.91251-21-alexandru.ardelean@analog.com
Signed-off-by: Jonathan Cameron <Jonathan.Cameron@huawei.com>
---
 drivers/iio/iio_core.h            |  12 ++--
 drivers/iio/industrialio-buffer.c | 144 ++++++++++++++++++++++++++++++++++++--
 include/linux/iio/buffer_impl.h   |   5 ++
 include/linux/iio/iio-opaque.h    |   2 +
 include/uapi/linux/iio/buffer.h   |  10 +++
 5 files changed, 162 insertions(+), 11 deletions(-)
 create mode 100644 include/uapi/linux/iio/buffer.h

(limited to 'include/uapi/linux')

diff --git a/drivers/iio/iio_core.h b/drivers/iio/iio_core.h
index 7990c759f1f5..062fe16c6c49 100644
--- a/drivers/iio/iio_core.h
+++ b/drivers/iio/iio_core.h
@@ -64,16 +64,16 @@ ssize_t iio_format_value(char *buf, unsigned int type, int size, int *vals);
 #ifdef CONFIG_IIO_BUFFER
 struct poll_table_struct;
 
-__poll_t iio_buffer_poll(struct file *filp,
-			     struct poll_table_struct *wait);
-ssize_t iio_buffer_read_outer(struct file *filp, char __user *buf,
-			      size_t n, loff_t *f_ps);
+__poll_t iio_buffer_poll_wrapper(struct file *filp,
+				 struct poll_table_struct *wait);
+ssize_t iio_buffer_read_wrapper(struct file *filp, char __user *buf,
+				size_t n, loff_t *f_ps);
 
 int iio_buffers_alloc_sysfs_and_mask(struct iio_dev *indio_dev);
 void iio_buffers_free_sysfs_and_mask(struct iio_dev *indio_dev);
 
-#define iio_buffer_poll_addr (&iio_buffer_poll)
-#define iio_buffer_read_outer_addr (&iio_buffer_read_outer)
+#define iio_buffer_poll_addr (&iio_buffer_poll_wrapper)
+#define iio_buffer_read_outer_addr (&iio_buffer_read_wrapper)
 
 void iio_disable_all_buffers(struct iio_dev *indio_dev);
 void iio_buffer_wakeup_poll(struct iio_dev *indio_dev);
diff --git a/drivers/iio/industrialio-buffer.c b/drivers/iio/industrialio-buffer.c
index b60c2e66bd1c..a48e494a9fbb 100644
--- a/drivers/iio/industrialio-buffer.c
+++ b/drivers/iio/industrialio-buffer.c
@@ -9,9 +9,11 @@
  * - Better memory allocation techniques?
  * - Alternative access techniques?
  */
+#include <linux/anon_inodes.h>
 #include <linux/kernel.h>
 #include <linux/export.h>
 #include <linux/device.h>
+#include <linux/file.h>
 #include <linux/fs.h>
 #include <linux/cdev.h>
 #include <linux/slab.h>
@@ -89,7 +91,7 @@ static bool iio_buffer_ready(struct iio_dev *indio_dev, struct iio_buffer *buf,
 }
 
 /**
- * iio_buffer_read_outer() - chrdev read for buffer access
+ * iio_buffer_read() - chrdev read for buffer access
  * @filp:	File structure pointer for the char device
  * @buf:	Destination buffer for iio buffer read
  * @n:		First n bytes to read
@@ -101,8 +103,8 @@ static bool iio_buffer_ready(struct iio_dev *indio_dev, struct iio_buffer *buf,
  * Return: negative values corresponding to error codes or ret != 0
  *	   for ending the reading activity
  **/
-ssize_t iio_buffer_read_outer(struct file *filp, char __user *buf,
-			      size_t n, loff_t *f_ps)
+static ssize_t iio_buffer_read(struct file *filp, char __user *buf,
+			       size_t n, loff_t *f_ps)
 {
 	struct iio_dev_buffer_pair *ib = filp->private_data;
 	struct iio_buffer *rb = ib->buffer;
@@ -168,8 +170,8 @@ ssize_t iio_buffer_read_outer(struct file *filp, char __user *buf,
  * Return: (EPOLLIN | EPOLLRDNORM) if data is available for reading
  *	   or 0 for other cases
  */
-__poll_t iio_buffer_poll(struct file *filp,
-			     struct poll_table_struct *wait)
+static __poll_t iio_buffer_poll(struct file *filp,
+				struct poll_table_struct *wait)
 {
 	struct iio_dev_buffer_pair *ib = filp->private_data;
 	struct iio_buffer *rb = ib->buffer;
@@ -184,6 +186,32 @@ __poll_t iio_buffer_poll(struct file *filp,
 	return 0;
 }
 
+ssize_t iio_buffer_read_wrapper(struct file *filp, char __user *buf,
+				size_t n, loff_t *f_ps)
+{
+	struct iio_dev_buffer_pair *ib = filp->private_data;
+	struct iio_buffer *rb = ib->buffer;
+
+	/* check if buffer was opened through new API */
+	if (test_bit(IIO_BUSY_BIT_POS, &rb->flags))
+		return -EBUSY;
+
+	return iio_buffer_read(filp, buf, n, f_ps);
+}
+
+__poll_t iio_buffer_poll_wrapper(struct file *filp,
+				 struct poll_table_struct *wait)
+{
+	struct iio_dev_buffer_pair *ib = filp->private_data;
+	struct iio_buffer *rb = ib->buffer;
+
+	/* check if buffer was opened through new API */
+	if (test_bit(IIO_BUSY_BIT_POS, &rb->flags))
+		return 0;
+
+	return iio_buffer_poll(filp, wait);
+}
+
 /**
  * iio_buffer_wakeup_poll - Wakes up the buffer waitqueue
  * @indio_dev: The IIO device
@@ -1344,6 +1372,96 @@ static void iio_buffer_unregister_legacy_sysfs_groups(struct iio_dev *indio_dev)
 	kfree(iio_dev_opaque->legacy_scan_el_group.attrs);
 }
 
+static int iio_buffer_chrdev_release(struct inode *inode, struct file *filep)
+{
+	struct iio_dev_buffer_pair *ib = filep->private_data;
+	struct iio_dev *indio_dev = ib->indio_dev;
+	struct iio_buffer *buffer = ib->buffer;
+
+	wake_up(&buffer->pollq);
+
+	kfree(ib);
+	clear_bit(IIO_BUSY_BIT_POS, &buffer->flags);
+	iio_device_put(indio_dev);
+
+	return 0;
+}
+
+static const struct file_operations iio_buffer_chrdev_fileops = {
+	.owner = THIS_MODULE,
+	.llseek = noop_llseek,
+	.read = iio_buffer_read,
+	.poll = iio_buffer_poll,
+	.release = iio_buffer_chrdev_release,
+};
+
+static long iio_device_buffer_getfd(struct iio_dev *indio_dev, unsigned long arg)
+{
+	struct iio_dev_opaque *iio_dev_opaque = to_iio_dev_opaque(indio_dev);
+	int __user *ival = (int __user *)arg;
+	struct iio_dev_buffer_pair *ib;
+	struct iio_buffer *buffer;
+	int fd, idx, ret;
+
+	if (copy_from_user(&idx, ival, sizeof(idx)))
+		return -EFAULT;
+
+	if (idx >= iio_dev_opaque->attached_buffers_cnt)
+		return -ENODEV;
+
+	iio_device_get(indio_dev);
+
+	buffer = iio_dev_opaque->attached_buffers[idx];
+
+	if (test_and_set_bit(IIO_BUSY_BIT_POS, &buffer->flags)) {
+		ret = -EBUSY;
+		goto error_iio_dev_put;
+	}
+
+	ib = kzalloc(sizeof(*ib), GFP_KERNEL);
+	if (!ib) {
+		ret = -ENOMEM;
+		goto error_clear_busy_bit;
+	}
+
+	ib->indio_dev = indio_dev;
+	ib->buffer = buffer;
+
+	fd = anon_inode_getfd("iio:buffer", &iio_buffer_chrdev_fileops,
+			      ib, O_RDWR | O_CLOEXEC);
+	if (fd < 0) {
+		ret = fd;
+		goto error_free_ib;
+	}
+
+	if (copy_to_user(ival, &fd, sizeof(fd))) {
+		put_unused_fd(fd);
+		ret = -EFAULT;
+		goto error_free_ib;
+	}
+
+	return fd;
+
+error_free_ib:
+	kfree(ib);
+error_clear_busy_bit:
+	clear_bit(IIO_BUSY_BIT_POS, &buffer->flags);
+error_iio_dev_put:
+	iio_device_put(indio_dev);
+	return ret;
+}
+
+static long iio_device_buffer_ioctl(struct iio_dev *indio_dev, struct file *filp,
+				    unsigned int cmd, unsigned long arg)
+{
+	switch (cmd) {
+	case IIO_BUFFER_GET_FD_IOCTL:
+		return iio_device_buffer_getfd(indio_dev, arg);
+	default:
+		return IIO_IOCTL_UNHANDLED;
+	}
+}
+
 static int __iio_buffer_alloc_sysfs_and_mask(struct iio_buffer *buffer,
 					     struct iio_dev *indio_dev,
 					     int index)
@@ -1473,6 +1591,7 @@ int iio_buffers_alloc_sysfs_and_mask(struct iio_dev *indio_dev)
 	struct iio_buffer *buffer;
 	int unwind_idx;
 	int ret, i;
+	size_t sz;
 
 	channels = indio_dev->channels;
 	if (channels) {
@@ -1494,6 +1613,18 @@ int iio_buffers_alloc_sysfs_and_mask(struct iio_dev *indio_dev)
 			goto error_unwind_sysfs_and_mask;
 		}
 	}
+	unwind_idx = iio_dev_opaque->attached_buffers_cnt - 1;
+
+	sz = sizeof(*(iio_dev_opaque->buffer_ioctl_handler));
+	iio_dev_opaque->buffer_ioctl_handler = kzalloc(sz, GFP_KERNEL);
+	if (!iio_dev_opaque->buffer_ioctl_handler) {
+		ret = -ENOMEM;
+		goto error_unwind_sysfs_and_mask;
+	}
+
+	iio_dev_opaque->buffer_ioctl_handler->ioctl = iio_device_buffer_ioctl;
+	iio_device_ioctl_handler_register(indio_dev,
+					  iio_dev_opaque->buffer_ioctl_handler);
 
 	return 0;
 
@@ -1515,6 +1646,9 @@ void iio_buffers_free_sysfs_and_mask(struct iio_dev *indio_dev)
 	if (!iio_dev_opaque->attached_buffers_cnt)
 		return;
 
+	iio_device_ioctl_handler_unregister(iio_dev_opaque->buffer_ioctl_handler);
+	kfree(iio_dev_opaque->buffer_ioctl_handler);
+
 	iio_buffer_unregister_legacy_sysfs_groups(indio_dev);
 
 	for (i = iio_dev_opaque->attached_buffers_cnt - 1; i >= 0; i--) {
diff --git a/include/linux/iio/buffer_impl.h b/include/linux/iio/buffer_impl.h
index 768b90c64412..245b32918ae1 100644
--- a/include/linux/iio/buffer_impl.h
+++ b/include/linux/iio/buffer_impl.h
@@ -6,6 +6,8 @@
 
 #ifdef CONFIG_IIO_BUFFER
 
+#include <uapi/linux/iio/buffer.h>
+
 struct iio_dev;
 struct iio_buffer;
 
@@ -72,6 +74,9 @@ struct iio_buffer {
 	/** @length: Number of datums in buffer. */
 	unsigned int length;
 
+	/** @flags: File ops flags including busy flag. */
+	unsigned long flags;
+
 	/**  @bytes_per_datum: Size of individual datum including timestamp. */
 	size_t bytes_per_datum;
 
diff --git a/include/linux/iio/iio-opaque.h b/include/linux/iio/iio-opaque.h
index b6ebc04af3e7..32addd5e790e 100644
--- a/include/linux/iio/iio-opaque.h
+++ b/include/linux/iio/iio-opaque.h
@@ -9,6 +9,7 @@
  * @event_interface:		event chrdevs associated with interrupt lines
  * @attached_buffers:		array of buffers statically attached by the driver
  * @attached_buffers_cnt:	number of buffers in the array of statically attached buffers
+ * @buffer_ioctl_handler:	ioctl() handler for this IIO device's buffer interface
  * @buffer_list:		list of all buffers currently attached
  * @channel_attr_list:		keep track of automatically created channel
  *				attributes
@@ -28,6 +29,7 @@ struct iio_dev_opaque {
 	struct iio_event_interface	*event_interface;
 	struct iio_buffer		**attached_buffers;
 	unsigned int			attached_buffers_cnt;
+	struct iio_ioctl_handler	*buffer_ioctl_handler;
 	struct list_head		buffer_list;
 	struct list_head		channel_attr_list;
 	struct attribute_group		chan_attr_group;
diff --git a/include/uapi/linux/iio/buffer.h b/include/uapi/linux/iio/buffer.h
new file mode 100644
index 000000000000..13939032b3f6
--- /dev/null
+++ b/include/uapi/linux/iio/buffer.h
@@ -0,0 +1,10 @@
+/* SPDX-License-Identifier: GPL-2.0 WITH Linux-syscall-note */
+/* industrial I/O buffer definitions needed both in and out of kernel
+ */
+
+#ifndef _UAPI_IIO_BUFFER_H_
+#define _UAPI_IIO_BUFFER_H_
+
+#define IIO_BUFFER_GET_FD_IOCTL			_IOWR('i', 0x91, int)
+
+#endif /* _UAPI_IIO_BUFFER_H_ */
-- 
cgit v1.2.3-71-gd317


From 710ec5622306de8c071637ee41ddf4c9bd17e75a Mon Sep 17 00:00:00 2001
From: Ido Schimmel <idosch@nvidia.com>
Date: Thu, 11 Mar 2021 19:03:15 +0100
Subject: nexthop: Add netlink defines and enumerators for resilient NH groups

- RTM_NEWNEXTHOP et.al. that handle resilient groups will have a new nested
  attribute, NHA_RES_GROUP, whose elements are attributes NHA_RES_GROUP_*.

- RTM_NEWNEXTHOPBUCKET et.al. is a suite of new messages that will
  currently serve only for dumping of individual buckets of resilient next
  hop groups. For nexthop group buckets, these messages will carry a nested
  attribute NHA_RES_BUCKET, whose elements are attributes NHA_RES_BUCKET_*.

  There are several reasons why a new suite of messages is created for
  nexthop buckets instead of overloading the information on the existing
  RTM_{NEW,DEL,GET}NEXTHOP messages.

  First, a nexthop group can contain a large number of nexthop buckets (4k
  is not unheard of). This imposes limits on the amount of information that
  can be encoded for each nexthop bucket given a netlink message is limited
  to 64k bytes.

  Second, while RTM_NEWNEXTHOPBUCKET is only used for notifications at
  this point, in the future it can be extended to provide user space with
  control over nexthop buckets configuration.

- The new group type is NEXTHOP_GRP_TYPE_RES. Note that nexthop code is
  adjusted to bounce groups with that type for now.

Signed-off-by: Ido Schimmel <idosch@nvidia.com>
Reviewed-by: Petr Machata <petrm@nvidia.com>
Reviewed-by: David Ahern <dsahern@kernel.org>
Signed-off-by: Petr Machata <petrm@nvidia.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 include/uapi/linux/nexthop.h   | 47 +++++++++++++++++++++++++++++++++++++++++-
 include/uapi/linux/rtnetlink.h |  7 +++++++
 net/ipv4/nexthop.c             |  2 ++
 security/selinux/nlmsgtab.c    |  5 ++++-
 4 files changed, 59 insertions(+), 2 deletions(-)

(limited to 'include/uapi/linux')

diff --git a/include/uapi/linux/nexthop.h b/include/uapi/linux/nexthop.h
index 2d4a1e784cf0..d8ffa8c9ca78 100644
--- a/include/uapi/linux/nexthop.h
+++ b/include/uapi/linux/nexthop.h
@@ -21,7 +21,10 @@ struct nexthop_grp {
 };
 
 enum {
-	NEXTHOP_GRP_TYPE_MPATH,  /* default type if not specified */
+	NEXTHOP_GRP_TYPE_MPATH,  /* hash-threshold nexthop group
+				  * default type if not specified
+				  */
+	NEXTHOP_GRP_TYPE_RES,    /* resilient nexthop group */
 	__NEXTHOP_GRP_TYPE_MAX,
 };
 
@@ -52,8 +55,50 @@ enum {
 	NHA_FDB,	/* flag; nexthop belongs to a bridge fdb */
 	/* if NHA_FDB is added, OIF, BLACKHOLE, ENCAP cannot be set */
 
+	/* nested; resilient nexthop group attributes */
+	NHA_RES_GROUP,
+	/* nested; nexthop bucket attributes */
+	NHA_RES_BUCKET,
+
 	__NHA_MAX,
 };
 
 #define NHA_MAX	(__NHA_MAX - 1)
+
+enum {
+	NHA_RES_GROUP_UNSPEC,
+	/* Pad attribute for 64-bit alignment. */
+	NHA_RES_GROUP_PAD = NHA_RES_GROUP_UNSPEC,
+
+	/* u16; number of nexthop buckets in a resilient nexthop group */
+	NHA_RES_GROUP_BUCKETS,
+	/* clock_t as u32; nexthop bucket idle timer (per-group) */
+	NHA_RES_GROUP_IDLE_TIMER,
+	/* clock_t as u32; nexthop unbalanced timer */
+	NHA_RES_GROUP_UNBALANCED_TIMER,
+	/* clock_t as u64; nexthop unbalanced time */
+	NHA_RES_GROUP_UNBALANCED_TIME,
+
+	__NHA_RES_GROUP_MAX,
+};
+
+#define NHA_RES_GROUP_MAX	(__NHA_RES_GROUP_MAX - 1)
+
+enum {
+	NHA_RES_BUCKET_UNSPEC,
+	/* Pad attribute for 64-bit alignment. */
+	NHA_RES_BUCKET_PAD = NHA_RES_BUCKET_UNSPEC,
+
+	/* u16; nexthop bucket index */
+	NHA_RES_BUCKET_INDEX,
+	/* clock_t as u64; nexthop bucket idle time */
+	NHA_RES_BUCKET_IDLE_TIME,
+	/* u32; nexthop id assigned to the nexthop bucket */
+	NHA_RES_BUCKET_NH_ID,
+
+	__NHA_RES_BUCKET_MAX,
+};
+
+#define NHA_RES_BUCKET_MAX	(__NHA_RES_BUCKET_MAX - 1)
+
 #endif
diff --git a/include/uapi/linux/rtnetlink.h b/include/uapi/linux/rtnetlink.h
index 91e4ca064d61..d35953bc7d53 100644
--- a/include/uapi/linux/rtnetlink.h
+++ b/include/uapi/linux/rtnetlink.h
@@ -178,6 +178,13 @@ enum {
 	RTM_GETVLAN,
 #define RTM_GETVLAN	RTM_GETVLAN
 
+	RTM_NEWNEXTHOPBUCKET = 116,
+#define RTM_NEWNEXTHOPBUCKET	RTM_NEWNEXTHOPBUCKET
+	RTM_DELNEXTHOPBUCKET,
+#define RTM_DELNEXTHOPBUCKET	RTM_DELNEXTHOPBUCKET
+	RTM_GETNEXTHOPBUCKET,
+#define RTM_GETNEXTHOPBUCKET	RTM_GETNEXTHOPBUCKET
+
 	__RTM_MAX,
 #define RTM_MAX		(((__RTM_MAX + 3) & ~3) - 1)
 };
diff --git a/net/ipv4/nexthop.c b/net/ipv4/nexthop.c
index 56c54d0fbacc..7a94591da856 100644
--- a/net/ipv4/nexthop.c
+++ b/net/ipv4/nexthop.c
@@ -1492,6 +1492,8 @@ static struct nexthop *nexthop_create_group(struct net *net,
 	if (cfg->nh_grp_type == NEXTHOP_GRP_TYPE_MPATH) {
 		nhg->mpath = 1;
 		nhg->is_multipath = true;
+	} else if (cfg->nh_grp_type == NEXTHOP_GRP_TYPE_RES) {
+		goto out_no_nh;
 	}
 
 	WARN_ON_ONCE(nhg->mpath != 1);
diff --git a/security/selinux/nlmsgtab.c b/security/selinux/nlmsgtab.c
index b69231918686..d59276f48d4f 100644
--- a/security/selinux/nlmsgtab.c
+++ b/security/selinux/nlmsgtab.c
@@ -88,6 +88,9 @@ static const struct nlmsg_perm nlmsg_route_perms[] =
 	{ RTM_NEWVLAN,		NETLINK_ROUTE_SOCKET__NLMSG_WRITE },
 	{ RTM_DELVLAN,		NETLINK_ROUTE_SOCKET__NLMSG_WRITE },
 	{ RTM_GETVLAN,		NETLINK_ROUTE_SOCKET__NLMSG_READ  },
+	{ RTM_NEWNEXTHOPBUCKET,	NETLINK_ROUTE_SOCKET__NLMSG_WRITE },
+	{ RTM_DELNEXTHOPBUCKET,	NETLINK_ROUTE_SOCKET__NLMSG_WRITE },
+	{ RTM_GETNEXTHOPBUCKET,	NETLINK_ROUTE_SOCKET__NLMSG_READ  },
 };
 
 static const struct nlmsg_perm nlmsg_tcpdiag_perms[] =
@@ -171,7 +174,7 @@ int selinux_nlmsg_lookup(u16 sclass, u16 nlmsg_type, u32 *perm)
 		 * structures at the top of this file with the new mappings
 		 * before updating the BUILD_BUG_ON() macro!
 		 */
-		BUILD_BUG_ON(RTM_MAX != (RTM_NEWVLAN + 3));
+		BUILD_BUG_ON(RTM_MAX != (RTM_NEWNEXTHOPBUCKET + 3));
 		err = nlmsg_perm(nlmsg_type, perm, nlmsg_route_perms,
 				 sizeof(nlmsg_route_perms));
 		break;
-- 
cgit v1.2.3-71-gd317


From 2ffe0395288aa237ff7e0143366bd1cd57bfc5b7 Mon Sep 17 00:00:00 2001
From: Baowen Zheng <baowen.zheng@corigine.com>
Date: Fri, 12 Mar 2021 15:08:31 +0100
Subject: net/sched: act_police: add support for packet-per-second policing

Allow a policer action to enforce a rate-limit based on packets-per-second,
configurable using a packet-per-second rate and burst parameters.

e.g.
tc filter add dev tap1 parent ffff: u32 match \
        u32 0 0 police pkts_rate 3000 pkts_burst 1000

Testing was unable to uncover a performance impact of this change on
existing features.

Signed-off-by: Baowen Zheng <baowen.zheng@corigine.com>
Signed-off-by: Simon Horman <simon.horman@netronome.com>
Signed-off-by: Louis Peens <louis.peens@netronome.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 include/net/sch_generic.h      | 14 ++++++++
 include/net/tc_act/tc_police.h | 48 ++++++++++++++++++++++++---
 include/uapi/linux/pkt_cls.h   |  2 ++
 net/sched/act_police.c         | 59 +++++++++++++++++++++++++++++----
 net/sched/sch_generic.c        | 75 ++++++++++++++++++++++++++++--------------
 5 files changed, 162 insertions(+), 36 deletions(-)

(limited to 'include/uapi/linux')

diff --git a/include/net/sch_generic.h b/include/net/sch_generic.h
index 2d6eb60c58c8..f7a6e14491fb 100644
--- a/include/net/sch_generic.h
+++ b/include/net/sch_generic.h
@@ -1242,6 +1242,20 @@ static inline void psched_ratecfg_getrate(struct tc_ratespec *res,
 	res->linklayer = (r->linklayer & TC_LINKLAYER_MASK);
 }
 
+struct psched_pktrate {
+	u64	rate_pkts_ps; /* packets per second */
+	u32	mult;
+	u8	shift;
+};
+
+static inline u64 psched_pkt2t_ns(const struct psched_pktrate *r,
+				  unsigned int pkt_num)
+{
+	return ((u64)pkt_num * r->mult) >> r->shift;
+}
+
+void psched_ppscfg_precompute(struct psched_pktrate *r, u64 pktrate64);
+
 /* Mini Qdisc serves for specific needs of ingress/clsact Qdisc.
  * The fast path only needs to access filter list and to update stats
  */
diff --git a/include/net/tc_act/tc_police.h b/include/net/tc_act/tc_police.h
index ae117f7937d5..72649512dcdd 100644
--- a/include/net/tc_act/tc_police.h
+++ b/include/net/tc_act/tc_police.h
@@ -10,10 +10,13 @@ struct tcf_police_params {
 	s64			tcfp_burst;
 	u32			tcfp_mtu;
 	s64			tcfp_mtu_ptoks;
+	s64			tcfp_pkt_burst;
 	struct psched_ratecfg	rate;
 	bool			rate_present;
 	struct psched_ratecfg	peak;
 	bool			peak_present;
+	struct psched_pktrate	ppsrate;
+	bool			pps_present;
 	struct rcu_head rcu;
 };
 
@@ -24,6 +27,7 @@ struct tcf_police {
 	spinlock_t		tcfp_lock ____cacheline_aligned_in_smp;
 	s64			tcfp_toks;
 	s64			tcfp_ptoks;
+	s64			tcfp_pkttoks;
 	s64			tcfp_t_c;
 };
 
@@ -99,14 +103,50 @@ static inline u32 tcf_police_burst(const struct tc_action *act)
 
 static inline u64 tcf_police_rate_pkt_ps(const struct tc_action *act)
 {
-	/* Not implemented */
-	return 0;
+	struct tcf_police *police = to_police(act);
+	struct tcf_police_params *params;
+
+	params = rcu_dereference_protected(police->params,
+					   lockdep_is_held(&police->tcf_lock));
+	return params->ppsrate.rate_pkts_ps;
 }
 
 static inline u32 tcf_police_burst_pkt(const struct tc_action *act)
 {
-	/* Not implemented */
-	return 0;
+	struct tcf_police *police = to_police(act);
+	struct tcf_police_params *params;
+	u32 burst;
+
+	params = rcu_dereference_protected(police->params,
+					   lockdep_is_held(&police->tcf_lock));
+
+	/*
+	 *  "rate" pkts     "burst" nanoseconds
+	 *  ------------ *  -------------------
+	 *    1 second          2^6 ticks
+	 *
+	 * ------------------------------------
+	 *        NSEC_PER_SEC nanoseconds
+	 *        ------------------------
+	 *              2^6 ticks
+	 *
+	 *    "rate" pkts    "burst" nanoseconds            2^6 ticks
+	 *  = ------------ * ------------------- * ------------------------
+	 *      1 second          2^6 ticks        NSEC_PER_SEC nanoseconds
+	 *
+	 *   "rate" * "burst"
+	 * = ---------------- pkts/nanosecond
+	 *    NSEC_PER_SEC^2
+	 *
+	 *
+	 *   "rate" * "burst"
+	 * = ---------------- pkts/second
+	 *     NSEC_PER_SEC
+	 */
+	burst = div_u64(params->tcfp_pkt_burst * params->ppsrate.rate_pkts_ps,
+			NSEC_PER_SEC);
+
+	return burst;
 }
 
 static inline u32 tcf_police_tcfp_mtu(const struct tc_action *act)
diff --git a/include/uapi/linux/pkt_cls.h b/include/uapi/linux/pkt_cls.h
index 7ea59cfe1fa7..025c40fef93d 100644
--- a/include/uapi/linux/pkt_cls.h
+++ b/include/uapi/linux/pkt_cls.h
@@ -190,6 +190,8 @@ enum {
 	TCA_POLICE_PAD,
 	TCA_POLICE_RATE64,
 	TCA_POLICE_PEAKRATE64,
+	TCA_POLICE_PKTRATE64,
+	TCA_POLICE_PKTBURST64,
 	__TCA_POLICE_MAX
 #define TCA_POLICE_RESULT TCA_POLICE_RESULT
 };
diff --git a/net/sched/act_police.c b/net/sched/act_police.c
index 8d8452b1cdd4..0fab8de176d2 100644
--- a/net/sched/act_police.c
+++ b/net/sched/act_police.c
@@ -42,6 +42,8 @@ static const struct nla_policy police_policy[TCA_POLICE_MAX + 1] = {
 	[TCA_POLICE_RESULT]	= { .type = NLA_U32 },
 	[TCA_POLICE_RATE64]     = { .type = NLA_U64 },
 	[TCA_POLICE_PEAKRATE64] = { .type = NLA_U64 },
+	[TCA_POLICE_PKTRATE64]  = { .type = NLA_U64, .min = 1 },
+	[TCA_POLICE_PKTBURST64] = { .type = NLA_U64, .min = 1 },
 };
 
 static int tcf_police_init(struct net *net, struct nlattr *nla,
@@ -61,6 +63,7 @@ static int tcf_police_init(struct net *net, struct nlattr *nla,
 	bool exists = false;
 	u32 index;
 	u64 rate64, prate64;
+	u64 pps, ppsburst;
 
 	if (nla == NULL)
 		return -EINVAL;
@@ -142,6 +145,21 @@ static int tcf_police_init(struct net *net, struct nlattr *nla,
 		}
 	}
 
+	if ((tb[TCA_POLICE_PKTRATE64] && !tb[TCA_POLICE_PKTBURST64]) ||
+	    (!tb[TCA_POLICE_PKTRATE64] && tb[TCA_POLICE_PKTBURST64])) {
+		NL_SET_ERR_MSG(extack,
+			       "Both or neither packet-per-second burst and rate must be provided");
+		err = -EINVAL;
+		goto failure;
+	}
+
+	if (tb[TCA_POLICE_PKTRATE64] && R_tab) {
+		NL_SET_ERR_MSG(extack,
+			       "packet-per-second and byte-per-second rate limits not allowed in same action");
+		err = -EINVAL;
+		goto failure;
+	}
+
 	new = kzalloc(sizeof(*new), GFP_KERNEL);
 	if (unlikely(!new)) {
 		err = -ENOMEM;
@@ -183,6 +201,14 @@ static int tcf_police_init(struct net *net, struct nlattr *nla,
 	if (tb[TCA_POLICE_AVRATE])
 		new->tcfp_ewma_rate = nla_get_u32(tb[TCA_POLICE_AVRATE]);
 
+	if (tb[TCA_POLICE_PKTRATE64]) {
+		pps = nla_get_u64(tb[TCA_POLICE_PKTRATE64]);
+		ppsburst = nla_get_u64(tb[TCA_POLICE_PKTBURST64]);
+		new->pps_present = true;
+		new->tcfp_pkt_burst = PSCHED_TICKS2NS(ppsburst);
+		psched_ppscfg_precompute(&new->ppsrate, pps);
+	}
+
 	spin_lock_bh(&police->tcf_lock);
 	spin_lock_bh(&police->tcfp_lock);
 	police->tcfp_t_c = ktime_get_ns();
@@ -217,8 +243,8 @@ static int tcf_police_act(struct sk_buff *skb, const struct tc_action *a,
 			  struct tcf_result *res)
 {
 	struct tcf_police *police = to_police(a);
+	s64 now, toks, ppstoks = 0, ptoks = 0;
 	struct tcf_police_params *p;
-	s64 now, toks, ptoks = 0;
 	int ret;
 
 	tcf_lastuse_update(&police->tcf_tm);
@@ -236,7 +262,7 @@ static int tcf_police_act(struct sk_buff *skb, const struct tc_action *a,
 	}
 
 	if (qdisc_pkt_len(skb) <= p->tcfp_mtu) {
-		if (!p->rate_present) {
+		if (!p->rate_present && !p->pps_present) {
 			ret = p->tcfp_result;
 			goto end;
 		}
@@ -251,14 +277,23 @@ static int tcf_police_act(struct sk_buff *skb, const struct tc_action *a,
 			ptoks -= (s64)psched_l2t_ns(&p->peak,
 						    qdisc_pkt_len(skb));
 		}
-		toks += police->tcfp_toks;
-		if (toks > p->tcfp_burst)
-			toks = p->tcfp_burst;
-		toks -= (s64)psched_l2t_ns(&p->rate, qdisc_pkt_len(skb));
-		if ((toks|ptoks) >= 0) {
+		if (p->rate_present) {
+			toks += police->tcfp_toks;
+			if (toks > p->tcfp_burst)
+				toks = p->tcfp_burst;
+			toks -= (s64)psched_l2t_ns(&p->rate, qdisc_pkt_len(skb));
+		} else if (p->pps_present) {
+			ppstoks = min_t(s64, now - police->tcfp_t_c, p->tcfp_pkt_burst);
+			ppstoks += police->tcfp_pkttoks;
+			if (ppstoks > p->tcfp_pkt_burst)
+				ppstoks = p->tcfp_pkt_burst;
+			ppstoks -= (s64)psched_pkt2t_ns(&p->ppsrate, 1);
+		}
+		if ((toks | ptoks | ppstoks) >= 0) {
 			police->tcfp_t_c = now;
 			police->tcfp_toks = toks;
 			police->tcfp_ptoks = ptoks;
+			police->tcfp_pkttoks = ppstoks;
 			spin_unlock_bh(&police->tcfp_lock);
 			ret = p->tcfp_result;
 			goto inc_drops;
@@ -331,6 +366,16 @@ static int tcf_police_dump(struct sk_buff *skb, struct tc_action *a,
 				      TCA_POLICE_PAD))
 			goto nla_put_failure;
 	}
+	if (p->pps_present) {
+		if (nla_put_u64_64bit(skb, TCA_POLICE_PKTRATE64,
+				      police->params->ppsrate.rate_pkts_ps,
+				      TCA_POLICE_PAD))
+			goto nla_put_failure;
+		if (nla_put_u64_64bit(skb, TCA_POLICE_PKTBURST64,
+				      PSCHED_NS2TICKS(p->tcfp_pkt_burst),
+				      TCA_POLICE_PAD))
+			goto nla_put_failure;
+	}
 	if (nla_put(skb, TCA_POLICE_TBF, sizeof(opt), &opt))
 		goto nla_put_failure;
 	if (p->tcfp_result &&
diff --git a/net/sched/sch_generic.c b/net/sched/sch_generic.c
index 49eae93d1489..44991ea726fc 100644
--- a/net/sched/sch_generic.c
+++ b/net/sched/sch_generic.c
@@ -1325,6 +1325,48 @@ void dev_shutdown(struct net_device *dev)
 	WARN_ON(timer_pending(&dev->watchdog_timer));
 }
 
+/**
+ * psched_ratecfg_precompute__() - Pre-compute values for reciprocal division
+ * @rate:   Rate to compute reciprocal division values of
+ * @mult:   Multiplier for reciprocal division
+ * @shift:  Shift for reciprocal division
+ *
+ * The multiplier and shift for reciprocal division by rate are stored
+ * in mult and shift.
+ *
+ * The deal here is to replace a divide by a reciprocal one
+ * in fast path (a reciprocal divide is a multiply and a shift)
+ *
+ * Normal formula would be :
+ *  time_in_ns = (NSEC_PER_SEC * len) / rate_bps
+ *
+ * We compute mult/shift to use instead :
+ *  time_in_ns = (len * mult) >> shift;
+ *
+ * We try to get the highest possible mult value for accuracy,
+ * but have to make sure no overflows will ever happen.
+ *
+ * reciprocal_value() is not used here it doesn't handle 64-bit values.
+ */
+static void psched_ratecfg_precompute__(u64 rate, u32 *mult, u8 *shift)
+{
+	u64 factor = NSEC_PER_SEC;
+
+	*mult = 1;
+	*shift = 0;
+
+	if (rate <= 0)
+		return;
+
+	for (;;) {
+		*mult = div64_u64(factor, rate);
+		if (*mult & (1U << 31) || factor & (1ULL << 63))
+			break;
+		factor <<= 1;
+		(*shift)++;
+	}
+}
+
 void psched_ratecfg_precompute(struct psched_ratecfg *r,
 			       const struct tc_ratespec *conf,
 			       u64 rate64)
@@ -1333,34 +1375,17 @@ void psched_ratecfg_precompute(struct psched_ratecfg *r,
 	r->overhead = conf->overhead;
 	r->rate_bytes_ps = max_t(u64, conf->rate, rate64);
 	r->linklayer = (conf->linklayer & TC_LINKLAYER_MASK);
-	r->mult = 1;
-	/*
-	 * The deal here is to replace a divide by a reciprocal one
-	 * in fast path (a reciprocal divide is a multiply and a shift)
-	 *
-	 * Normal formula would be :
-	 *  time_in_ns = (NSEC_PER_SEC * len) / rate_bps
-	 *
-	 * We compute mult/shift to use instead :
-	 *  time_in_ns = (len * mult) >> shift;
-	 *
-	 * We try to get the highest possible mult value for accuracy,
-	 * but have to make sure no overflows will ever happen.
-	 */
-	if (r->rate_bytes_ps > 0) {
-		u64 factor = NSEC_PER_SEC;
-
-		for (;;) {
-			r->mult = div64_u64(factor, r->rate_bytes_ps);
-			if (r->mult & (1U << 31) || factor & (1ULL << 63))
-				break;
-			factor <<= 1;
-			r->shift++;
-		}
-	}
+	psched_ratecfg_precompute__(r->rate_bytes_ps, &r->mult, &r->shift);
 }
 EXPORT_SYMBOL(psched_ratecfg_precompute);
 
+void psched_ppscfg_precompute(struct psched_pktrate *r, u64 pktrate64)
+{
+	r->rate_pkts_ps = pktrate64;
+	psched_ratecfg_precompute__(r->rate_pkts_ps, &r->mult, &r->shift);
+}
+EXPORT_SYMBOL(psched_ppscfg_precompute);
+
 static void mini_qdisc_rcu_func(struct rcu_head *head)
 {
 }
-- 
cgit v1.2.3-71-gd317


From 07e1a5809b595df6e125504dff6245cb2c8ed3de Mon Sep 17 00:00:00 2001
From: Ido Schimmel <idosch@nvidia.com>
Date: Sun, 14 Mar 2021 14:19:31 +0200
Subject: psample: Add additional metadata attributes

Extend psample to report the following attributes when available:

* Output traffic class as a 16-bit value
* Output traffic class occupancy in bytes as a 64-bit value
* End-to-end latency of the packet in nanoseconds resolution
* Software timestamp in nanoseconds resolution (always available)
* Packet's protocol. Needed for packet dissection in user space (always
  available)

Signed-off-by: Ido Schimmel <idosch@nvidia.com>
Reviewed-by: Jiri Pirko <jiri@nvidia.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 include/net/psample.h        |  7 +++++++
 include/uapi/linux/psample.h |  7 +++++++
 net/psample/psample.c        | 39 ++++++++++++++++++++++++++++++++++++++-
 3 files changed, 52 insertions(+), 1 deletion(-)

(limited to 'include/uapi/linux')

diff --git a/include/net/psample.h b/include/net/psample.h
index ac6dbfb3870d..e328c5127757 100644
--- a/include/net/psample.h
+++ b/include/net/psample.h
@@ -18,6 +18,13 @@ struct psample_metadata {
 	u32 trunc_size;
 	int in_ifindex;
 	int out_ifindex;
+	u16 out_tc;
+	u64 out_tc_occ;	/* bytes */
+	u64 latency;	/* nanoseconds */
+	u8 out_tc_valid:1,
+	   out_tc_occ_valid:1,
+	   latency_valid:1,
+	   unused:5;
 };
 
 struct psample_group *psample_group_get(struct net *net, u32 group_num);
diff --git a/include/uapi/linux/psample.h b/include/uapi/linux/psample.h
index aea26ab1431c..c6329f71b939 100644
--- a/include/uapi/linux/psample.h
+++ b/include/uapi/linux/psample.h
@@ -16,6 +16,13 @@ enum {
 	/* commands attributes */
 	PSAMPLE_ATTR_GROUP_REFCOUNT,
 
+	PSAMPLE_ATTR_PAD,
+	PSAMPLE_ATTR_OUT_TC,		/* u16 */
+	PSAMPLE_ATTR_OUT_TC_OCC,	/* u64, bytes */
+	PSAMPLE_ATTR_LATENCY,		/* u64, nanoseconds */
+	PSAMPLE_ATTR_TIMESTAMP,		/* u64, nanoseconds */
+	PSAMPLE_ATTR_PROTO,		/* u16 */
+
 	__PSAMPLE_ATTR_MAX
 };
 
diff --git a/net/psample/psample.c b/net/psample/psample.c
index 065bc887d239..118d5d2a81a0 100644
--- a/net/psample/psample.c
+++ b/net/psample/psample.c
@@ -8,6 +8,7 @@
 #include <linux/kernel.h>
 #include <linux/skbuff.h>
 #include <linux/module.h>
+#include <linux/timekeeping.h>
 #include <net/net_namespace.h>
 #include <net/sock.h>
 #include <net/netlink.h>
@@ -358,6 +359,7 @@ static int psample_tunnel_meta_len(struct ip_tunnel_info *tun_info)
 void psample_sample_packet(struct psample_group *group, struct sk_buff *skb,
 			   u32 sample_rate, const struct psample_metadata *md)
 {
+	ktime_t tstamp = ktime_get_real();
 	int out_ifindex = md->out_ifindex;
 	int in_ifindex = md->in_ifindex;
 	u32 trunc_size = md->trunc_size;
@@ -372,10 +374,15 @@ void psample_sample_packet(struct psample_group *group, struct sk_buff *skb,
 
 	meta_len = (in_ifindex ? nla_total_size(sizeof(u16)) : 0) +
 		   (out_ifindex ? nla_total_size(sizeof(u16)) : 0) +
+		   (md->out_tc_valid ? nla_total_size(sizeof(u16)) : 0) +
+		   (md->out_tc_occ_valid ? nla_total_size_64bit(sizeof(u64)) : 0) +
+		   (md->latency_valid ? nla_total_size_64bit(sizeof(u64)) : 0) +
 		   nla_total_size(sizeof(u32)) +	/* sample_rate */
 		   nla_total_size(sizeof(u32)) +	/* orig_size */
 		   nla_total_size(sizeof(u32)) +	/* group_num */
-		   nla_total_size(sizeof(u32));		/* seq */
+		   nla_total_size(sizeof(u32)) +	/* seq */
+		   nla_total_size_64bit(sizeof(u64)) +	/* timestamp */
+		   nla_total_size(sizeof(u16));		/* protocol */
 
 #ifdef CONFIG_INET
 	tun_info = skb_tunnel_info(skb);
@@ -425,6 +432,36 @@ void psample_sample_packet(struct psample_group *group, struct sk_buff *skb,
 	if (unlikely(ret < 0))
 		goto error;
 
+	if (md->out_tc_valid) {
+		ret = nla_put_u16(nl_skb, PSAMPLE_ATTR_OUT_TC, md->out_tc);
+		if (unlikely(ret < 0))
+			goto error;
+	}
+
+	if (md->out_tc_occ_valid) {
+		ret = nla_put_u64_64bit(nl_skb, PSAMPLE_ATTR_OUT_TC_OCC,
+					md->out_tc_occ, PSAMPLE_ATTR_PAD);
+		if (unlikely(ret < 0))
+			goto error;
+	}
+
+	if (md->latency_valid) {
+		ret = nla_put_u64_64bit(nl_skb, PSAMPLE_ATTR_LATENCY,
+					md->latency, PSAMPLE_ATTR_PAD);
+		if (unlikely(ret < 0))
+			goto error;
+	}
+
+	ret = nla_put_u64_64bit(nl_skb, PSAMPLE_ATTR_TIMESTAMP,
+				ktime_to_ns(tstamp), PSAMPLE_ATTR_PAD);
+	if (unlikely(ret < 0))
+		goto error;
+
+	ret = nla_put_u16(nl_skb, PSAMPLE_ATTR_PROTO,
+			  be16_to_cpu(skb->protocol));
+	if (unlikely(ret < 0))
+		goto error;
+
 	if (data_len) {
 		int nla_len = nla_total_size(data_len);
 		struct nlattr *nla;
-- 
cgit v1.2.3-71-gd317


From f8425c9396639cc462bcce44b1051f8b4e62fddb Mon Sep 17 00:00:00 2001
From: Alessio Balsini <balsini@android.com>
Date: Mon, 25 Jan 2021 15:30:51 +0000
Subject: fuse: 32-bit user space ioctl compat for fuse device

With a 64-bit kernel build the FUSE device cannot handle ioctl requests
coming from 32-bit user space.  This is due to the ioctl command
translation that generates different command identifiers that thus cannot
be used for direct comparisons without proper manipulation.

Explicitly extract type and number from the ioctl command to enable 32-bit
user space compatibility on 64-bit kernel builds.

Signed-off-by: Alessio Balsini <balsini@android.com>
Signed-off-by: Miklos Szeredi <mszeredi@redhat.com>
---
 fs/fuse/dev.c             | 26 ++++++++++++++++----------
 include/uapi/linux/fuse.h |  3 ++-
 2 files changed, 18 insertions(+), 11 deletions(-)

(limited to 'include/uapi/linux')

diff --git a/fs/fuse/dev.c b/fs/fuse/dev.c
index c6636b4c4ccf..c0fee830a34e 100644
--- a/fs/fuse/dev.c
+++ b/fs/fuse/dev.c
@@ -2229,19 +2229,21 @@ static int fuse_device_clone(struct fuse_conn *fc, struct file *new)
 static long fuse_dev_ioctl(struct file *file, unsigned int cmd,
 			   unsigned long arg)
 {
-	int err = -ENOTTY;
+	int res;
+	int oldfd;
+	struct fuse_dev *fud = NULL;
 
-	if (cmd == FUSE_DEV_IOC_CLONE) {
-		int oldfd;
+	if (_IOC_TYPE(cmd) != FUSE_DEV_IOC_MAGIC)
+		return -ENOTTY;
 
-		err = -EFAULT;
-		if (!get_user(oldfd, (__u32 __user *) arg)) {
+	switch (_IOC_NR(cmd)) {
+	case _IOC_NR(FUSE_DEV_IOC_CLONE):
+		res = -EFAULT;
+		if (!get_user(oldfd, (__u32 __user *)arg)) {
 			struct file *old = fget(oldfd);
 
-			err = -EINVAL;
+			res = -EINVAL;
 			if (old) {
-				struct fuse_dev *fud = NULL;
-
 				/*
 				 * Check against file->f_op because CUSE
 				 * uses the same ioctl handler.
@@ -2252,14 +2254,18 @@ static long fuse_dev_ioctl(struct file *file, unsigned int cmd,
 
 				if (fud) {
 					mutex_lock(&fuse_mutex);
-					err = fuse_device_clone(fud->fc, file);
+					res = fuse_device_clone(fud->fc, file);
 					mutex_unlock(&fuse_mutex);
 				}
 				fput(old);
 			}
 		}
+		break;
+	default:
+		res = -ENOTTY;
+		break;
 	}
-	return err;
+	return res;
 }
 
 const struct file_operations fuse_dev_operations = {
diff --git a/include/uapi/linux/fuse.h b/include/uapi/linux/fuse.h
index 98ca64d1beb6..54442612c48b 100644
--- a/include/uapi/linux/fuse.h
+++ b/include/uapi/linux/fuse.h
@@ -903,7 +903,8 @@ struct fuse_notify_retrieve_in {
 };
 
 /* Device ioctls: */
-#define FUSE_DEV_IOC_CLONE	_IOR(229, 0, uint32_t)
+#define FUSE_DEV_IOC_MAGIC		229
+#define FUSE_DEV_IOC_CLONE		_IOR(FUSE_DEV_IOC_MAGIC, 0, uint32_t)
 
 struct fuse_lseek_in {
 	uint64_t	fh;
-- 
cgit v1.2.3-71-gd317


From 90f093fa8ea48e5d991332cee160b761423d55c1 Mon Sep 17 00:00:00 2001
From: Piotr Figiel <figiel@google.com>
Date: Fri, 26 Feb 2021 14:51:56 +0100
Subject: rseq, ptrace: Add PTRACE_GET_RSEQ_CONFIGURATION request

For userspace checkpoint and restore (C/R) a way of getting process state
containing RSEQ configuration is needed.

There are two ways this information is going to be used:
 - to re-enable RSEQ for threads which had it enabled before C/R
 - to detect if a thread was in a critical section during C/R

Since C/R preserves TLS memory and addresses RSEQ ABI will be restored
using the address registered before C/R.

Detection whether the thread is in a critical section during C/R is needed
to enforce behavior of RSEQ abort during C/R. Attaching with ptrace()
before registers are dumped itself doesn't cause RSEQ abort.
Restoring the instruction pointer within the critical section is
problematic because rseq_cs may get cleared before the control is passed
to the migrated application code leading to RSEQ invariants not being
preserved. C/R code will use RSEQ ABI address to find the abort handler
to which the instruction pointer needs to be set.

To achieve above goals expose the RSEQ ABI address and the signature value
with the new ptrace request PTRACE_GET_RSEQ_CONFIGURATION.

This new ptrace request can also be used by debuggers so they are aware
of stops within restartable sequences in progress.

Signed-off-by: Piotr Figiel <figiel@google.com>
Signed-off-by: Peter Zijlstra (Intel) <peterz@infradead.org>
Signed-off-by: Thomas Gleixner <tglx@linutronix.de>
Reviewed-by: Michal Miroslaw <emmir@google.com>
Reviewed-by: Mathieu Desnoyers <mathieu.desnoyers@efficios.com>
Acked-by: Oleg Nesterov <oleg@redhat.com>
Link: https://lkml.kernel.org/r/20210226135156.1081606-1-figiel@google.com
---
 include/uapi/linux/ptrace.h | 10 ++++++++++
 kernel/ptrace.c             | 25 +++++++++++++++++++++++++
 2 files changed, 35 insertions(+)

(limited to 'include/uapi/linux')

diff --git a/include/uapi/linux/ptrace.h b/include/uapi/linux/ptrace.h
index 83ee45fa634b..3747bf816f9a 100644
--- a/include/uapi/linux/ptrace.h
+++ b/include/uapi/linux/ptrace.h
@@ -102,6 +102,16 @@ struct ptrace_syscall_info {
 	};
 };
 
+#define PTRACE_GET_RSEQ_CONFIGURATION	0x420f
+
+struct ptrace_rseq_configuration {
+	__u64 rseq_abi_pointer;
+	__u32 rseq_abi_size;
+	__u32 signature;
+	__u32 flags;
+	__u32 pad;
+};
+
 /*
  * These values are stored in task->ptrace_message
  * by tracehook_report_syscall_* to describe the current syscall-stop.
diff --git a/kernel/ptrace.c b/kernel/ptrace.c
index 821cf1723814..c71270a1677c 100644
--- a/kernel/ptrace.c
+++ b/kernel/ptrace.c
@@ -31,6 +31,7 @@
 #include <linux/cn_proc.h>
 #include <linux/compat.h>
 #include <linux/sched/signal.h>
+#include <linux/minmax.h>
 
 #include <asm/syscall.h>	/* for syscall_get_* */
 
@@ -779,6 +780,24 @@ static int ptrace_peek_siginfo(struct task_struct *child,
 	return ret;
 }
 
+#ifdef CONFIG_RSEQ
+static long ptrace_get_rseq_configuration(struct task_struct *task,
+					  unsigned long size, void __user *data)
+{
+	struct ptrace_rseq_configuration conf = {
+		.rseq_abi_pointer = (u64)(uintptr_t)task->rseq,
+		.rseq_abi_size = sizeof(*task->rseq),
+		.signature = task->rseq_sig,
+		.flags = 0,
+	};
+
+	size = min_t(unsigned long, size, sizeof(conf));
+	if (copy_to_user(data, &conf, size))
+		return -EFAULT;
+	return sizeof(conf);
+}
+#endif
+
 #ifdef PTRACE_SINGLESTEP
 #define is_singlestep(request)		((request) == PTRACE_SINGLESTEP)
 #else
@@ -1222,6 +1241,12 @@ int ptrace_request(struct task_struct *child, long request,
 		ret = seccomp_get_metadata(child, addr, datavp);
 		break;
 
+#ifdef CONFIG_RSEQ
+	case PTRACE_GET_RSEQ_CONFIGURATION:
+		ret = ptrace_get_rseq_configuration(child, addr, datavp);
+		break;
+#endif
+
 	default:
 		break;
 	}
-- 
cgit v1.2.3-71-gd317


From 1d609992832e900378b305f9f8dcf0ce8473049e Mon Sep 17 00:00:00 2001
From: Maximilian Luz <luzmaximilian@gmail.com>
Date: Mon, 8 Mar 2021 19:48:17 +0100
Subject: platform/surface: Add DTX driver

The Microsoft Surface Book series devices consist of a so-called
clipboard part (containing the CPU, touchscreen, and primary battery)
and a base part (containing keyboard, secondary battery, and optional
discrete GPU). These parts can be separated, i.e. the clipboard can be
detached and used as tablet.

This detachment process is initiated by pressing a button. On the
Surface Book 2 and 3 (targeted with this commit), the Surface Aggregator
Module (i.e. the embedded controller on those devices) attempts to send
a notification to any listening client driver and waits for further
instructions (i.e. whether the detachment process should continue or be
aborted). If it does not receive a response in a certain time-frame, the
detachment process (by default) continues and the clipboard can be
physically separated. In other words, (by default and) without a driver,
the detachment process takes about 10 seconds to complete.

This commit introduces a driver for this detachment system (called DTX).
This driver allows a user-space daemon to control and influence the
detachment behavior. Specifically, it forwards any detachment requests
to user-space, allows user-space to make such requests itself, and
allows handling of those requests. Requests can be handled by either
aborting, continuing/allowing, or delaying (i.e. resetting the timeout
via a heartbeat commend). The user-space API is implemented via the
/dev/surface/dtx miscdevice.

In addition, user-space can change the default behavior on timeout from
allowing detachment to disallowing it, which is useful if the (optional)
discrete GPU is in use.

Furthermore, this driver allows user-space to receive notifications
about the state of the base, specifically when it is physically removed
(as opposed to detachment requested), in what manner it is connected
(i.e. in reverse-/tent-/studio- or laptop-mode), and what type of base
is connected. Based on this information, the driver also provides a
simple tablet-mode switch (aliasing all modes without keyboard access,
i.e. tablet-mode and studio-mode to its reported tablet-mode).

An implementation of such a user-space daemon, allowing configuration of
detachment behavior via scripts (e.g. safely unmounting USB devices
connected to the base before continuing) can be found at [1].

[1]: https://github.com/linux-surface/surface-dtx-daemon

Signed-off-by: Maximilian Luz <luzmaximilian@gmail.com>
Link: https://lore.kernel.org/r/20210308184819.437438-2-luzmaximilian@gmail.com
Signed-off-by: Hans de Goede <hdegoede@redhat.com>
---
 Documentation/userspace-api/ioctl/ioctl-number.rst |    2 +
 MAINTAINERS                                        |    7 +
 drivers/platform/surface/Kconfig                   |   16 +
 drivers/platform/surface/Makefile                  |    1 +
 drivers/platform/surface/surface_dtx.c             | 1201 ++++++++++++++++++++
 include/uapi/linux/surface_aggregator/dtx.h        |  146 +++
 6 files changed, 1373 insertions(+)
 create mode 100644 drivers/platform/surface/surface_dtx.c
 create mode 100644 include/uapi/linux/surface_aggregator/dtx.h

(limited to 'include/uapi/linux')

diff --git a/Documentation/userspace-api/ioctl/ioctl-number.rst b/Documentation/userspace-api/ioctl/ioctl-number.rst
index 599bd4493944..1c28b8ef6677 100644
--- a/Documentation/userspace-api/ioctl/ioctl-number.rst
+++ b/Documentation/userspace-api/ioctl/ioctl-number.rst
@@ -327,6 +327,8 @@ Code  Seq#    Include File                                           Comments
 0xA4  00-1F  uapi/asm/sgx.h                                          <mailto:linux-sgx@vger.kernel.org>
 0xA5  01     linux/surface_aggregator/cdev.h                         Microsoft Surface Platform System Aggregator
                                                                      <mailto:luzmaximilian@gmail.com>
+0xA5  20-2F  linux/surface_aggregator/dtx.h                          Microsoft Surface DTX driver
+                                                                     <mailto:luzmaximilian@gmail.com>
 0xAA  00-3F  linux/uapi/linux/userfaultfd.h
 0xAB  00-1F  linux/nbd.h
 0xAC  00-1F  linux/raw.h
diff --git a/MAINTAINERS b/MAINTAINERS
index cf4cb8892623..adfc3a437db7 100644
--- a/MAINTAINERS
+++ b/MAINTAINERS
@@ -11861,6 +11861,13 @@ F:	drivers/scsi/smartpqi/smartpqi*.[ch]
 F:	include/linux/cciss*.h
 F:	include/uapi/linux/cciss*.h
 
+MICROSOFT SURFACE DTX DRIVER
+M:	Maximilian Luz <luzmaximilian@gmail.com>
+L:	platform-driver-x86@vger.kernel.org
+S:	Maintained
+F:	drivers/platform/surface/surface_dtx.c
+F:	include/uapi/linux/surface_aggregator/dtx.h
+
 MICROSOFT SURFACE GPE LID SUPPORT DRIVER
 M:	Maximilian Luz <luzmaximilian@gmail.com>
 L:	platform-driver-x86@vger.kernel.org
diff --git a/drivers/platform/surface/Kconfig b/drivers/platform/surface/Kconfig
index a045425026ae..98cf564fb17a 100644
--- a/drivers/platform/surface/Kconfig
+++ b/drivers/platform/surface/Kconfig
@@ -104,6 +104,22 @@ config SURFACE_AGGREGATOR_REGISTRY
 	  the respective client devices. Drivers for these devices still need to
 	  be selected via the other options.
 
+config SURFACE_DTX
+	tristate "Surface DTX (Detachment System) Driver"
+	depends on SURFACE_AGGREGATOR
+	depends on INPUT
+	help
+	  Driver for the Surface Book clipboard detachment system (DTX).
+
+	  On the Surface Book series devices, the display part containing the
+	  CPU (called the clipboard) can be detached from the base (containing a
+	  battery, the keyboard, and, optionally, a discrete GPU) by (if
+	  necessary) unlocking and opening the latch connecting both parts.
+
+	  This driver provides a user-space interface that can influence the
+	  behavior of this process, which includes the option to abort it in
+	  case the base is still in use or speed it up in case it is not.
+
 config SURFACE_GPE
 	tristate "Surface GPE/Lid Support Driver"
 	depends on DMI
diff --git a/drivers/platform/surface/Makefile b/drivers/platform/surface/Makefile
index 99372c427b73..32889482de55 100644
--- a/drivers/platform/surface/Makefile
+++ b/drivers/platform/surface/Makefile
@@ -11,6 +11,7 @@ obj-$(CONFIG_SURFACE_ACPI_NOTIFY)	+= surface_acpi_notify.o
 obj-$(CONFIG_SURFACE_AGGREGATOR)	+= aggregator/
 obj-$(CONFIG_SURFACE_AGGREGATOR_CDEV)	+= surface_aggregator_cdev.o
 obj-$(CONFIG_SURFACE_AGGREGATOR_REGISTRY) += surface_aggregator_registry.o
+obj-$(CONFIG_SURFACE_DTX)		+= surface_dtx.o
 obj-$(CONFIG_SURFACE_GPE)		+= surface_gpe.o
 obj-$(CONFIG_SURFACE_HOTPLUG)		+= surface_hotplug.o
 obj-$(CONFIG_SURFACE_PLATFORM_PROFILE)	+= surface_platform_profile.o
diff --git a/drivers/platform/surface/surface_dtx.c b/drivers/platform/surface/surface_dtx.c
new file mode 100644
index 000000000000..1301fab0ea14
--- /dev/null
+++ b/drivers/platform/surface/surface_dtx.c
@@ -0,0 +1,1201 @@
+// SPDX-License-Identifier: GPL-2.0+
+/*
+ * Surface Book (gen. 2 and later) detachment system (DTX) driver.
+ *
+ * Provides a user-space interface to properly handle clipboard/tablet
+ * (containing screen and processor) detachment from the base of the device
+ * (containing the keyboard and optionally a discrete GPU). Allows to
+ * acknowledge (to speed things up), abort (e.g. in case the dGPU is still in
+ * use), or request detachment via user-space.
+ *
+ * Copyright (C) 2019-2021 Maximilian Luz <luzmaximilian@gmail.com>
+ */
+
+#include <linux/fs.h>
+#include <linux/input.h>
+#include <linux/ioctl.h>
+#include <linux/kernel.h>
+#include <linux/kfifo.h>
+#include <linux/kref.h>
+#include <linux/miscdevice.h>
+#include <linux/module.h>
+#include <linux/mutex.h>
+#include <linux/platform_device.h>
+#include <linux/poll.h>
+#include <linux/rwsem.h>
+#include <linux/slab.h>
+#include <linux/workqueue.h>
+
+#include <linux/surface_aggregator/controller.h>
+#include <linux/surface_aggregator/dtx.h>
+
+
+/* -- SSAM interface. ------------------------------------------------------- */
+
+enum sam_event_cid_bas {
+	SAM_EVENT_CID_DTX_CONNECTION			= 0x0c,
+	SAM_EVENT_CID_DTX_REQUEST			= 0x0e,
+	SAM_EVENT_CID_DTX_CANCEL			= 0x0f,
+	SAM_EVENT_CID_DTX_LATCH_STATUS			= 0x11,
+};
+
+enum ssam_bas_base_state {
+	SSAM_BAS_BASE_STATE_DETACH_SUCCESS		= 0x00,
+	SSAM_BAS_BASE_STATE_ATTACHED			= 0x01,
+	SSAM_BAS_BASE_STATE_NOT_FEASIBLE		= 0x02,
+};
+
+enum ssam_bas_latch_status {
+	SSAM_BAS_LATCH_STATUS_CLOSED			= 0x00,
+	SSAM_BAS_LATCH_STATUS_OPENED			= 0x01,
+	SSAM_BAS_LATCH_STATUS_FAILED_TO_OPEN		= 0x02,
+	SSAM_BAS_LATCH_STATUS_FAILED_TO_REMAIN_OPEN	= 0x03,
+	SSAM_BAS_LATCH_STATUS_FAILED_TO_CLOSE		= 0x04,
+};
+
+enum ssam_bas_cancel_reason {
+	SSAM_BAS_CANCEL_REASON_NOT_FEASIBLE		= 0x00,  /* Low battery. */
+	SSAM_BAS_CANCEL_REASON_TIMEOUT			= 0x02,
+	SSAM_BAS_CANCEL_REASON_FAILED_TO_OPEN		= 0x03,
+	SSAM_BAS_CANCEL_REASON_FAILED_TO_REMAIN_OPEN	= 0x04,
+	SSAM_BAS_CANCEL_REASON_FAILED_TO_CLOSE		= 0x05,
+};
+
+struct ssam_bas_base_info {
+	u8 state;
+	u8 base_id;
+} __packed;
+
+static_assert(sizeof(struct ssam_bas_base_info) == 2);
+
+SSAM_DEFINE_SYNC_REQUEST_N(ssam_bas_latch_lock, {
+	.target_category = SSAM_SSH_TC_BAS,
+	.target_id       = 0x01,
+	.command_id      = 0x06,
+	.instance_id     = 0x00,
+});
+
+SSAM_DEFINE_SYNC_REQUEST_N(ssam_bas_latch_unlock, {
+	.target_category = SSAM_SSH_TC_BAS,
+	.target_id       = 0x01,
+	.command_id      = 0x07,
+	.instance_id     = 0x00,
+});
+
+SSAM_DEFINE_SYNC_REQUEST_N(ssam_bas_latch_request, {
+	.target_category = SSAM_SSH_TC_BAS,
+	.target_id       = 0x01,
+	.command_id      = 0x08,
+	.instance_id     = 0x00,
+});
+
+SSAM_DEFINE_SYNC_REQUEST_N(ssam_bas_latch_confirm, {
+	.target_category = SSAM_SSH_TC_BAS,
+	.target_id       = 0x01,
+	.command_id      = 0x09,
+	.instance_id     = 0x00,
+});
+
+SSAM_DEFINE_SYNC_REQUEST_N(ssam_bas_latch_heartbeat, {
+	.target_category = SSAM_SSH_TC_BAS,
+	.target_id       = 0x01,
+	.command_id      = 0x0a,
+	.instance_id     = 0x00,
+});
+
+SSAM_DEFINE_SYNC_REQUEST_N(ssam_bas_latch_cancel, {
+	.target_category = SSAM_SSH_TC_BAS,
+	.target_id       = 0x01,
+	.command_id      = 0x0b,
+	.instance_id     = 0x00,
+});
+
+SSAM_DEFINE_SYNC_REQUEST_R(ssam_bas_get_base, struct ssam_bas_base_info, {
+	.target_category = SSAM_SSH_TC_BAS,
+	.target_id       = 0x01,
+	.command_id      = 0x0c,
+	.instance_id     = 0x00,
+});
+
+SSAM_DEFINE_SYNC_REQUEST_R(ssam_bas_get_device_mode, u8, {
+	.target_category = SSAM_SSH_TC_BAS,
+	.target_id       = 0x01,
+	.command_id      = 0x0d,
+	.instance_id     = 0x00,
+});
+
+SSAM_DEFINE_SYNC_REQUEST_R(ssam_bas_get_latch_status, u8, {
+	.target_category = SSAM_SSH_TC_BAS,
+	.target_id       = 0x01,
+	.command_id      = 0x11,
+	.instance_id     = 0x00,
+});
+
+
+/* -- Main structures. ------------------------------------------------------ */
+
+enum sdtx_device_state {
+	SDTX_DEVICE_SHUTDOWN_BIT    = BIT(0),
+	SDTX_DEVICE_DIRTY_BASE_BIT  = BIT(1),
+	SDTX_DEVICE_DIRTY_MODE_BIT  = BIT(2),
+	SDTX_DEVICE_DIRTY_LATCH_BIT = BIT(3),
+};
+
+struct sdtx_device {
+	struct kref kref;
+	struct rw_semaphore lock;         /* Guards device and controller reference. */
+
+	struct device *dev;
+	struct ssam_controller *ctrl;
+	unsigned long flags;
+
+	struct miscdevice mdev;
+	wait_queue_head_t waitq;
+	struct mutex write_lock;          /* Guards order of events/notifications. */
+	struct rw_semaphore client_lock;  /* Guards client list.                   */
+	struct list_head client_list;
+
+	struct delayed_work state_work;
+	struct {
+		struct ssam_bas_base_info base;
+		u8 device_mode;
+		u8 latch_status;
+	} state;
+
+	struct delayed_work mode_work;
+	struct input_dev *mode_switch;
+
+	struct ssam_event_notifier notif;
+};
+
+enum sdtx_client_state {
+	SDTX_CLIENT_EVENTS_ENABLED_BIT = BIT(0),
+};
+
+struct sdtx_client {
+	struct sdtx_device *ddev;
+	struct list_head node;
+	unsigned long flags;
+
+	struct fasync_struct *fasync;
+
+	struct mutex read_lock;           /* Guards FIFO buffer read access. */
+	DECLARE_KFIFO(buffer, u8, 512);
+};
+
+static void __sdtx_device_release(struct kref *kref)
+{
+	struct sdtx_device *ddev = container_of(kref, struct sdtx_device, kref);
+
+	mutex_destroy(&ddev->write_lock);
+	kfree(ddev);
+}
+
+static struct sdtx_device *sdtx_device_get(struct sdtx_device *ddev)
+{
+	if (ddev)
+		kref_get(&ddev->kref);
+
+	return ddev;
+}
+
+static void sdtx_device_put(struct sdtx_device *ddev)
+{
+	if (ddev)
+		kref_put(&ddev->kref, __sdtx_device_release);
+}
+
+
+/* -- Firmware value translations. ------------------------------------------ */
+
+static u16 sdtx_translate_base_state(struct sdtx_device *ddev, u8 state)
+{
+	switch (state) {
+	case SSAM_BAS_BASE_STATE_ATTACHED:
+		return SDTX_BASE_ATTACHED;
+
+	case SSAM_BAS_BASE_STATE_DETACH_SUCCESS:
+		return SDTX_BASE_DETACHED;
+
+	case SSAM_BAS_BASE_STATE_NOT_FEASIBLE:
+		return SDTX_DETACH_NOT_FEASIBLE;
+
+	default:
+		dev_err(ddev->dev, "unknown base state: %#04x\n", state);
+		return SDTX_UNKNOWN(state);
+	}
+}
+
+static u16 sdtx_translate_latch_status(struct sdtx_device *ddev, u8 status)
+{
+	switch (status) {
+	case SSAM_BAS_LATCH_STATUS_CLOSED:
+		return SDTX_LATCH_CLOSED;
+
+	case SSAM_BAS_LATCH_STATUS_OPENED:
+		return SDTX_LATCH_OPENED;
+
+	case SSAM_BAS_LATCH_STATUS_FAILED_TO_OPEN:
+		return SDTX_ERR_FAILED_TO_OPEN;
+
+	case SSAM_BAS_LATCH_STATUS_FAILED_TO_REMAIN_OPEN:
+		return SDTX_ERR_FAILED_TO_REMAIN_OPEN;
+
+	case SSAM_BAS_LATCH_STATUS_FAILED_TO_CLOSE:
+		return SDTX_ERR_FAILED_TO_CLOSE;
+
+	default:
+		dev_err(ddev->dev, "unknown latch status: %#04x\n", status);
+		return SDTX_UNKNOWN(status);
+	}
+}
+
+static u16 sdtx_translate_cancel_reason(struct sdtx_device *ddev, u8 reason)
+{
+	switch (reason) {
+	case SSAM_BAS_CANCEL_REASON_NOT_FEASIBLE:
+		return SDTX_DETACH_NOT_FEASIBLE;
+
+	case SSAM_BAS_CANCEL_REASON_TIMEOUT:
+		return SDTX_DETACH_TIMEDOUT;
+
+	case SSAM_BAS_CANCEL_REASON_FAILED_TO_OPEN:
+		return SDTX_ERR_FAILED_TO_OPEN;
+
+	case SSAM_BAS_CANCEL_REASON_FAILED_TO_REMAIN_OPEN:
+		return SDTX_ERR_FAILED_TO_REMAIN_OPEN;
+
+	case SSAM_BAS_CANCEL_REASON_FAILED_TO_CLOSE:
+		return SDTX_ERR_FAILED_TO_CLOSE;
+
+	default:
+		dev_err(ddev->dev, "unknown cancel reason: %#04x\n", reason);
+		return SDTX_UNKNOWN(reason);
+	}
+}
+
+
+/* -- IOCTLs. --------------------------------------------------------------- */
+
+static int sdtx_ioctl_get_base_info(struct sdtx_device *ddev,
+				    struct sdtx_base_info __user *buf)
+{
+	struct ssam_bas_base_info raw;
+	struct sdtx_base_info info;
+	int status;
+
+	lockdep_assert_held_read(&ddev->lock);
+
+	status = ssam_retry(ssam_bas_get_base, ddev->ctrl, &raw);
+	if (status < 0)
+		return status;
+
+	info.state = sdtx_translate_base_state(ddev, raw.state);
+	info.base_id = SDTX_BASE_TYPE_SSH(raw.base_id);
+
+	if (copy_to_user(buf, &info, sizeof(info)))
+		return -EFAULT;
+
+	return 0;
+}
+
+static int sdtx_ioctl_get_device_mode(struct sdtx_device *ddev, u16 __user *buf)
+{
+	u8 mode;
+	int status;
+
+	lockdep_assert_held_read(&ddev->lock);
+
+	status = ssam_retry(ssam_bas_get_device_mode, ddev->ctrl, &mode);
+	if (status < 0)
+		return status;
+
+	return put_user(mode, buf);
+}
+
+static int sdtx_ioctl_get_latch_status(struct sdtx_device *ddev, u16 __user *buf)
+{
+	u8 latch;
+	int status;
+
+	lockdep_assert_held_read(&ddev->lock);
+
+	status = ssam_retry(ssam_bas_get_latch_status, ddev->ctrl, &latch);
+	if (status < 0)
+		return status;
+
+	return put_user(sdtx_translate_latch_status(ddev, latch), buf);
+}
+
+static long __surface_dtx_ioctl(struct sdtx_client *client, unsigned int cmd, unsigned long arg)
+{
+	struct sdtx_device *ddev = client->ddev;
+
+	lockdep_assert_held_read(&ddev->lock);
+
+	switch (cmd) {
+	case SDTX_IOCTL_EVENTS_ENABLE:
+		set_bit(SDTX_CLIENT_EVENTS_ENABLED_BIT, &client->flags);
+		return 0;
+
+	case SDTX_IOCTL_EVENTS_DISABLE:
+		clear_bit(SDTX_CLIENT_EVENTS_ENABLED_BIT, &client->flags);
+		return 0;
+
+	case SDTX_IOCTL_LATCH_LOCK:
+		return ssam_retry(ssam_bas_latch_lock, ddev->ctrl);
+
+	case SDTX_IOCTL_LATCH_UNLOCK:
+		return ssam_retry(ssam_bas_latch_unlock, ddev->ctrl);
+
+	case SDTX_IOCTL_LATCH_REQUEST:
+		return ssam_retry(ssam_bas_latch_request, ddev->ctrl);
+
+	case SDTX_IOCTL_LATCH_CONFIRM:
+		return ssam_retry(ssam_bas_latch_confirm, ddev->ctrl);
+
+	case SDTX_IOCTL_LATCH_HEARTBEAT:
+		return ssam_retry(ssam_bas_latch_heartbeat, ddev->ctrl);
+
+	case SDTX_IOCTL_LATCH_CANCEL:
+		return ssam_retry(ssam_bas_latch_cancel, ddev->ctrl);
+
+	case SDTX_IOCTL_GET_BASE_INFO:
+		return sdtx_ioctl_get_base_info(ddev, (struct sdtx_base_info __user *)arg);
+
+	case SDTX_IOCTL_GET_DEVICE_MODE:
+		return sdtx_ioctl_get_device_mode(ddev, (u16 __user *)arg);
+
+	case SDTX_IOCTL_GET_LATCH_STATUS:
+		return sdtx_ioctl_get_latch_status(ddev, (u16 __user *)arg);
+
+	default:
+		return -EINVAL;
+	}
+}
+
+static long surface_dtx_ioctl(struct file *file, unsigned int cmd, unsigned long arg)
+{
+	struct sdtx_client *client = file->private_data;
+	long status;
+
+	if (down_read_killable(&client->ddev->lock))
+		return -ERESTARTSYS;
+
+	if (test_bit(SDTX_DEVICE_SHUTDOWN_BIT, &client->ddev->flags)) {
+		up_read(&client->ddev->lock);
+		return -ENODEV;
+	}
+
+	status = __surface_dtx_ioctl(client, cmd, arg);
+
+	up_read(&client->ddev->lock);
+	return status;
+}
+
+
+/* -- File operations. ------------------------------------------------------ */
+
+static int surface_dtx_open(struct inode *inode, struct file *file)
+{
+	struct sdtx_device *ddev = container_of(file->private_data, struct sdtx_device, mdev);
+	struct sdtx_client *client;
+
+	/* Initialize client. */
+	client = kzalloc(sizeof(*client), GFP_KERNEL);
+	if (!client)
+		return -ENOMEM;
+
+	client->ddev = sdtx_device_get(ddev);
+
+	INIT_LIST_HEAD(&client->node);
+
+	mutex_init(&client->read_lock);
+	INIT_KFIFO(client->buffer);
+
+	file->private_data = client;
+
+	/* Attach client. */
+	down_write(&ddev->client_lock);
+
+	/*
+	 * Do not add a new client if the device has been shut down. Note that
+	 * it's enough to hold the client_lock here as, during shutdown, we
+	 * only acquire that lock and remove clients after marking the device
+	 * as shut down.
+	 */
+	if (test_bit(SDTX_DEVICE_SHUTDOWN_BIT, &ddev->flags)) {
+		up_write(&ddev->client_lock);
+		sdtx_device_put(client->ddev);
+		kfree(client);
+		return -ENODEV;
+	}
+
+	list_add_tail(&client->node, &ddev->client_list);
+	up_write(&ddev->client_lock);
+
+	stream_open(inode, file);
+	return 0;
+}
+
+static int surface_dtx_release(struct inode *inode, struct file *file)
+{
+	struct sdtx_client *client = file->private_data;
+
+	/* Detach client. */
+	down_write(&client->ddev->client_lock);
+	list_del(&client->node);
+	up_write(&client->ddev->client_lock);
+
+	/* Free client. */
+	sdtx_device_put(client->ddev);
+	mutex_destroy(&client->read_lock);
+	kfree(client);
+
+	return 0;
+}
+
+static ssize_t surface_dtx_read(struct file *file, char __user *buf, size_t count, loff_t *offs)
+{
+	struct sdtx_client *client = file->private_data;
+	struct sdtx_device *ddev = client->ddev;
+	unsigned int copied;
+	int status = 0;
+
+	if (down_read_killable(&ddev->lock))
+		return -ERESTARTSYS;
+
+	/* Make sure we're not shut down. */
+	if (test_bit(SDTX_DEVICE_SHUTDOWN_BIT, &ddev->flags)) {
+		up_read(&ddev->lock);
+		return -ENODEV;
+	}
+
+	do {
+		/* Check availability, wait if necessary. */
+		if (kfifo_is_empty(&client->buffer)) {
+			up_read(&ddev->lock);
+
+			if (file->f_flags & O_NONBLOCK)
+				return -EAGAIN;
+
+			status = wait_event_interruptible(ddev->waitq,
+							  !kfifo_is_empty(&client->buffer) ||
+							  test_bit(SDTX_DEVICE_SHUTDOWN_BIT,
+								   &ddev->flags));
+			if (status < 0)
+				return status;
+
+			if (down_read_killable(&client->ddev->lock))
+				return -ERESTARTSYS;
+
+			/* Need to check that we're not shut down again. */
+			if (test_bit(SDTX_DEVICE_SHUTDOWN_BIT, &ddev->flags)) {
+				up_read(&ddev->lock);
+				return -ENODEV;
+			}
+		}
+
+		/* Try to read from FIFO. */
+		if (mutex_lock_interruptible(&client->read_lock)) {
+			up_read(&ddev->lock);
+			return -ERESTARTSYS;
+		}
+
+		status = kfifo_to_user(&client->buffer, buf, count, &copied);
+		mutex_unlock(&client->read_lock);
+
+		if (status < 0) {
+			up_read(&ddev->lock);
+			return status;
+		}
+
+		/* We might not have gotten anything, check this here. */
+		if (copied == 0 && (file->f_flags & O_NONBLOCK)) {
+			up_read(&ddev->lock);
+			return -EAGAIN;
+		}
+	} while (copied == 0);
+
+	up_read(&ddev->lock);
+	return copied;
+}
+
+static __poll_t surface_dtx_poll(struct file *file, struct poll_table_struct *pt)
+{
+	struct sdtx_client *client = file->private_data;
+	__poll_t events = 0;
+
+	if (down_read_killable(&client->ddev->lock))
+		return -ERESTARTSYS;
+
+	if (test_bit(SDTX_DEVICE_SHUTDOWN_BIT, &client->ddev->flags)) {
+		up_read(&client->ddev->lock);
+		return EPOLLHUP | EPOLLERR;
+	}
+
+	poll_wait(file, &client->ddev->waitq, pt);
+
+	if (!kfifo_is_empty(&client->buffer))
+		events |= EPOLLIN | EPOLLRDNORM;
+
+	up_read(&client->ddev->lock);
+	return events;
+}
+
+static int surface_dtx_fasync(int fd, struct file *file, int on)
+{
+	struct sdtx_client *client = file->private_data;
+
+	return fasync_helper(fd, file, on, &client->fasync);
+}
+
+static const struct file_operations surface_dtx_fops = {
+	.owner          = THIS_MODULE,
+	.open           = surface_dtx_open,
+	.release        = surface_dtx_release,
+	.read           = surface_dtx_read,
+	.poll           = surface_dtx_poll,
+	.fasync         = surface_dtx_fasync,
+	.unlocked_ioctl = surface_dtx_ioctl,
+	.compat_ioctl   = surface_dtx_ioctl,
+	.llseek         = no_llseek,
+};
+
+
+/* -- Event handling/forwarding. -------------------------------------------- */
+
+/*
+ * The device operation mode is not immediately updated on the EC when the
+ * base has been connected, i.e. querying the device mode inside the
+ * connection event callback yields an outdated value. Thus, we can only
+ * determine the new tablet-mode switch and device mode values after some
+ * time.
+ *
+ * These delays have been chosen by experimenting. We first delay on connect
+ * events, then check and validate the device mode against the base state and
+ * if invalid delay again by the "recheck" delay.
+ */
+#define SDTX_DEVICE_MODE_DELAY_CONNECT	msecs_to_jiffies(100)
+#define SDTX_DEVICE_MODE_DELAY_RECHECK	msecs_to_jiffies(100)
+
+struct sdtx_status_event {
+	struct sdtx_event e;
+	__u16 v;
+} __packed;
+
+struct sdtx_base_info_event {
+	struct sdtx_event e;
+	struct sdtx_base_info v;
+} __packed;
+
+union sdtx_generic_event {
+	struct sdtx_event common;
+	struct sdtx_status_event status;
+	struct sdtx_base_info_event base;
+};
+
+static void sdtx_update_device_mode(struct sdtx_device *ddev, unsigned long delay);
+
+/* Must be executed with ddev->write_lock held. */
+static void sdtx_push_event(struct sdtx_device *ddev, struct sdtx_event *evt)
+{
+	const size_t len = sizeof(struct sdtx_event) + evt->length;
+	struct sdtx_client *client;
+
+	lockdep_assert_held(&ddev->write_lock);
+
+	down_read(&ddev->client_lock);
+	list_for_each_entry(client, &ddev->client_list, node) {
+		if (!test_bit(SDTX_CLIENT_EVENTS_ENABLED_BIT, &client->flags))
+			continue;
+
+		if (likely(kfifo_avail(&client->buffer) >= len))
+			kfifo_in(&client->buffer, (const u8 *)evt, len);
+		else
+			dev_warn(ddev->dev, "event buffer overrun\n");
+
+		kill_fasync(&client->fasync, SIGIO, POLL_IN);
+	}
+	up_read(&ddev->client_lock);
+
+	wake_up_interruptible(&ddev->waitq);
+}
+
+static u32 sdtx_notifier(struct ssam_event_notifier *nf, const struct ssam_event *in)
+{
+	struct sdtx_device *ddev = container_of(nf, struct sdtx_device, notif);
+	union sdtx_generic_event event;
+	size_t len;
+
+	/* Validate event payload length. */
+	switch (in->command_id) {
+	case SAM_EVENT_CID_DTX_CONNECTION:
+		len = 2 * sizeof(u8);
+		break;
+
+	case SAM_EVENT_CID_DTX_REQUEST:
+		len = 0;
+		break;
+
+	case SAM_EVENT_CID_DTX_CANCEL:
+		len = sizeof(u8);
+		break;
+
+	case SAM_EVENT_CID_DTX_LATCH_STATUS:
+		len = sizeof(u8);
+		break;
+
+	default:
+		return 0;
+	};
+
+	if (in->length != len) {
+		dev_err(ddev->dev,
+			"unexpected payload size for event %#04x: got %u, expected %zu\n",
+			in->command_id, in->length, len);
+		return 0;
+	}
+
+	mutex_lock(&ddev->write_lock);
+
+	/* Translate event. */
+	switch (in->command_id) {
+	case SAM_EVENT_CID_DTX_CONNECTION:
+		clear_bit(SDTX_DEVICE_DIRTY_BASE_BIT, &ddev->flags);
+
+		/* If state has not changed: do not send new event. */
+		if (ddev->state.base.state == in->data[0] &&
+		    ddev->state.base.base_id == in->data[1])
+			goto out;
+
+		ddev->state.base.state = in->data[0];
+		ddev->state.base.base_id = in->data[1];
+
+		event.base.e.length = sizeof(struct sdtx_base_info);
+		event.base.e.code = SDTX_EVENT_BASE_CONNECTION;
+		event.base.v.state = sdtx_translate_base_state(ddev, in->data[0]);
+		event.base.v.base_id = SDTX_BASE_TYPE_SSH(in->data[1]);
+		break;
+
+	case SAM_EVENT_CID_DTX_REQUEST:
+		event.common.code = SDTX_EVENT_REQUEST;
+		event.common.length = 0;
+		break;
+
+	case SAM_EVENT_CID_DTX_CANCEL:
+		event.status.e.length = sizeof(u16);
+		event.status.e.code = SDTX_EVENT_CANCEL;
+		event.status.v = sdtx_translate_cancel_reason(ddev, in->data[0]);
+		break;
+
+	case SAM_EVENT_CID_DTX_LATCH_STATUS:
+		clear_bit(SDTX_DEVICE_DIRTY_LATCH_BIT, &ddev->flags);
+
+		/* If state has not changed: do not send new event. */
+		if (ddev->state.latch_status == in->data[0])
+			goto out;
+
+		ddev->state.latch_status = in->data[0];
+
+		event.status.e.length = sizeof(u16);
+		event.status.e.code = SDTX_EVENT_LATCH_STATUS;
+		event.status.v = sdtx_translate_latch_status(ddev, in->data[0]);
+		break;
+	}
+
+	sdtx_push_event(ddev, &event.common);
+
+	/* Update device mode on base connection change. */
+	if (in->command_id == SAM_EVENT_CID_DTX_CONNECTION) {
+		unsigned long delay;
+
+		delay = in->data[0] ? SDTX_DEVICE_MODE_DELAY_CONNECT : 0;
+		sdtx_update_device_mode(ddev, delay);
+	}
+
+out:
+	mutex_unlock(&ddev->write_lock);
+	return SSAM_NOTIF_HANDLED;
+}
+
+
+/* -- State update functions. ----------------------------------------------- */
+
+static bool sdtx_device_mode_invalid(u8 mode, u8 base_state)
+{
+	return ((base_state == SSAM_BAS_BASE_STATE_ATTACHED) &&
+		(mode == SDTX_DEVICE_MODE_TABLET)) ||
+	       ((base_state == SSAM_BAS_BASE_STATE_DETACH_SUCCESS) &&
+		(mode != SDTX_DEVICE_MODE_TABLET));
+}
+
+static void sdtx_device_mode_workfn(struct work_struct *work)
+{
+	struct sdtx_device *ddev = container_of(work, struct sdtx_device, mode_work.work);
+	struct sdtx_status_event event;
+	struct ssam_bas_base_info base;
+	int status, tablet;
+	u8 mode;
+
+	/* Get operation mode. */
+	status = ssam_retry(ssam_bas_get_device_mode, ddev->ctrl, &mode);
+	if (status) {
+		dev_err(ddev->dev, "failed to get device mode: %d\n", status);
+		return;
+	}
+
+	/* Get base info. */
+	status = ssam_retry(ssam_bas_get_base, ddev->ctrl, &base);
+	if (status) {
+		dev_err(ddev->dev, "failed to get base info: %d\n", status);
+		return;
+	}
+
+	/*
+	 * In some cases (specifically when attaching the base), the device
+	 * mode isn't updated right away. Thus we check if the device mode
+	 * makes sense for the given base state and try again later if it
+	 * doesn't.
+	 */
+	if (sdtx_device_mode_invalid(mode, base.state)) {
+		dev_dbg(ddev->dev, "device mode is invalid, trying again\n");
+		sdtx_update_device_mode(ddev, SDTX_DEVICE_MODE_DELAY_RECHECK);
+		return;
+	}
+
+	mutex_lock(&ddev->write_lock);
+	clear_bit(SDTX_DEVICE_DIRTY_MODE_BIT, &ddev->flags);
+
+	/* Avoid sending duplicate device-mode events. */
+	if (ddev->state.device_mode == mode) {
+		mutex_unlock(&ddev->write_lock);
+		return;
+	}
+
+	ddev->state.device_mode = mode;
+
+	event.e.length = sizeof(u16);
+	event.e.code = SDTX_EVENT_DEVICE_MODE;
+	event.v = mode;
+
+	sdtx_push_event(ddev, &event.e);
+
+	/* Send SW_TABLET_MODE event. */
+	tablet = mode != SDTX_DEVICE_MODE_LAPTOP;
+	input_report_switch(ddev->mode_switch, SW_TABLET_MODE, tablet);
+	input_sync(ddev->mode_switch);
+
+	mutex_unlock(&ddev->write_lock);
+}
+
+static void sdtx_update_device_mode(struct sdtx_device *ddev, unsigned long delay)
+{
+	schedule_delayed_work(&ddev->mode_work, delay);
+}
+
+/* Must be executed with ddev->write_lock held. */
+static void __sdtx_device_state_update_base(struct sdtx_device *ddev,
+					    struct ssam_bas_base_info info)
+{
+	struct sdtx_base_info_event event;
+
+	lockdep_assert_held(&ddev->write_lock);
+
+	/* Prevent duplicate events. */
+	if (ddev->state.base.state == info.state &&
+	    ddev->state.base.base_id == info.base_id)
+		return;
+
+	ddev->state.base = info;
+
+	event.e.length = sizeof(struct sdtx_base_info);
+	event.e.code = SDTX_EVENT_BASE_CONNECTION;
+	event.v.state = sdtx_translate_base_state(ddev, info.state);
+	event.v.base_id = SDTX_BASE_TYPE_SSH(info.base_id);
+
+	sdtx_push_event(ddev, &event.e);
+}
+
+/* Must be executed with ddev->write_lock held. */
+static void __sdtx_device_state_update_mode(struct sdtx_device *ddev, u8 mode)
+{
+	struct sdtx_status_event event;
+	int tablet;
+
+	/*
+	 * Note: This function must be called after updating the base state
+	 * via __sdtx_device_state_update_base(), as we rely on the updated
+	 * base state value in the validity check below.
+	 */
+
+	lockdep_assert_held(&ddev->write_lock);
+
+	if (sdtx_device_mode_invalid(mode, ddev->state.base.state)) {
+		dev_dbg(ddev->dev, "device mode is invalid, trying again\n");
+		sdtx_update_device_mode(ddev, SDTX_DEVICE_MODE_DELAY_RECHECK);
+		return;
+	}
+
+	/* Prevent duplicate events. */
+	if (ddev->state.device_mode == mode)
+		return;
+
+	ddev->state.device_mode = mode;
+
+	/* Send event. */
+	event.e.length = sizeof(u16);
+	event.e.code = SDTX_EVENT_DEVICE_MODE;
+	event.v = mode;
+
+	sdtx_push_event(ddev, &event.e);
+
+	/* Send SW_TABLET_MODE event. */
+	tablet = mode != SDTX_DEVICE_MODE_LAPTOP;
+	input_report_switch(ddev->mode_switch, SW_TABLET_MODE, tablet);
+	input_sync(ddev->mode_switch);
+}
+
+/* Must be executed with ddev->write_lock held. */
+static void __sdtx_device_state_update_latch(struct sdtx_device *ddev, u8 status)
+{
+	struct sdtx_status_event event;
+
+	lockdep_assert_held(&ddev->write_lock);
+
+	/* Prevent duplicate events. */
+	if (ddev->state.latch_status == status)
+		return;
+
+	ddev->state.latch_status = status;
+
+	event.e.length = sizeof(struct sdtx_base_info);
+	event.e.code = SDTX_EVENT_BASE_CONNECTION;
+	event.v = sdtx_translate_latch_status(ddev, status);
+
+	sdtx_push_event(ddev, &event.e);
+}
+
+static void sdtx_device_state_workfn(struct work_struct *work)
+{
+	struct sdtx_device *ddev = container_of(work, struct sdtx_device, state_work.work);
+	struct ssam_bas_base_info base;
+	u8 mode, latch;
+	int status;
+
+	/* Mark everything as dirty. */
+	set_bit(SDTX_DEVICE_DIRTY_BASE_BIT, &ddev->flags);
+	set_bit(SDTX_DEVICE_DIRTY_MODE_BIT, &ddev->flags);
+	set_bit(SDTX_DEVICE_DIRTY_LATCH_BIT, &ddev->flags);
+
+	/*
+	 * Ensure that the state gets marked as dirty before continuing to
+	 * query it. Necessary to ensure that clear_bit() calls in
+	 * sdtx_notifier() and sdtx_device_mode_workfn() actually clear these
+	 * bits if an event is received while updating the state here.
+	 */
+	smp_mb__after_atomic();
+
+	status = ssam_retry(ssam_bas_get_base, ddev->ctrl, &base);
+	if (status) {
+		dev_err(ddev->dev, "failed to get base state: %d\n", status);
+		return;
+	}
+
+	status = ssam_retry(ssam_bas_get_device_mode, ddev->ctrl, &mode);
+	if (status) {
+		dev_err(ddev->dev, "failed to get device mode: %d\n", status);
+		return;
+	}
+
+	status = ssam_retry(ssam_bas_get_latch_status, ddev->ctrl, &latch);
+	if (status) {
+		dev_err(ddev->dev, "failed to get latch status: %d\n", status);
+		return;
+	}
+
+	mutex_lock(&ddev->write_lock);
+
+	/*
+	 * If the respective dirty-bit has been cleared, an event has been
+	 * received, updating this state. The queried state may thus be out of
+	 * date. At this point, we can safely assume that the state provided
+	 * by the event is either up to date, or we're about to receive
+	 * another event updating it.
+	 */
+
+	if (test_and_clear_bit(SDTX_DEVICE_DIRTY_BASE_BIT, &ddev->flags))
+		__sdtx_device_state_update_base(ddev, base);
+
+	if (test_and_clear_bit(SDTX_DEVICE_DIRTY_MODE_BIT, &ddev->flags))
+		__sdtx_device_state_update_mode(ddev, mode);
+
+	if (test_and_clear_bit(SDTX_DEVICE_DIRTY_LATCH_BIT, &ddev->flags))
+		__sdtx_device_state_update_latch(ddev, latch);
+
+	mutex_unlock(&ddev->write_lock);
+}
+
+static void sdtx_update_device_state(struct sdtx_device *ddev, unsigned long delay)
+{
+	schedule_delayed_work(&ddev->state_work, delay);
+}
+
+
+/* -- Common device initialization. ----------------------------------------- */
+
+static int sdtx_device_init(struct sdtx_device *ddev, struct device *dev,
+			    struct ssam_controller *ctrl)
+{
+	int status, tablet_mode;
+
+	/* Basic initialization. */
+	kref_init(&ddev->kref);
+	init_rwsem(&ddev->lock);
+	ddev->dev = dev;
+	ddev->ctrl = ctrl;
+
+	ddev->mdev.minor = MISC_DYNAMIC_MINOR;
+	ddev->mdev.name = "surface_dtx";
+	ddev->mdev.nodename = "surface/dtx";
+	ddev->mdev.fops = &surface_dtx_fops;
+
+	ddev->notif.base.priority = 1;
+	ddev->notif.base.fn = sdtx_notifier;
+	ddev->notif.event.reg = SSAM_EVENT_REGISTRY_SAM;
+	ddev->notif.event.id.target_category = SSAM_SSH_TC_BAS;
+	ddev->notif.event.id.instance = 0;
+	ddev->notif.event.mask = SSAM_EVENT_MASK_NONE;
+	ddev->notif.event.flags = SSAM_EVENT_SEQUENCED;
+
+	init_waitqueue_head(&ddev->waitq);
+	mutex_init(&ddev->write_lock);
+	init_rwsem(&ddev->client_lock);
+	INIT_LIST_HEAD(&ddev->client_list);
+
+	INIT_DELAYED_WORK(&ddev->mode_work, sdtx_device_mode_workfn);
+	INIT_DELAYED_WORK(&ddev->state_work, sdtx_device_state_workfn);
+
+	/*
+	 * Get current device state. We want to guarantee that events are only
+	 * sent when state actually changes. Thus we cannot use special
+	 * "uninitialized" values, as that would cause problems when manually
+	 * querying the state in surface_dtx_pm_complete(). I.e. we would not
+	 * be able to detect state changes there if no change event has been
+	 * received between driver initialization and first device suspension.
+	 *
+	 * Note that we also need to do this before registering the event
+	 * notifier, as that may access the state values.
+	 */
+	status = ssam_retry(ssam_bas_get_base, ddev->ctrl, &ddev->state.base);
+	if (status)
+		return status;
+
+	status = ssam_retry(ssam_bas_get_device_mode, ddev->ctrl, &ddev->state.device_mode);
+	if (status)
+		return status;
+
+	status = ssam_retry(ssam_bas_get_latch_status, ddev->ctrl, &ddev->state.latch_status);
+	if (status)
+		return status;
+
+	/* Set up tablet mode switch. */
+	ddev->mode_switch = input_allocate_device();
+	if (!ddev->mode_switch)
+		return -ENOMEM;
+
+	ddev->mode_switch->name = "Microsoft Surface DTX Device Mode Switch";
+	ddev->mode_switch->phys = "ssam/01:11:01:00:00/input0";
+	ddev->mode_switch->id.bustype = BUS_HOST;
+	ddev->mode_switch->dev.parent = ddev->dev;
+
+	tablet_mode = (ddev->state.device_mode != SDTX_DEVICE_MODE_LAPTOP);
+	input_set_capability(ddev->mode_switch, EV_SW, SW_TABLET_MODE);
+	input_report_switch(ddev->mode_switch, SW_TABLET_MODE, tablet_mode);
+
+	status = input_register_device(ddev->mode_switch);
+	if (status) {
+		input_free_device(ddev->mode_switch);
+		return status;
+	}
+
+	/* Set up event notifier. */
+	status = ssam_notifier_register(ddev->ctrl, &ddev->notif);
+	if (status)
+		goto err_notif;
+
+	/* Register miscdevice. */
+	status = misc_register(&ddev->mdev);
+	if (status)
+		goto err_mdev;
+
+	/*
+	 * Update device state in case it has changed between getting the
+	 * initial mode and registering the event notifier.
+	 */
+	sdtx_update_device_state(ddev, 0);
+	return 0;
+
+err_notif:
+	ssam_notifier_unregister(ddev->ctrl, &ddev->notif);
+	cancel_delayed_work_sync(&ddev->mode_work);
+err_mdev:
+	input_unregister_device(ddev->mode_switch);
+	return status;
+}
+
+static struct sdtx_device *sdtx_device_create(struct device *dev, struct ssam_controller *ctrl)
+{
+	struct sdtx_device *ddev;
+	int status;
+
+	ddev = kzalloc(sizeof(*ddev), GFP_KERNEL);
+	if (!ddev)
+		return ERR_PTR(-ENOMEM);
+
+	status = sdtx_device_init(ddev, dev, ctrl);
+	if (status) {
+		sdtx_device_put(ddev);
+		return ERR_PTR(status);
+	}
+
+	return ddev;
+}
+
+static void sdtx_device_destroy(struct sdtx_device *ddev)
+{
+	struct sdtx_client *client;
+
+	/*
+	 * Mark device as shut-down. Prevent new clients from being added and
+	 * new operations from being executed.
+	 */
+	set_bit(SDTX_DEVICE_SHUTDOWN_BIT, &ddev->flags);
+
+	/* Disable notifiers, prevent new events from arriving. */
+	ssam_notifier_unregister(ddev->ctrl, &ddev->notif);
+
+	/* Stop mode_work, prevent access to mode_switch. */
+	cancel_delayed_work_sync(&ddev->mode_work);
+
+	/* Stop state_work. */
+	cancel_delayed_work_sync(&ddev->state_work);
+
+	/* With mode_work canceled, we can unregister the mode_switch. */
+	input_unregister_device(ddev->mode_switch);
+
+	/* Wake up async clients. */
+	down_write(&ddev->client_lock);
+	list_for_each_entry(client, &ddev->client_list, node) {
+		kill_fasync(&client->fasync, SIGIO, POLL_HUP);
+	}
+	up_write(&ddev->client_lock);
+
+	/* Wake up blocking clients. */
+	wake_up_interruptible(&ddev->waitq);
+
+	/*
+	 * Wait for clients to finish their current operation. After this, the
+	 * controller and device references are guaranteed to be no longer in
+	 * use.
+	 */
+	down_write(&ddev->lock);
+	ddev->dev = NULL;
+	ddev->ctrl = NULL;
+	up_write(&ddev->lock);
+
+	/* Finally remove the misc-device. */
+	misc_deregister(&ddev->mdev);
+
+	/*
+	 * We're now guaranteed that sdtx_device_open() won't be called any
+	 * more, so we can now drop out reference.
+	 */
+	sdtx_device_put(ddev);
+}
+
+
+/* -- PM ops. --------------------------------------------------------------- */
+
+#ifdef CONFIG_PM_SLEEP
+
+static void surface_dtx_pm_complete(struct device *dev)
+{
+	struct sdtx_device *ddev = dev_get_drvdata(dev);
+
+	/*
+	 * Normally, the EC will store events while suspended (i.e. in
+	 * display-off state) and release them when resumed (i.e. transitioned
+	 * to display-on state). During hibernation, however, the EC will be
+	 * shut down and does not store events. Furthermore, events might be
+	 * dropped during prolonged suspension (it is currently unknown how
+	 * big this event buffer is and how it behaves on overruns).
+	 *
+	 * To prevent any problems, we update the device state here. We do
+	 * this delayed to ensure that any events sent by the EC directly
+	 * after resuming will be handled first. The delay below has been
+	 * chosen (experimentally), so that there should be ample time for
+	 * these events to be handled, before we check and, if necessary,
+	 * update the state.
+	 */
+	sdtx_update_device_state(ddev, msecs_to_jiffies(1000));
+}
+
+static const struct dev_pm_ops surface_dtx_pm_ops = {
+	.complete = surface_dtx_pm_complete,
+};
+
+#else /* CONFIG_PM_SLEEP */
+
+static const struct dev_pm_ops surface_dtx_pm_ops = {};
+
+#endif /* CONFIG_PM_SLEEP */
+
+
+/* -- Platform driver. ------------------------------------------------------ */
+
+static int surface_dtx_platform_probe(struct platform_device *pdev)
+{
+	struct ssam_controller *ctrl;
+	struct sdtx_device *ddev;
+
+	/* Link to EC. */
+	ctrl = ssam_client_bind(&pdev->dev);
+	if (IS_ERR(ctrl))
+		return PTR_ERR(ctrl) == -ENODEV ? -EPROBE_DEFER : PTR_ERR(ctrl);
+
+	ddev = sdtx_device_create(&pdev->dev, ctrl);
+	if (IS_ERR(ddev))
+		return PTR_ERR(ddev);
+
+	platform_set_drvdata(pdev, ddev);
+	return 0;
+}
+
+static int surface_dtx_platform_remove(struct platform_device *pdev)
+{
+	sdtx_device_destroy(platform_get_drvdata(pdev));
+	return 0;
+}
+
+static const struct acpi_device_id surface_dtx_acpi_match[] = {
+	{ "MSHW0133", 0 },
+	{ },
+};
+MODULE_DEVICE_TABLE(acpi, surface_dtx_acpi_match);
+
+static struct platform_driver surface_dtx_platform_driver = {
+	.probe = surface_dtx_platform_probe,
+	.remove = surface_dtx_platform_remove,
+	.driver = {
+		.name = "surface_dtx_pltf",
+		.acpi_match_table = surface_dtx_acpi_match,
+		.pm = &surface_dtx_pm_ops,
+		.probe_type = PROBE_PREFER_ASYNCHRONOUS,
+	},
+};
+module_platform_driver(surface_dtx_platform_driver);
+
+MODULE_AUTHOR("Maximilian Luz <luzmaximilian@gmail.com>");
+MODULE_DESCRIPTION("Detachment-system driver for Surface System Aggregator Module");
+MODULE_LICENSE("GPL");
diff --git a/include/uapi/linux/surface_aggregator/dtx.h b/include/uapi/linux/surface_aggregator/dtx.h
new file mode 100644
index 000000000000..0833aab0d819
--- /dev/null
+++ b/include/uapi/linux/surface_aggregator/dtx.h
@@ -0,0 +1,146 @@
+/* SPDX-License-Identifier: GPL-2.0+ WITH Linux-syscall-note */
+/*
+ * Surface DTX (clipboard detachment system driver) user-space interface.
+ *
+ * Definitions, structs, and IOCTLs for the /dev/surface/dtx misc device. This
+ * device allows user-space to control the clipboard detachment process on
+ * Surface Book series devices.
+ *
+ * Copyright (C) 2020-2021 Maximilian Luz <luzmaximilian@gmail.com>
+ */
+
+#ifndef _UAPI_LINUX_SURFACE_AGGREGATOR_DTX_H
+#define _UAPI_LINUX_SURFACE_AGGREGATOR_DTX_H
+
+#include <linux/ioctl.h>
+#include <linux/types.h>
+
+/* Status/error categories */
+#define SDTX_CATEGORY_STATUS		0x0000
+#define SDTX_CATEGORY_RUNTIME_ERROR	0x1000
+#define SDTX_CATEGORY_HARDWARE_ERROR	0x2000
+#define SDTX_CATEGORY_UNKNOWN		0xf000
+
+#define SDTX_CATEGORY_MASK		0xf000
+#define SDTX_CATEGORY(value)		((value) & SDTX_CATEGORY_MASK)
+
+#define SDTX_STATUS(code)		((code) | SDTX_CATEGORY_STATUS)
+#define SDTX_ERR_RT(code)		((code) | SDTX_CATEGORY_RUNTIME_ERROR)
+#define SDTX_ERR_HW(code)		((code) | SDTX_CATEGORY_HARDWARE_ERROR)
+#define SDTX_UNKNOWN(code)		((code) | SDTX_CATEGORY_UNKNOWN)
+
+#define SDTX_SUCCESS(value)		(SDTX_CATEGORY(value) == SDTX_CATEGORY_STATUS)
+
+/* Latch status values */
+#define SDTX_LATCH_CLOSED		SDTX_STATUS(0x00)
+#define SDTX_LATCH_OPENED		SDTX_STATUS(0x01)
+
+/* Base state values */
+#define SDTX_BASE_DETACHED		SDTX_STATUS(0x00)
+#define SDTX_BASE_ATTACHED		SDTX_STATUS(0x01)
+
+/* Runtime errors (non-critical) */
+#define SDTX_DETACH_NOT_FEASIBLE	SDTX_ERR_RT(0x01)
+#define SDTX_DETACH_TIMEDOUT		SDTX_ERR_RT(0x02)
+
+/* Hardware errors (critical) */
+#define SDTX_ERR_FAILED_TO_OPEN		SDTX_ERR_HW(0x01)
+#define SDTX_ERR_FAILED_TO_REMAIN_OPEN	SDTX_ERR_HW(0x02)
+#define SDTX_ERR_FAILED_TO_CLOSE	SDTX_ERR_HW(0x03)
+
+/* Base types */
+#define SDTX_DEVICE_TYPE_HID		0x0100
+#define SDTX_DEVICE_TYPE_SSH		0x0200
+
+#define SDTX_DEVICE_TYPE_MASK		0x0f00
+#define SDTX_DEVICE_TYPE(value)		((value) & SDTX_DEVICE_TYPE_MASK)
+
+#define SDTX_BASE_TYPE_HID(id)		((id) | SDTX_DEVICE_TYPE_HID)
+#define SDTX_BASE_TYPE_SSH(id)		((id) | SDTX_DEVICE_TYPE_SSH)
+
+/**
+ * enum sdtx_device_mode - Mode describing how (and if) the clipboard is
+ * attached to the base of the device.
+ * @SDTX_DEVICE_MODE_TABLET: The clipboard is detached from the base and the
+ *                           device operates as tablet.
+ * @SDTX_DEVICE_MODE_LAPTOP: The clipboard is attached normally to the base
+ *                           and the device operates as laptop.
+ * @SDTX_DEVICE_MODE_STUDIO: The clipboard is attached to the base in reverse.
+ *                           The device operates as tablet with keyboard and
+ *                           touchpad deactivated, however, the base battery
+ *                           and, if present in the specific device model, dGPU
+ *                           are available to the system.
+ */
+enum sdtx_device_mode {
+	SDTX_DEVICE_MODE_TABLET		= 0x00,
+	SDTX_DEVICE_MODE_LAPTOP		= 0x01,
+	SDTX_DEVICE_MODE_STUDIO		= 0x02,
+};
+
+/**
+ * struct sdtx_event - Event provided by reading from the DTX device file.
+ * @length: Length of the event payload, in bytes.
+ * @code:   Event code, detailing what type of event this is.
+ * @data:   Payload of the event, containing @length bytes.
+ *
+ * See &enum sdtx_event_code for currently valid event codes.
+ */
+struct sdtx_event {
+	__u16 length;
+	__u16 code;
+	__u8 data[];
+} __attribute__((__packed__));
+
+/**
+ * enum sdtx_event_code - Code describing the type of an event.
+ * @SDTX_EVENT_REQUEST:         Detachment request event type.
+ * @SDTX_EVENT_CANCEL:          Cancel detachment process event type.
+ * @SDTX_EVENT_BASE_CONNECTION: Base/clipboard connection change event type.
+ * @SDTX_EVENT_LATCH_STATUS:    Latch status change event type.
+ * @SDTX_EVENT_DEVICE_MODE:     Device mode change event type.
+ *
+ * Used in &struct sdtx_event to describe the type of the event. Further event
+ * codes are reserved for future use. Any event parser should be able to
+ * gracefully handle unknown events, i.e. by simply skipping them.
+ *
+ * Consult the DTX user-space interface documentation for details regarding
+ * the individual event types.
+ */
+enum sdtx_event_code {
+	SDTX_EVENT_REQUEST		= 1,
+	SDTX_EVENT_CANCEL		= 2,
+	SDTX_EVENT_BASE_CONNECTION	= 3,
+	SDTX_EVENT_LATCH_STATUS		= 4,
+	SDTX_EVENT_DEVICE_MODE		= 5,
+};
+
+/**
+ * struct sdtx_base_info - Describes if and what type of base is connected.
+ * @state:   The state of the connection. Valid values are %SDTX_BASE_DETACHED,
+ *           %SDTX_BASE_ATTACHED, and %SDTX_DETACH_NOT_FEASIBLE (in case a base
+ *           is attached but low clipboard battery prevents detachment). Other
+ *           values are currently reserved.
+ * @base_id: The type of base connected. Zero if no base is connected.
+ */
+struct sdtx_base_info {
+	__u16 state;
+	__u16 base_id;
+} __attribute__((__packed__));
+
+/* IOCTLs */
+#define SDTX_IOCTL_EVENTS_ENABLE	_IO(0xa5, 0x21)
+#define SDTX_IOCTL_EVENTS_DISABLE	_IO(0xa5, 0x22)
+
+#define SDTX_IOCTL_LATCH_LOCK		_IO(0xa5, 0x23)
+#define SDTX_IOCTL_LATCH_UNLOCK		_IO(0xa5, 0x24)
+
+#define SDTX_IOCTL_LATCH_REQUEST	_IO(0xa5, 0x25)
+#define SDTX_IOCTL_LATCH_CONFIRM	_IO(0xa5, 0x26)
+#define SDTX_IOCTL_LATCH_HEARTBEAT	_IO(0xa5, 0x27)
+#define SDTX_IOCTL_LATCH_CANCEL		_IO(0xa5, 0x28)
+
+#define SDTX_IOCTL_GET_BASE_INFO	_IOR(0xa5, 0x29, struct sdtx_base_info)
+#define SDTX_IOCTL_GET_DEVICE_MODE	_IOR(0xa5, 0x2a, __u16)
+#define SDTX_IOCTL_GET_LATCH_STATUS	_IOR(0xa5, 0x2b, __u16)
+
+#endif /* _UAPI_LINUX_SURFACE_AGGREGATOR_DTX_H */
-- 
cgit v1.2.3-71-gd317


From 3093c3c7c136458af692d5c3d309a66c3c12d9f4 Mon Sep 17 00:00:00 2001
From: Arnaud Pouliquen <arnaud.pouliquen@foss.st.com>
Date: Thu, 11 Mar 2021 15:04:09 +0100
Subject: rpmsg: Move RPMSG_ADDR_ANY in user API

As the RPMSG_ADDR_ANY is a valid src or dst address that can be set by
user applications, migrate its definition in user API.

Reviewed-by: Bjorn Andersson <bjorn.andersson@linaro.org>
Reviewed-by: Mathieu Poirier <mathieu.poirier@linaro.org>
Signed-off-by: Arnaud Pouliquen <arnaud.pouliquen@foss.st.com>
Link: https://lore.kernel.org/r/20210311140413.31725-3-arnaud.pouliquen@foss.st.com
Signed-off-by: Bjorn Andersson <bjorn.andersson@linaro.org>
---
 include/linux/rpmsg.h      | 3 +--
 include/uapi/linux/rpmsg.h | 2 ++
 2 files changed, 3 insertions(+), 2 deletions(-)

(limited to 'include/uapi/linux')

diff --git a/include/linux/rpmsg.h b/include/linux/rpmsg.h
index a5db828b2420..d97dcd049f18 100644
--- a/include/linux/rpmsg.h
+++ b/include/linux/rpmsg.h
@@ -18,8 +18,7 @@
 #include <linux/mutex.h>
 #include <linux/poll.h>
 #include <linux/rpmsg/byteorder.h>
-
-#define RPMSG_ADDR_ANY		0xFFFFFFFF
+#include <uapi/linux/rpmsg.h>
 
 struct rpmsg_device;
 struct rpmsg_endpoint;
diff --git a/include/uapi/linux/rpmsg.h b/include/uapi/linux/rpmsg.h
index e14c6dab4223..5e00748da319 100644
--- a/include/uapi/linux/rpmsg.h
+++ b/include/uapi/linux/rpmsg.h
@@ -9,6 +9,8 @@
 #include <linux/ioctl.h>
 #include <linux/types.h>
 
+#define RPMSG_ADDR_ANY		0xFFFFFFFF
+
 /**
  * struct rpmsg_endpoint_info - endpoint info representation
  * @name: name of service
-- 
cgit v1.2.3-71-gd317


From 809328b40cfb152f75541aa3dcbbe4903098963b Mon Sep 17 00:00:00 2001
From: Arnaud Pouliquen <arnaud.pouliquen@foss.st.com>
Date: Thu, 11 Mar 2021 15:04:10 +0100
Subject: rpmsg: Add short description of the IOCTL defined in UAPI.

Add a description of the IOCTLs and provide information on the default
value of the source and destination addresses.

Reviewed-by: Bjorn Andersson <bjorn.andersson@linaro.org>
Reviewed-by: Mathieu Poirier <mathieu.poirier@linaro.org>
Signed-off-by: Arnaud Pouliquen <arnaud.pouliquen@foss.st.com>
Link: https://lore.kernel.org/r/20210311140413.31725-4-arnaud.pouliquen@foss.st.com
Signed-off-by: Bjorn Andersson <bjorn.andersson@linaro.org>
---
 include/uapi/linux/rpmsg.h | 11 +++++++++--
 1 file changed, 9 insertions(+), 2 deletions(-)

(limited to 'include/uapi/linux')

diff --git a/include/uapi/linux/rpmsg.h b/include/uapi/linux/rpmsg.h
index 5e00748da319..f5ca8740f3fb 100644
--- a/include/uapi/linux/rpmsg.h
+++ b/include/uapi/linux/rpmsg.h
@@ -14,8 +14,8 @@
 /**
  * struct rpmsg_endpoint_info - endpoint info representation
  * @name: name of service
- * @src: local address
- * @dst: destination address
+ * @src: local address. To set to RPMSG_ADDR_ANY if not used.
+ * @dst: destination address. To set to RPMSG_ADDR_ANY if not used.
  */
 struct rpmsg_endpoint_info {
 	char name[32];
@@ -23,7 +23,14 @@ struct rpmsg_endpoint_info {
 	__u32 dst;
 };
 
+/**
+ * Instantiate a new rmpsg char device endpoint.
+ */
 #define RPMSG_CREATE_EPT_IOCTL	_IOW(0xb5, 0x1, struct rpmsg_endpoint_info)
+
+/**
+ * Destroy a rpmsg char device endpoint created by the RPMSG_CREATE_EPT_IOCTL.
+ */
 #define RPMSG_DESTROY_EPT_IOCTL	_IO(0xb5, 0x2)
 
 #endif
-- 
cgit v1.2.3-71-gd317


From 6a154ec9ef6762c774cd2b50215c7a8f0f08a862 Mon Sep 17 00:00:00 2001
From: Pawel Laszczak <pawell@cadence.com>
Date: Mon, 15 Mar 2021 08:17:48 +0100
Subject: usb: webcam: Invalid size of Processing Unit Descriptor

According with USB Device Class Definition for Video Device the
Processing Unit Descriptor bLength should be 12 (10 + bmControlSize),
but it has 11.

Invalid length caused that Processing Unit Descriptor Test Video form
CV tool failed. To fix this issue patch adds bmVideoStandards into
uvc_processing_unit_descriptor structure.

The bmVideoStandards field was added in UVC 1.1 and it wasn't part of
UVC 1.0a.

Reviewed-by: Laurent Pinchart <laurent.pinchart@ideasonboard.com>
Signed-off-by: Pawel Laszczak <pawell@cadence.com>
Reviewed-by: Peter Chen <peter.chen@kernel.org>
Link: https://lore.kernel.org/r/20210315071748.29706-1-pawell@gli-login.cadence.com
Signed-off-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
---
 drivers/usb/gadget/function/f_uvc.c | 1 +
 drivers/usb/gadget/legacy/webcam.c  | 1 +
 include/uapi/linux/usb/video.h      | 3 ++-
 3 files changed, 4 insertions(+), 1 deletion(-)

(limited to 'include/uapi/linux')

diff --git a/drivers/usb/gadget/function/f_uvc.c b/drivers/usb/gadget/function/f_uvc.c
index ed77a126a74f..f48a00e49794 100644
--- a/drivers/usb/gadget/function/f_uvc.c
+++ b/drivers/usb/gadget/function/f_uvc.c
@@ -822,6 +822,7 @@ static struct usb_function_instance *uvc_alloc_inst(void)
 	pd->bmControls[0]		= 1;
 	pd->bmControls[1]		= 0;
 	pd->iProcessing			= 0;
+	pd->bmVideoStandards		= 0;
 
 	od = &opts->uvc_output_terminal;
 	od->bLength			= UVC_DT_OUTPUT_TERMINAL_SIZE;
diff --git a/drivers/usb/gadget/legacy/webcam.c b/drivers/usb/gadget/legacy/webcam.c
index 3a61de4bb2b1..94e22867da1d 100644
--- a/drivers/usb/gadget/legacy/webcam.c
+++ b/drivers/usb/gadget/legacy/webcam.c
@@ -125,6 +125,7 @@ static const struct uvc_processing_unit_descriptor uvc_processing = {
 	.bmControls[0]		= 1,
 	.bmControls[1]		= 0,
 	.iProcessing		= 0,
+	.bmVideoStandards	= 0,
 };
 
 static const struct uvc_output_terminal_descriptor uvc_output_terminal = {
diff --git a/include/uapi/linux/usb/video.h b/include/uapi/linux/usb/video.h
index d854cb19c42c..bfdae12cdacf 100644
--- a/include/uapi/linux/usb/video.h
+++ b/include/uapi/linux/usb/video.h
@@ -302,9 +302,10 @@ struct uvc_processing_unit_descriptor {
 	__u8   bControlSize;
 	__u8   bmControls[2];
 	__u8   iProcessing;
+	__u8   bmVideoStandards;
 } __attribute__((__packed__));
 
-#define UVC_DT_PROCESSING_UNIT_SIZE(n)			(9+(n))
+#define UVC_DT_PROCESSING_UNIT_SIZE(n)			(10+(n))
 
 /* 3.7.2.6. Extension Unit Descriptor */
 struct uvc_extension_unit_descriptor {
-- 
cgit v1.2.3-71-gd317


From bb0f61533dfd6aa815a2719720c77d13f840b683 Mon Sep 17 00:00:00 2001
From: Jacob Pan <jacob.jun.pan@linux.intel.com>
Date: Tue, 2 Mar 2021 02:13:58 -0800
Subject: iommu/vt-d: Enable write protect propagation from guest

Write protect bit, when set, inhibits supervisor writes to the read-only
pages. In guest supervisor shared virtual addressing (SVA), write-protect
should be honored upon guest bind supervisor PASID request.

This patch extends the VT-d portion of the IOMMU UAPI to include WP bit.
WPE bit of the  supervisor PASID entry will be set to match CPU CR0.WP bit.

Signed-off-by: Sanjay Kumar <sanjay.k.kumar@intel.com>
Signed-off-by: Jacob Pan <jacob.jun.pan@linux.intel.com>
Acked-by: Lu Baolu <baolu.lu@linux.intel.com>
Link: https://lore.kernel.org/r/1614680040-1989-3-git-send-email-jacob.jun.pan@linux.intel.com
Signed-off-by: Joerg Roedel <jroedel@suse.de>
---
 drivers/iommu/intel/pasid.c | 3 +++
 include/uapi/linux/iommu.h  | 3 ++-
 2 files changed, 5 insertions(+), 1 deletion(-)

(limited to 'include/uapi/linux')

diff --git a/drivers/iommu/intel/pasid.c b/drivers/iommu/intel/pasid.c
index 629e3aee44fe..0bf7e0a76890 100644
--- a/drivers/iommu/intel/pasid.c
+++ b/drivers/iommu/intel/pasid.c
@@ -732,6 +732,9 @@ intel_pasid_setup_bind_data(struct intel_iommu *iommu, struct pasid_entry *pte,
 			return -EINVAL;
 		}
 		pasid_set_sre(pte);
+		/* Enable write protect WP if guest requested */
+		if (pasid_data->flags & IOMMU_SVA_VTD_GPASID_WPE)
+			pasid_set_wpe(pte);
 	}
 
 	if (pasid_data->flags & IOMMU_SVA_VTD_GPASID_EAFE) {
diff --git a/include/uapi/linux/iommu.h b/include/uapi/linux/iommu.h
index e1d9e75f2c94..59178fc229ca 100644
--- a/include/uapi/linux/iommu.h
+++ b/include/uapi/linux/iommu.h
@@ -288,7 +288,8 @@ struct iommu_gpasid_bind_data_vtd {
 #define IOMMU_SVA_VTD_GPASID_PWT	(1 << 3) /* page-level write through */
 #define IOMMU_SVA_VTD_GPASID_EMTE	(1 << 4) /* extended mem type enable */
 #define IOMMU_SVA_VTD_GPASID_CD		(1 << 5) /* PASID-level cache disable */
-#define IOMMU_SVA_VTD_GPASID_LAST	(1 << 6)
+#define IOMMU_SVA_VTD_GPASID_WPE	(1 << 6) /* Write protect enable */
+#define IOMMU_SVA_VTD_GPASID_LAST	(1 << 7)
 	__u64 flags;
 	__u32 pat;
 	__u32 emt;
-- 
cgit v1.2.3-71-gd317


From f998d7d545a2248faf5a4311240941dfe813eedc Mon Sep 17 00:00:00 2001
From: "Darrick J. Wong" <djwong@kernel.org>
Date: Wed, 17 Mar 2021 21:17:36 -0700
Subject: quota: report warning limits for realtime space quotas

Report the number of warnings that a user will get for exceeding the
soft limit of a realtime volume.  This plugs a gap needed before we
can land a realtime quota implementation for XFS in the next cycle.

Link: https://lore.kernel.org/r/20210318041736.GB22094@magnolia
Signed-off-by: Darrick J. Wong <djwong@kernel.org>
Signed-off-by: Jan Kara <jack@suse.cz>
---
 fs/quota/quota.c               | 1 +
 include/uapi/linux/dqblk_xfs.h | 5 ++++-
 2 files changed, 5 insertions(+), 1 deletion(-)

(limited to 'include/uapi/linux')

diff --git a/fs/quota/quota.c b/fs/quota/quota.c
index f7b4b66491fc..05e4bd9ab6d6 100644
--- a/fs/quota/quota.c
+++ b/fs/quota/quota.c
@@ -472,6 +472,7 @@ static int quota_getstatev(struct super_block *sb, int type,
 	fqs->qs_rtbtimelimit = state.s_state[type].rt_spc_timelimit;
 	fqs->qs_bwarnlimit = state.s_state[type].spc_warnlimit;
 	fqs->qs_iwarnlimit = state.s_state[type].ino_warnlimit;
+	fqs->qs_rtbwarnlimit = state.s_state[type].rt_spc_warnlimit;
 
 	/* Inodes may be allocated even if inactive; copy out if present */
 	if (state.s_state[USRQUOTA].ino) {
diff --git a/include/uapi/linux/dqblk_xfs.h b/include/uapi/linux/dqblk_xfs.h
index c71d909addda..8cda3e62e0e7 100644
--- a/include/uapi/linux/dqblk_xfs.h
+++ b/include/uapi/linux/dqblk_xfs.h
@@ -219,7 +219,10 @@ struct fs_quota_statv {
 	__s32			qs_rtbtimelimit;/* limit for rt blks timer */
 	__u16			qs_bwarnlimit;	/* limit for num warnings */
 	__u16			qs_iwarnlimit;	/* limit for num warnings */
-	__u64			qs_pad2[8];	/* for future proofing */
+	__u16			qs_rtbwarnlimit;/* limit for rt blks warnings */
+	__u16			qs_pad3;
+	__u32			qs_pad4;
+	__u64			qs_pad2[7];	/* for future proofing */
 };
 
 #endif	/* _LINUX_DQBLK_XFS_H */
-- 
cgit v1.2.3-71-gd317


From 9f3d1056ea54170bfcc678ca69f802492cc69f93 Mon Sep 17 00:00:00 2001
From: Stanimir Varbanov <stanimir.varbanov@linaro.org>
Date: Mon, 9 Nov 2020 18:35:38 +0100
Subject: media: v4l2-ctrl: Make display delay and display enable std controls

Make display delay and display delay enable MFC controls standard v4l
controls. This will allow reuse of the controls for other decoder
drivers. Also the new proposed controls are now codec agnostic because
they could be used for any codec.

Acked-by: Hans Verkuil <hverkuil-cisco@xs4all.nl>
Signed-off-by: Stanimir Varbanov <stanimir.varbanov@linaro.org>
Signed-off-by: Mauro Carvalho Chehab <mchehab+huawei@kernel.org>
---
 Documentation/userspace-api/media/v4l/ext-ctrls-codec.rst | 15 +++++++++++++++
 drivers/media/v4l2-core/v4l2-ctrls.c                      |  4 ++++
 include/uapi/linux/v4l2-controls.h                        |  3 +++
 3 files changed, 22 insertions(+)

(limited to 'include/uapi/linux')

diff --git a/Documentation/userspace-api/media/v4l/ext-ctrls-codec.rst b/Documentation/userspace-api/media/v4l/ext-ctrls-codec.rst
index 7ee8ae073b91..7e8ab168571d 100644
--- a/Documentation/userspace-api/media/v4l/ext-ctrls-codec.rst
+++ b/Documentation/userspace-api/media/v4l/ext-ctrls-codec.rst
@@ -679,6 +679,21 @@ enum v4l2_mpeg_video_frame_skip_mode -
     otherwise the decoder expects a single frame in per buffer.
     Applicable to the decoder, all codecs.
 
+``V4L2_CID_MPEG_VIDEO_DEC_DISPLAY_DELAY_ENABLE (boolean)``
+    If the display delay is enabled then the decoder is forced to return
+    a CAPTURE buffer (decoded frame) after processing a certain number
+    of OUTPUT buffers. The delay can be set through
+    ``V4L2_CID_MPEG_VIDEO_DEC_DISPLAY_DELAY``. This
+    feature can be used for example for generating thumbnails of videos.
+    Applicable to the decoder.
+
+``V4L2_CID_MPEG_VIDEO_DEC_DISPLAY_DELAY (integer)``
+    Display delay value for decoder. The decoder is forced to
+    return a decoded frame after the set 'display delay' number of
+    frames. If this number is low it may result in frames returned out
+    of display order, in addition the hardware may still be using the
+    returned buffer as a reference picture for subsequent frames.
+
 ``V4L2_CID_MPEG_VIDEO_H264_VUI_SAR_ENABLE (boolean)``
     Enable writing sample aspect ratio in the Video Usability
     Information. Applicable to the H264 encoder.
diff --git a/drivers/media/v4l2-core/v4l2-ctrls.c b/drivers/media/v4l2-core/v4l2-ctrls.c
index 016cf6204cbb..524e7abeba89 100644
--- a/drivers/media/v4l2-core/v4l2-ctrls.c
+++ b/drivers/media/v4l2-core/v4l2-ctrls.c
@@ -874,6 +874,8 @@ const char *v4l2_ctrl_get_name(u32 id)
 	case V4L2_CID_MPEG_VIDEO_HEADER_MODE:			return "Sequence Header Mode";
 	case V4L2_CID_MPEG_VIDEO_MAX_REF_PIC:			return "Max Number of Reference Pics";
 	case V4L2_CID_MPEG_VIDEO_FRAME_SKIP_MODE:		return "Frame Skip Mode";
+	case V4L2_CID_MPEG_VIDEO_DEC_DISPLAY_DELAY:		return "Display Delay";
+	case V4L2_CID_MPEG_VIDEO_DEC_DISPLAY_DELAY_ENABLE:	return "Display Delay Enable";
 	case V4L2_CID_MPEG_VIDEO_H263_I_FRAME_QP:		return "H263 I-Frame QP Value";
 	case V4L2_CID_MPEG_VIDEO_H263_P_FRAME_QP:		return "H263 P-Frame QP Value";
 	case V4L2_CID_MPEG_VIDEO_H263_B_FRAME_QP:		return "H263 B-Frame QP Value";
@@ -1241,6 +1243,7 @@ void v4l2_ctrl_fill(u32 id, const char **name, enum v4l2_ctrl_type *type,
 	case V4L2_CID_FLASH_READY:
 	case V4L2_CID_MPEG_VIDEO_DECODER_MPEG4_DEBLOCK_FILTER:
 	case V4L2_CID_MPEG_VIDEO_DECODER_SLICE_INTERFACE:
+	case V4L2_CID_MPEG_VIDEO_DEC_DISPLAY_DELAY_ENABLE:
 	case V4L2_CID_MPEG_VIDEO_FRAME_RC_ENABLE:
 	case V4L2_CID_MPEG_VIDEO_MB_RC_ENABLE:
 	case V4L2_CID_MPEG_VIDEO_H264_8X8_TRANSFORM:
@@ -1276,6 +1279,7 @@ void v4l2_ctrl_fill(u32 id, const char **name, enum v4l2_ctrl_type *type,
 		break;
 	case V4L2_CID_MPEG_VIDEO_MV_H_SEARCH_RANGE:
 	case V4L2_CID_MPEG_VIDEO_MV_V_SEARCH_RANGE:
+	case V4L2_CID_MPEG_VIDEO_DEC_DISPLAY_DELAY:
 		*type = V4L2_CTRL_TYPE_INTEGER;
 		break;
 	case V4L2_CID_MPEG_VIDEO_FORCE_KEY_FRAME:
diff --git a/include/uapi/linux/v4l2-controls.h b/include/uapi/linux/v4l2-controls.h
index 61441c250239..987fbd946e4c 100644
--- a/include/uapi/linux/v4l2-controls.h
+++ b/include/uapi/linux/v4l2-controls.h
@@ -797,6 +797,9 @@ enum v4l2_mpeg_video_frame_skip_mode {
 #define V4L2_CID_MPEG_VIDEO_HEVC_B_FRAME_MIN_QP        (V4L2_CID_CODEC_BASE + 651)
 #define V4L2_CID_MPEG_VIDEO_HEVC_B_FRAME_MAX_QP        (V4L2_CID_CODEC_BASE + 652)
 
+#define V4L2_CID_MPEG_VIDEO_DEC_DISPLAY_DELAY		(V4L2_CID_CODEC_BASE + 653)
+#define V4L2_CID_MPEG_VIDEO_DEC_DISPLAY_DELAY_ENABLE	(V4L2_CID_CODEC_BASE + 654)
+
 /*  MPEG-class control IDs specific to the CX2341x driver as defined by V4L2 */
 #define V4L2_CID_CODEC_CX2341X_BASE				(V4L2_CTRL_CLASS_CODEC | 0x1000)
 #define V4L2_CID_MPEG_CX2341X_VIDEO_SPATIAL_FILTER_MODE		(V4L2_CID_CODEC_CX2341X_BASE+0)
-- 
cgit v1.2.3-71-gd317


From f2bf1bcb191b15c0ac135af2651ebed6c017bcb0 Mon Sep 17 00:00:00 2001
From: Stanimir Varbanov <stanimir.varbanov@linaro.org>
Date: Tue, 24 Nov 2020 01:01:37 +0100
Subject: media: v4l2-ctrls: Add control for AUD generation

Add a control to enable inserting of AUD NALU into encoded
bitstream.

Reviewed-by: Hans Verkuil <hverkuil-cisco@xs4all.nl>
Signed-off-by: Stanimir Varbanov <stanimir.varbanov@linaro.org>
Signed-off-by: Mauro Carvalho Chehab <mchehab+huawei@kernel.org>
---
 Documentation/userspace-api/media/v4l/ext-ctrls-codec.rst | 5 +++++
 drivers/media/v4l2-core/v4l2-ctrls.c                      | 2 ++
 include/uapi/linux/v4l2-controls.h                        | 1 +
 3 files changed, 8 insertions(+)

(limited to 'include/uapi/linux')

diff --git a/Documentation/userspace-api/media/v4l/ext-ctrls-codec.rst b/Documentation/userspace-api/media/v4l/ext-ctrls-codec.rst
index 86b23caf4be1..9963c1c9a4d5 100644
--- a/Documentation/userspace-api/media/v4l/ext-ctrls-codec.rst
+++ b/Documentation/userspace-api/media/v4l/ext-ctrls-codec.rst
@@ -694,6 +694,11 @@ enum v4l2_mpeg_video_frame_skip_mode -
     of display order, in addition the hardware may still be using the
     returned buffer as a reference picture for subsequent frames.
 
+``V4L2_CID_MPEG_VIDEO_AU_DELIMITER (boolean)``
+    If enabled then, AUD (Access Unit Delimiter) NALUs will be generated.
+    That could be useful to find the start of a frame without having to
+    fully parse each NALU. Applicable to the H264 and HEVC encoders.
+
 ``V4L2_CID_MPEG_VIDEO_H264_VUI_SAR_ENABLE (boolean)``
     Enable writing sample aspect ratio in the Video Usability
     Information. Applicable to the H264 encoder.
diff --git a/drivers/media/v4l2-core/v4l2-ctrls.c b/drivers/media/v4l2-core/v4l2-ctrls.c
index 524e7abeba89..ca50e21e2838 100644
--- a/drivers/media/v4l2-core/v4l2-ctrls.c
+++ b/drivers/media/v4l2-core/v4l2-ctrls.c
@@ -876,6 +876,7 @@ const char *v4l2_ctrl_get_name(u32 id)
 	case V4L2_CID_MPEG_VIDEO_FRAME_SKIP_MODE:		return "Frame Skip Mode";
 	case V4L2_CID_MPEG_VIDEO_DEC_DISPLAY_DELAY:		return "Display Delay";
 	case V4L2_CID_MPEG_VIDEO_DEC_DISPLAY_DELAY_ENABLE:	return "Display Delay Enable";
+	case V4L2_CID_MPEG_VIDEO_AU_DELIMITER:			return "Generate Access Unit Delimiters";
 	case V4L2_CID_MPEG_VIDEO_H263_I_FRAME_QP:		return "H263 I-Frame QP Value";
 	case V4L2_CID_MPEG_VIDEO_H263_P_FRAME_QP:		return "H263 P-Frame QP Value";
 	case V4L2_CID_MPEG_VIDEO_H263_B_FRAME_QP:		return "H263 B-Frame QP Value";
@@ -1250,6 +1251,7 @@ void v4l2_ctrl_fill(u32 id, const char **name, enum v4l2_ctrl_type *type,
 	case V4L2_CID_MPEG_VIDEO_H264_VUI_SAR_ENABLE:
 	case V4L2_CID_MPEG_VIDEO_MPEG4_QPEL:
 	case V4L2_CID_MPEG_VIDEO_REPEAT_SEQ_HEADER:
+	case V4L2_CID_MPEG_VIDEO_AU_DELIMITER:
 	case V4L2_CID_WIDE_DYNAMIC_RANGE:
 	case V4L2_CID_IMAGE_STABILIZATION:
 	case V4L2_CID_RDS_RECEPTION:
diff --git a/include/uapi/linux/v4l2-controls.h b/include/uapi/linux/v4l2-controls.h
index 987fbd946e4c..6f8c08507bf4 100644
--- a/include/uapi/linux/v4l2-controls.h
+++ b/include/uapi/linux/v4l2-controls.h
@@ -428,6 +428,7 @@ enum v4l2_mpeg_video_multi_slice_mode {
 #define V4L2_CID_MPEG_VIDEO_MV_V_SEARCH_RANGE		(V4L2_CID_CODEC_BASE+228)
 #define V4L2_CID_MPEG_VIDEO_FORCE_KEY_FRAME		(V4L2_CID_CODEC_BASE+229)
 #define V4L2_CID_MPEG_VIDEO_BASELAYER_PRIORITY_ID	(V4L2_CID_CODEC_BASE+230)
+#define V4L2_CID_MPEG_VIDEO_AU_DELIMITER		(V4L2_CID_CODEC_BASE+231)
 
 /* CIDs for the MPEG-2 Part 2 (H.262) codec */
 #define V4L2_CID_MPEG_VIDEO_MPEG2_LEVEL			(V4L2_CID_CODEC_BASE+270)
-- 
cgit v1.2.3-71-gd317


From 1fb03333eb926e315f9850098b5a3361832f07cb Mon Sep 17 00:00:00 2001
From: Ezequiel Garcia <ezequiel@collabora.com>
Date: Thu, 4 Mar 2021 15:07:51 +0100
Subject: media: uapi: Move parsed VP8 pixel format out of staging

Since we are ready to stabilize the VP8 stateless API,
move the parsed VP8 pixel format.

Signed-off-by: Ezequiel Garcia <ezequiel@collabora.com>
Signed-off-by: Hans Verkuil <hverkuil-cisco@xs4all.nl>
Signed-off-by: Mauro Carvalho Chehab <mchehab+huawei@kernel.org>
---
 Documentation/userspace-api/media/v4l/pixfmt-compressed.rst | 5 -----
 include/media/vp8-ctrls.h                                   | 2 --
 include/uapi/linux/videodev2.h                              | 1 +
 3 files changed, 1 insertion(+), 7 deletions(-)

(limited to 'include/uapi/linux')

diff --git a/Documentation/userspace-api/media/v4l/pixfmt-compressed.rst b/Documentation/userspace-api/media/v4l/pixfmt-compressed.rst
index 7341c9b4516b..0232d0808519 100644
--- a/Documentation/userspace-api/media/v4l/pixfmt-compressed.rst
+++ b/Documentation/userspace-api/media/v4l/pixfmt-compressed.rst
@@ -164,11 +164,6 @@ Compressed Formats
 	of macroblocks to decode a full corresponding frame to the matching
 	capture buffer.
 
-	.. note::
-
-	   This format is not yet part of the public kernel API and it
-	   is expected to change.
-
     * .. _V4L2-PIX-FMT-VP9:
 
       - ``V4L2_PIX_FMT_VP9``
diff --git a/include/media/vp8-ctrls.h b/include/media/vp8-ctrls.h
index ab46d775788b..50c92c5ed0eb 100644
--- a/include/media/vp8-ctrls.h
+++ b/include/media/vp8-ctrls.h
@@ -13,8 +13,6 @@
 
 #include <linux/types.h>
 
-#define V4L2_PIX_FMT_VP8_FRAME v4l2_fourcc('V', 'P', '8', 'F')
-
 #define V4L2_CID_MPEG_VIDEO_VP8_FRAME (V4L2_CID_CODEC_BASE + 2000)
 #define V4L2_CTRL_TYPE_VP8_FRAME 0x301
 
diff --git a/include/uapi/linux/videodev2.h b/include/uapi/linux/videodev2.h
index 79dbde3bcf8d..a1d903c6f9f0 100644
--- a/include/uapi/linux/videodev2.h
+++ b/include/uapi/linux/videodev2.h
@@ -694,6 +694,7 @@ struct v4l2_pix_format {
 #define V4L2_PIX_FMT_VC1_ANNEX_G v4l2_fourcc('V', 'C', '1', 'G') /* SMPTE 421M Annex G compliant stream */
 #define V4L2_PIX_FMT_VC1_ANNEX_L v4l2_fourcc('V', 'C', '1', 'L') /* SMPTE 421M Annex L compliant stream */
 #define V4L2_PIX_FMT_VP8      v4l2_fourcc('V', 'P', '8', '0') /* VP8 */
+#define V4L2_PIX_FMT_VP8_FRAME v4l2_fourcc('V', 'P', '8', 'F') /* VP8 parsed frame */
 #define V4L2_PIX_FMT_VP9      v4l2_fourcc('V', 'P', '9', '0') /* VP9 */
 #define V4L2_PIX_FMT_HEVC     v4l2_fourcc('H', 'E', 'V', 'C') /* HEVC aka H.265 */
 #define V4L2_PIX_FMT_FWHT     v4l2_fourcc('F', 'W', 'H', 'T') /* Fast Walsh Hadamard Transform (vicodec) */
-- 
cgit v1.2.3-71-gd317


From e74b504a87c110071376438564a6f7a351a215bf Mon Sep 17 00:00:00 2001
From: Ezequiel Garcia <ezequiel@collabora.com>
Date: Thu, 4 Mar 2021 15:07:52 +0100
Subject: media: uapi: Move the VP8 stateless control type out of staging

Move the VP8 stateless control types out of staging,
and re-number it to avoid any confusion.

Signed-off-by: Ezequiel Garcia <ezequiel@collabora.com>
Signed-off-by: Hans Verkuil <hverkuil-cisco@xs4all.nl>
Signed-off-by: Mauro Carvalho Chehab <mchehab+huawei@kernel.org>
---
 include/media/vp8-ctrls.h      | 1 -
 include/uapi/linux/videodev2.h | 2 ++
 2 files changed, 2 insertions(+), 1 deletion(-)

(limited to 'include/uapi/linux')

diff --git a/include/media/vp8-ctrls.h b/include/media/vp8-ctrls.h
index 50c92c5ed0eb..409a87f9e609 100644
--- a/include/media/vp8-ctrls.h
+++ b/include/media/vp8-ctrls.h
@@ -14,7 +14,6 @@
 #include <linux/types.h>
 
 #define V4L2_CID_MPEG_VIDEO_VP8_FRAME (V4L2_CID_CODEC_BASE + 2000)
-#define V4L2_CTRL_TYPE_VP8_FRAME 0x301
 
 #define V4L2_VP8_SEGMENT_FLAG_ENABLED              0x01
 #define V4L2_VP8_SEGMENT_FLAG_UPDATE_MAP           0x02
diff --git a/include/uapi/linux/videodev2.h b/include/uapi/linux/videodev2.h
index a1d903c6f9f0..611b75df7f17 100644
--- a/include/uapi/linux/videodev2.h
+++ b/include/uapi/linux/videodev2.h
@@ -1793,6 +1793,8 @@ enum v4l2_ctrl_type {
 	V4L2_CTRL_TYPE_H264_PRED_WEIGHTS    = 0x0205,
 
 	V4L2_CTRL_TYPE_FWHT_PARAMS	    = 0x0220,
+
+	V4L2_CTRL_TYPE_VP8_FRAME            = 0x0240,
 };
 
 /*  Used in the VIDIOC_QUERYCTRL ioctl for querying controls */
-- 
cgit v1.2.3-71-gd317


From 363240ce1c08875815d28276f0a793bcaedb1ee9 Mon Sep 17 00:00:00 2001
From: Ezequiel Garcia <ezequiel@collabora.com>
Date: Thu, 4 Mar 2021 15:07:55 +0100
Subject: media: uapi: move VP8 stateless controls out of staging

Until now, the VP8 V4L2 API was not exported as a public API,
and only defined in a private media header (media/vp8-ctrls.h).

The reason for this was a concern about the API not complete
and ready to support VP8 decoding hardware accelerators.

After reviewing the VP8 specification in detail, and now
that the API is able to support Cedrus and Hantro G1,
we can consider this ready.

Signed-off-by: Ezequiel Garcia <ezequiel@collabora.com>
Signed-off-by: Hans Verkuil <hverkuil-cisco@xs4all.nl>
Signed-off-by: Mauro Carvalho Chehab <mchehab+huawei@kernel.org>
---
 .../media/v4l/ext-ctrls-codec-stateless.rst        | 334 ++++++++++++++++++++
 .../userspace-api/media/v4l/ext-ctrls-codec.rst    | 339 ---------------------
 .../userspace-api/media/v4l/pixfmt-compressed.rst  |  10 +-
 .../userspace-api/media/v4l/vidioc-g-ext-ctrls.rst |   4 +
 .../userspace-api/media/v4l/vidioc-queryctrl.rst   |   6 +
 .../userspace-api/media/videodev2.h.rst.exceptions |   1 +
 drivers/media/v4l2-core/v4l2-ctrls.c               |   4 +-
 drivers/staging/media/hantro/hantro_drv.c          |   2 +-
 drivers/staging/media/hantro/hantro_g1_vp8_dec.c   |   3 +-
 .../staging/media/hantro/rk3399_vpu_hw_vp8_dec.c   |   3 +-
 drivers/staging/media/sunxi/cedrus/cedrus.c        |   2 +-
 drivers/staging/media/sunxi/cedrus/cedrus_dec.c    |   2 +-
 drivers/staging/media/sunxi/cedrus/cedrus_vp8.c    |   3 +-
 include/media/v4l2-ctrls.h                         |   1 -
 include/media/vp8-ctrls.h                          | 210 -------------
 include/uapi/linux/v4l2-controls.h                 | 195 ++++++++++++
 include/uapi/linux/videodev2.h                     |   1 +
 17 files changed, 555 insertions(+), 565 deletions(-)
 delete mode 100644 include/media/vp8-ctrls.h

(limited to 'include/uapi/linux')

diff --git a/Documentation/userspace-api/media/v4l/ext-ctrls-codec-stateless.rst b/Documentation/userspace-api/media/v4l/ext-ctrls-codec-stateless.rst
index 90c0e1d57544..3fc04daa9ffb 100644
--- a/Documentation/userspace-api/media/v4l/ext-ctrls-codec-stateless.rst
+++ b/Documentation/userspace-api/media/v4l/ext-ctrls-codec-stateless.rst
@@ -910,3 +910,337 @@ FWHT Flags
 .. raw:: latex
 
     \normalsize
+
+.. _v4l2-codec-stateless-vp8:
+
+``V4L2_CID_STATELESS_VP8_FRAME (struct)``
+    Specifies the frame parameters for the associated VP8 parsed frame data.
+    This includes the necessary parameters for
+    configuring a stateless hardware decoding pipeline for VP8.
+    The bitstream parameters are defined according to :ref:`vp8`.
+
+.. c:type:: v4l2_ctrl_vp8_frame
+
+.. raw:: latex
+
+    \small
+
+.. tabularcolumns:: |p{7.0cm}|p{4.6cm}|p{5.7cm}|
+
+.. cssclass:: longtable
+
+.. flat-table:: struct v4l2_ctrl_vp8_frame
+    :header-rows:  0
+    :stub-columns: 0
+    :widths:       1 1 2
+
+    * - struct :c:type:`v4l2_vp8_segment`
+      - ``segment``
+      - Structure with segment-based adjustments metadata.
+    * - struct :c:type:`v4l2_vp8_loop_filter`
+      - ``lf``
+      - Structure with loop filter level adjustments metadata.
+    * - struct :c:type:`v4l2_vp8_quantization`
+      - ``quant``
+      - Structure with VP8 dequantization indices metadata.
+    * - struct :c:type:`v4l2_vp8_entropy`
+      - ``entropy``
+      - Structure with VP8 entropy coder probabilities metadata.
+    * - struct :c:type:`v4l2_vp8_entropy_coder_state`
+      - ``coder_state``
+      - Structure with VP8 entropy coder state.
+    * - __u16
+      - ``width``
+      - The width of the frame. Must be set for all frames.
+    * - __u16
+      - ``height``
+      - The height of the frame. Must be set for all frames.
+    * - __u8
+      - ``horizontal_scale``
+      - Horizontal scaling factor.
+    * - __u8
+      - ``vertical_scaling factor``
+      - Vertical scale.
+    * - __u8
+      - ``version``
+      - Bitstream version.
+    * - __u8
+      - ``prob_skip_false``
+      - Indicates the probability that the macroblock is not skipped.
+    * - __u8
+      - ``prob_intra``
+      - Indicates the probability that a macroblock is intra-predicted.
+    * - __u8
+      - ``prob_last``
+      - Indicates the probability that the last reference frame is used
+        for inter-prediction
+    * - __u8
+      - ``prob_gf``
+      - Indicates the probability that the golden reference frame is used
+        for inter-prediction
+    * - __u8
+      - ``num_dct_parts``
+      - Number of DCT coefficients partitions. Must be one of: 1, 2, 4, or 8.
+    * - __u32
+      - ``first_part_size``
+      - Size of the first partition, i.e. the control partition.
+    * - __u32
+      - ``first_part_header_bits``
+      - Size in bits of the first partition header portion.
+    * - __u32
+      - ``dct_part_sizes[8]``
+      - DCT coefficients sizes.
+    * - __u64
+      - ``last_frame_ts``
+      - Timestamp for the V4L2 capture buffer to use as last reference frame, used
+        with inter-coded frames. The timestamp refers to the ``timestamp`` field in
+	struct :c:type:`v4l2_buffer`. Use the :c:func:`v4l2_timeval_to_ns()`
+	function to convert the struct :c:type:`timeval` in struct
+	:c:type:`v4l2_buffer` to a __u64.
+    * - __u64
+      - ``golden_frame_ts``
+      - Timestamp for the V4L2 capture buffer to use as last reference frame, used
+        with inter-coded frames. The timestamp refers to the ``timestamp`` field in
+	struct :c:type:`v4l2_buffer`. Use the :c:func:`v4l2_timeval_to_ns()`
+	function to convert the struct :c:type:`timeval` in struct
+	:c:type:`v4l2_buffer` to a __u64.
+    * - __u64
+      - ``alt_frame_ts``
+      - Timestamp for the V4L2 capture buffer to use as alternate reference frame, used
+        with inter-coded frames. The timestamp refers to the ``timestamp`` field in
+	struct :c:type:`v4l2_buffer`. Use the :c:func:`v4l2_timeval_to_ns()`
+	function to convert the struct :c:type:`timeval` in struct
+	:c:type:`v4l2_buffer` to a __u64.
+    * - __u64
+      - ``flags``
+      - See :ref:`Frame Flags <vp8_frame_flags>`
+
+.. raw:: latex
+
+    \normalsize
+
+.. _vp8_frame_flags:
+
+``Frame Flags``
+
+.. tabularcolumns:: |p{9.8cm}|p{0.8cm}|p{6.7cm}|
+
+.. cssclass:: longtable
+
+.. flat-table::
+    :header-rows:  0
+    :stub-columns: 0
+    :widths:       1 1 2
+
+    * - ``V4L2_VP8_FRAME_FLAG_KEY_FRAME``
+      - 0x01
+      - Indicates if the frame is a key frame.
+    * - ``V4L2_VP8_FRAME_FLAG_EXPERIMENTAL``
+      - 0x02
+      - Experimental bitstream.
+    * - ``V4L2_VP8_FRAME_FLAG_SHOW_FRAME``
+      - 0x04
+      - Show frame flag, indicates if the frame is for display.
+    * - ``V4L2_VP8_FRAME_FLAG_MB_NO_SKIP_COEFF``
+      - 0x08
+      - Enable/disable skipping of macroblocks with no non-zero coefficients.
+    * - ``V4L2_VP8_FRAME_FLAG_SIGN_BIAS_GOLDEN``
+      - 0x10
+      - Sign of motion vectors when the golden frame is referenced.
+    * - ``V4L2_VP8_FRAME_FLAG_SIGN_BIAS_ALT``
+      - 0x20
+      - Sign of motion vectors when the alt frame is referenced.
+
+.. c:type:: v4l2_vp8_entropy_coder_state
+
+.. cssclass:: longtable
+
+.. tabularcolumns:: |p{1.0cm}|p{2.0cm}|p{14.3cm}|
+
+.. flat-table:: struct v4l2_vp8_entropy_coder_state
+    :header-rows:  0
+    :stub-columns: 0
+    :widths:       1 1 2
+
+    * - __u8
+      - ``range``
+      - coder state value for "Range"
+    * - __u8
+      - ``value``
+      - coder state value for "Value"-
+    * - __u8
+      - ``bit_count``
+      - number of bits left.
+    * - __u8
+      - ``padding``
+      - Applications and drivers must set this to zero.
+
+.. c:type:: v4l2_vp8_segment
+
+.. cssclass:: longtable
+
+.. tabularcolumns:: |p{1.2cm}|p{4.0cm}|p{12.1cm}|
+
+.. flat-table:: struct v4l2_vp8_segment
+    :header-rows:  0
+    :stub-columns: 0
+    :widths:       1 1 2
+
+    * - __s8
+      - ``quant_update[4]``
+      - Signed quantizer value update.
+    * - __s8
+      - ``lf_update[4]``
+      - Signed loop filter level value update.
+    * - __u8
+      - ``segment_probs[3]``
+      - Segment probabilities.
+    * - __u8
+      - ``padding``
+      - Applications and drivers must set this to zero.
+    * - __u32
+      - ``flags``
+      - See :ref:`Segment Flags <vp8_segment_flags>`
+
+.. _vp8_segment_flags:
+
+``Segment Flags``
+
+.. raw:: latex
+
+    \small
+
+.. tabularcolumns:: |p{10cm}|p{1.0cm}|p{6.3cm}|
+
+.. flat-table::
+    :header-rows:  0
+    :stub-columns: 0
+    :widths:       1 1 2
+
+    * - ``V4L2_VP8_SEGMENT_FLAG_ENABLED``
+      - 0x01
+      - Enable/disable segment-based adjustments.
+    * - ``V4L2_VP8_SEGMENT_FLAG_UPDATE_MAP``
+      - 0x02
+      - Indicates if the macroblock segmentation map is updated in this frame.
+    * - ``V4L2_VP8_SEGMENT_FLAG_UPDATE_FEATURE_DATA``
+      - 0x04
+      - Indicates if the segment feature data is updated in this frame.
+    * - ``V4L2_VP8_SEGMENT_FLAG_DELTA_VALUE_MODE``
+      - 0x08
+      - If is set, the segment feature data mode is delta-value.
+        If cleared, it's absolute-value.
+
+.. raw:: latex
+
+    \normalsize
+
+.. c:type:: v4l2_vp8_loop_filter
+
+.. cssclass:: longtable
+
+.. tabularcolumns:: |p{1.5cm}|p{3.9cm}|p{11.9cm}|
+
+.. flat-table:: struct v4l2_vp8_loop_filter
+    :header-rows:  0
+    :stub-columns: 0
+    :widths:       1 1 2
+
+    * - __s8
+      - ``ref_frm_delta[4]``
+      - Reference adjustment (signed) delta value.
+    * - __s8
+      - ``mb_mode_delta[4]``
+      - Macroblock prediction mode adjustment (signed) delta value.
+    * - __u8
+      - ``sharpness_level``
+      - Sharpness level
+    * - __u8
+      - ``level``
+      - Filter level
+    * - __u16
+      - ``padding``
+      - Applications and drivers must set this to zero.
+    * - __u32
+      - ``flags``
+      - See :ref:`Loop Filter Flags <vp8_loop_filter_flags>`
+
+.. _vp8_loop_filter_flags:
+
+``Loop Filter Flags``
+
+.. tabularcolumns:: |p{7.0cm}|p{1.2cm}|p{9.1cm}|
+
+.. flat-table::
+    :header-rows:  0
+    :stub-columns: 0
+    :widths:       1 1 2
+
+    * - ``V4L2_VP8_LF_ADJ_ENABLE``
+      - 0x01
+      - Enable/disable macroblock-level loop filter adjustment.
+    * - ``V4L2_VP8_LF_DELTA_UPDATE``
+      - 0x02
+      - Indicates if the delta values used in an adjustment are updated.
+    * - ``V4L2_VP8_LF_FILTER_TYPE_SIMPLE``
+      - 0x04
+      - If set, indicates the filter type is simple.
+        If cleared, the filter type is normal.
+
+.. c:type:: v4l2_vp8_quantization
+
+.. tabularcolumns:: |p{1.5cm}|p{3.5cm}|p{12.3cm}|
+
+.. flat-table:: struct v4l2_vp8_quantization
+    :header-rows:  0
+    :stub-columns: 0
+    :widths:       1 1 2
+
+    * - __u8
+      - ``y_ac_qi``
+      - Luma AC coefficient table index.
+    * - __s8
+      - ``y_dc_delta``
+      - Luma DC delta vaue.
+    * - __s8
+      - ``y2_dc_delta``
+      - Y2 block DC delta value.
+    * - __s8
+      - ``y2_ac_delta``
+      - Y2 block AC delta value.
+    * - __s8
+      - ``uv_dc_delta``
+      - Chroma DC delta value.
+    * - __s8
+      - ``uv_ac_delta``
+      - Chroma AC delta value.
+    * - __u16
+      - ``padding``
+      - Applications and drivers must set this to zero.
+
+.. c:type:: v4l2_vp8_entropy
+
+.. cssclass:: longtable
+
+.. tabularcolumns:: |p{1.5cm}|p{5.8cm}|p{10.0cm}|
+
+.. flat-table:: struct v4l2_vp8_entropy
+    :header-rows:  0
+    :stub-columns: 0
+    :widths:       1 1 2
+
+    * - __u8
+      - ``coeff_probs[4][8][3][11]``
+      - Coefficient update probabilities.
+    * - __u8
+      - ``y_mode_probs[4]``
+      - Luma mode update probabilities.
+    * - __u8
+      - ``uv_mode_probs[3]``
+      - Chroma mode update probabilities.
+    * - __u8
+      - ``mv_probs[2][19]``
+      - MV decoding update probabilities.
+    * - __u8
+      - ``padding[3]``
+      - Applications and drivers must set this to zero.
diff --git a/Documentation/userspace-api/media/v4l/ext-ctrls-codec.rst b/Documentation/userspace-api/media/v4l/ext-ctrls-codec.rst
index 1fa9a54f63bf..8a48ac30bc6a 100644
--- a/Documentation/userspace-api/media/v4l/ext-ctrls-codec.rst
+++ b/Documentation/userspace-api/media/v4l/ext-ctrls-codec.rst
@@ -1798,345 +1798,6 @@ enum v4l2_mpeg_video_h264_hierarchical_coding_type -
     Quantization parameter for a P frame for FWHT. Valid range: from 1
     to 31.
 
-.. _v4l2-mpeg-vp8:
-
-``V4L2_CID_MPEG_VIDEO_VP8_FRAME (struct)``
-    Specifies the frame parameters for the associated VP8 parsed frame data.
-    This includes the necessary parameters for
-    configuring a stateless hardware decoding pipeline for VP8.
-    The bitstream parameters are defined according to :ref:`vp8`.
-
-    .. note::
-
-       This compound control is not yet part of the public kernel API and
-       it is expected to change.
-
-.. c:type:: v4l2_ctrl_vp8_frame
-
-.. raw:: latex
-
-    \small
-
-.. tabularcolumns:: |p{7.0cm}|p{4.6cm}|p{5.7cm}|
-
-.. cssclass:: longtable
-
-.. flat-table:: struct v4l2_ctrl_vp8_frame
-    :header-rows:  0
-    :stub-columns: 0
-    :widths:       1 1 2
-
-    * - struct :c:type:`v4l2_vp8_segment`
-      - ``segment``
-      - Structure with segment-based adjustments metadata.
-    * - struct :c:type:`v4l2_vp8_loop_filter`
-      - ``lf``
-      - Structure with loop filter level adjustments metadata.
-    * - struct :c:type:`v4l2_vp8_quantization`
-      - ``quant``
-      - Structure with VP8 dequantization indices metadata.
-    * - struct :c:type:`v4l2_vp8_entropy`
-      - ``entropy``
-      - Structure with VP8 entropy coder probabilities metadata.
-    * - struct :c:type:`v4l2_vp8_entropy_coder_state`
-      - ``coder_state``
-      - Structure with VP8 entropy coder state.
-    * - __u16
-      - ``width``
-      - The width of the frame. Must be set for all frames.
-    * - __u16
-      - ``height``
-      - The height of the frame. Must be set for all frames.
-    * - __u8
-      - ``horizontal_scale``
-      - Horizontal scaling factor.
-    * - __u8
-      - ``vertical_scaling factor``
-      - Vertical scale.
-    * - __u8
-      - ``version``
-      - Bitstream version.
-    * - __u8
-      - ``prob_skip_false``
-      - Indicates the probability that the macroblock is not skipped.
-    * - __u8
-      - ``prob_intra``
-      - Indicates the probability that a macroblock is intra-predicted.
-    * - __u8
-      - ``prob_last``
-      - Indicates the probability that the last reference frame is used
-        for inter-prediction
-    * - __u8
-      - ``prob_gf``
-      - Indicates the probability that the golden reference frame is used
-        for inter-prediction
-    * - __u8
-      - ``num_dct_parts``
-      - Number of DCT coefficients partitions. Must be one of: 1, 2, 4, or 8.
-    * - __u32
-      - ``first_part_size``
-      - Size of the first partition, i.e. the control partition.
-    * - __u32
-      - ``first_part_header_bits``
-      - Size in bits of the first partition header portion.
-    * - __u32
-      - ``dct_part_sizes[8]``
-      - DCT coefficients sizes.
-    * - __u64
-      - ``last_frame_ts``
-      - Timestamp for the V4L2 capture buffer to use as last reference frame, used
-        with inter-coded frames. The timestamp refers to the ``timestamp`` field in
-	struct :c:type:`v4l2_buffer`. Use the :c:func:`v4l2_timeval_to_ns()`
-	function to convert the struct :c:type:`timeval` in struct
-	:c:type:`v4l2_buffer` to a __u64.
-    * - __u64
-      - ``golden_frame_ts``
-      - Timestamp for the V4L2 capture buffer to use as last reference frame, used
-        with inter-coded frames. The timestamp refers to the ``timestamp`` field in
-	struct :c:type:`v4l2_buffer`. Use the :c:func:`v4l2_timeval_to_ns()`
-	function to convert the struct :c:type:`timeval` in struct
-	:c:type:`v4l2_buffer` to a __u64.
-    * - __u64
-      - ``alt_frame_ts``
-      - Timestamp for the V4L2 capture buffer to use as alternate reference frame, used
-        with inter-coded frames. The timestamp refers to the ``timestamp`` field in
-	struct :c:type:`v4l2_buffer`. Use the :c:func:`v4l2_timeval_to_ns()`
-	function to convert the struct :c:type:`timeval` in struct
-	:c:type:`v4l2_buffer` to a __u64.
-    * - __u64
-      - ``flags``
-      - See :ref:`Frame Flags <vp8_frame_flags>`
-
-.. raw:: latex
-
-    \normalsize
-
-.. _vp8_frame_flags:
-
-``Frame Flags``
-
-.. tabularcolumns:: |p{9.8cm}|p{0.8cm}|p{6.7cm}|
-
-.. cssclass:: longtable
-
-.. flat-table::
-    :header-rows:  0
-    :stub-columns: 0
-    :widths:       1 1 2
-
-    * - ``V4L2_VP8_FRAME_FLAG_KEY_FRAME``
-      - 0x01
-      - Indicates if the frame is a key frame.
-    * - ``V4L2_VP8_FRAME_FLAG_EXPERIMENTAL``
-      - 0x02
-      - Experimental bitstream.
-    * - ``V4L2_VP8_FRAME_FLAG_SHOW_FRAME``
-      - 0x04
-      - Show frame flag, indicates if the frame is for display.
-    * - ``V4L2_VP8_FRAME_FLAG_MB_NO_SKIP_COEFF``
-      - 0x08
-      - Enable/disable skipping of macroblocks with no non-zero coefficients.
-    * - ``V4L2_VP8_FRAME_FLAG_SIGN_BIAS_GOLDEN``
-      - 0x10
-      - Sign of motion vectors when the golden frame is referenced.
-    * - ``V4L2_VP8_FRAME_FLAG_SIGN_BIAS_ALT``
-      - 0x20
-      - Sign of motion vectors when the alt frame is referenced.
-
-.. c:type:: v4l2_vp8_entropy_coder_state
-
-.. cssclass:: longtable
-
-.. tabularcolumns:: |p{1.0cm}|p{2.0cm}|p{14.3cm}|
-
-.. flat-table:: struct v4l2_vp8_entropy_coder_state
-    :header-rows:  0
-    :stub-columns: 0
-    :widths:       1 1 2
-
-    * - __u8
-      - ``range``
-      -
-    * - __u8
-      - ``value``
-      -
-    * - __u8
-      - ``bit_count``
-      -
-    * - __u8
-      - ``padding``
-      - Applications and drivers must set this to zero.
-
-.. c:type:: v4l2_vp8_segment
-
-.. cssclass:: longtable
-
-.. tabularcolumns:: |p{1.2cm}|p{4.0cm}|p{12.1cm}|
-
-.. flat-table:: struct v4l2_vp8_segment
-    :header-rows:  0
-    :stub-columns: 0
-    :widths:       1 1 2
-
-    * - __s8
-      - ``quant_update[4]``
-      - Signed quantizer value update.
-    * - __s8
-      - ``lf_update[4]``
-      - Signed loop filter level value update.
-    * - __u8
-      - ``segment_probs[3]``
-      - Segment probabilities.
-    * - __u8
-      - ``padding``
-      - Applications and drivers must set this to zero.
-    * - __u32
-      - ``flags``
-      - See :ref:`Segment Flags <vp8_segment_flags>`
-
-.. _vp8_segment_flags:
-
-``Segment Flags``
-
-.. raw:: latex
-
-    \small
-
-.. tabularcolumns:: |p{10cm}|p{1.0cm}|p{6.3cm}|
-
-.. flat-table::
-    :header-rows:  0
-    :stub-columns: 0
-    :widths:       1 1 2
-
-    * - ``V4L2_VP8_SEGMENT_FLAG_ENABLED``
-      - 0x01
-      - Enable/disable segment-based adjustments.
-    * - ``V4L2_VP8_SEGMENT_FLAG_UPDATE_MAP``
-      - 0x02
-      - Indicates if the macroblock segmentation map is updated in this frame.
-    * - ``V4L2_VP8_SEGMENT_FLAG_UPDATE_FEATURE_DATA``
-      - 0x04
-      - Indicates if the segment feature data is updated in this frame.
-    * - ``V4L2_VP8_SEGMENT_FLAG_DELTA_VALUE_MODE``
-      - 0x08
-      - If is set, the segment feature data mode is delta-value.
-        If cleared, it's absolute-value.
-
-.. raw:: latex
-
-    \normalsize
-
-.. c:type:: v4l2_vp8_loop_filter
-
-.. cssclass:: longtable
-
-.. tabularcolumns:: |p{1.5cm}|p{3.9cm}|p{11.9cm}|
-
-.. flat-table:: struct v4l2_vp8_loop_filter
-    :header-rows:  0
-    :stub-columns: 0
-    :widths:       1 1 2
-
-    * - __s8
-      - ``ref_frm_delta[4]``
-      - Reference adjustment (signed) delta value.
-    * - __s8
-      - ``mb_mode_delta[4]``
-      - Macroblock prediction mode adjustment (signed) delta value.
-    * - __u8
-      - ``sharpness_level``
-      - Sharpness level
-    * - __u8
-      - ``level``
-      - Filter level
-    * - __u16
-      - ``padding``
-      - Applications and drivers must set this to zero.
-    * - __u32
-      - ``flags``
-      - See :ref:`Loop Filter Flags <vp8_loop_filter_flags>`
-
-.. _vp8_loop_filter_flags:
-
-``Loop Filter Flags``
-
-.. tabularcolumns:: |p{7.0cm}|p{1.2cm}|p{9.1cm}|
-
-.. flat-table::
-    :header-rows:  0
-    :stub-columns: 0
-    :widths:       1 1 2
-
-    * - ``V4L2_VP8_LF_ADJ_ENABLE``
-      - 0x01
-      - Enable/disable macroblock-level loop filter adjustment.
-    * - ``V4L2_VP8_LF_DELTA_UPDATE``
-      - 0x02
-      - Indicates if the delta values used in an adjustment are updated.
-    * - ``V4L2_VP8_LF_FILTER_TYPE_SIMPLE``
-      - 0x04
-      - If set, indicates the filter type is simple.
-        If cleared, the filter type is normal.
-
-.. c:type:: v4l2_vp8_quantization
-
-.. tabularcolumns:: |p{1.5cm}|p{3.5cm}|p{12.3cm}|
-
-.. flat-table:: struct v4l2_vp8_quantization
-    :header-rows:  0
-    :stub-columns: 0
-    :widths:       1 1 2
-
-    * - __u8
-      - ``y_ac_qi``
-      - Luma AC coefficient table index.
-    * - __s8
-      - ``y_dc_delta``
-      - Luma DC delta vaue.
-    * - __s8
-      - ``y2_dc_delta``
-      - Y2 block DC delta value.
-    * - __s8
-      - ``y2_ac_delta``
-      - Y2 block AC delta value.
-    * - __s8
-      - ``uv_dc_delta``
-      - Chroma DC delta value.
-    * - __s8
-      - ``uv_ac_delta``
-      - Chroma AC delta value.
-    * - __u16
-      - ``padding``
-      - Applications and drivers must set this to zero.
-
-.. c:type:: v4l2_vp8_entropy
-
-.. cssclass:: longtable
-
-.. tabularcolumns:: |p{1.5cm}|p{5.8cm}|p{10.0cm}|
-
-.. flat-table:: struct v4l2_vp8_entropy
-    :header-rows:  0
-    :stub-columns: 0
-    :widths:       1 1 2
-
-    * - __u8
-      - ``coeff_probs[4][8][3][11]``
-      - Coefficient update probabilities.
-    * - __u8
-      - ``y_mode_probs[4]``
-      - Luma mode update probabilities.
-    * - __u8
-      - ``uv_mode_probs[3]``
-      - Chroma mode update probabilities.
-    * - __u8
-      - ``mv_probs[2][19]``
-      - MV decoding update probabilities.
-    * - __u8
-      - ``padding[3]``
-      - Applications and drivers must set this to zero.
-
 .. raw:: latex
 
     \normalsize
diff --git a/Documentation/userspace-api/media/v4l/pixfmt-compressed.rst b/Documentation/userspace-api/media/v4l/pixfmt-compressed.rst
index 0232d0808519..6dba70da822b 100644
--- a/Documentation/userspace-api/media/v4l/pixfmt-compressed.rst
+++ b/Documentation/userspace-api/media/v4l/pixfmt-compressed.rst
@@ -153,12 +153,12 @@ Compressed Formats
 
       - ``V4L2_PIX_FMT_VP8_FRAME``
       - 'VP8F'
-      - VP8 parsed frame, as extracted from the container.
-	This format is adapted for stateless video decoders that implement a
-	VP8 pipeline (using the :ref:`mem2mem` and :ref:`media-request-api`).
+      - VP8 parsed frame, including the frame header, as extracted from the container.
+	This format is adapted for stateless video decoders that implement an
+	VP8 pipeline with the :ref:`stateless_decoder`.
 	Metadata associated with the frame to decode is required to be passed
-	through the ``V4L2_CID_MPEG_VIDEO_VP8_FRAME`` control.
-	See the :ref:`associated Codec Control IDs <v4l2-mpeg-vp8>`.
+	through the ``V4L2_CID_STATELESS_VP8_FRAME`` control.
+	See the :ref:`associated Codec Control IDs <v4l2-codec-stateless-vp8>`.
 	Exactly one output and one capture buffer must be provided for use with
 	this pixel format. The output buffer must contain the appropriate number
 	of macroblocks to decode a full corresponding frame to the matching
diff --git a/Documentation/userspace-api/media/v4l/vidioc-g-ext-ctrls.rst b/Documentation/userspace-api/media/v4l/vidioc-g-ext-ctrls.rst
index 01773f01c4a7..e5c4e0a175a7 100644
--- a/Documentation/userspace-api/media/v4l/vidioc-g-ext-ctrls.rst
+++ b/Documentation/userspace-api/media/v4l/vidioc-g-ext-ctrls.rst
@@ -216,6 +216,10 @@ still cause this situation.
       - ``p_fwht_params``
       - A pointer to a struct :c:type:`v4l2_ctrl_fwht_params`. Valid if this control is
         of type ``V4L2_CTRL_TYPE_FWHT_PARAMS``.
+    * - struct :c:type:`v4l2_ctrl_vp8_frame` *
+      - ``p_vp8_frame``
+      - A pointer to a struct :c:type:`v4l2_ctrl_vp8_frame`. Valid if this control is
+        of type ``V4L2_CTRL_TYPE_VP8_FRAME``.
     * - void *
       - ``ptr``
       - A pointer to a compound type which can be an N-dimensional array
diff --git a/Documentation/userspace-api/media/v4l/vidioc-queryctrl.rst b/Documentation/userspace-api/media/v4l/vidioc-queryctrl.rst
index 9af43f913694..8a285daedc6a 100644
--- a/Documentation/userspace-api/media/v4l/vidioc-queryctrl.rst
+++ b/Documentation/userspace-api/media/v4l/vidioc-queryctrl.rst
@@ -489,6 +489,12 @@ See also the examples in :ref:`control`.
       - n/a
       - A struct :c:type:`v4l2_ctrl_hevc_slice_params`, containing HEVC
 	slice parameters for stateless video decoders.
+    * - ``V4L2_CTRL_TYPE_VP8_FRAME``
+      - n/a
+      - n/a
+      - n/a
+      - A struct :c:type:`v4l2_ctrl_vp8_frame`, containing VP8
+	frame parameters for stateless video decoders.
 
 .. raw:: latex
 
diff --git a/Documentation/userspace-api/media/videodev2.h.rst.exceptions b/Documentation/userspace-api/media/videodev2.h.rst.exceptions
index 0ed170c6e720..afb1552b2d39 100644
--- a/Documentation/userspace-api/media/videodev2.h.rst.exceptions
+++ b/Documentation/userspace-api/media/videodev2.h.rst.exceptions
@@ -147,6 +147,7 @@ replace symbol V4L2_CTRL_TYPE_HEVC_PPS :c:type:`v4l2_ctrl_type`
 replace symbol V4L2_CTRL_TYPE_HEVC_SLICE_PARAMS :c:type:`v4l2_ctrl_type`
 replace symbol V4L2_CTRL_TYPE_AREA :c:type:`v4l2_ctrl_type`
 replace symbol V4L2_CTRL_TYPE_FWHT_PARAMS :c:type:`v4l2_ctrl_type`
+replace symbol V4L2_CTRL_TYPE_VP8_FRAME :c:type:`v4l2_ctrl_type`
 
 # V4L2 capability defines
 replace define V4L2_CAP_VIDEO_CAPTURE device-capabilities
diff --git a/drivers/media/v4l2-core/v4l2-ctrls.c b/drivers/media/v4l2-core/v4l2-ctrls.c
index b3fc293bb8f0..36a7983f7206 100644
--- a/drivers/media/v4l2-core/v4l2-ctrls.c
+++ b/drivers/media/v4l2-core/v4l2-ctrls.c
@@ -974,7 +974,6 @@ const char *v4l2_ctrl_get_name(u32 id)
 	case V4L2_CID_MPEG_VIDEO_VP8_PROFILE:			return "VP8 Profile";
 	case V4L2_CID_MPEG_VIDEO_VP9_PROFILE:			return "VP9 Profile";
 	case V4L2_CID_MPEG_VIDEO_VP9_LEVEL:			return "VP9 Level";
-	case V4L2_CID_MPEG_VIDEO_VP8_FRAME:			return "VP8 Frame Parameters";
 
 	/* HEVC controls */
 	case V4L2_CID_MPEG_VIDEO_HEVC_I_FRAME_QP:		return "HEVC I-Frame QP Value";
@@ -1204,6 +1203,7 @@ const char *v4l2_ctrl_get_name(u32 id)
 	case V4L2_CID_STATELESS_H264_SLICE_PARAMS:		return "H264 Slice Parameters";
 	case V4L2_CID_STATELESS_H264_DECODE_PARAMS:		return "H264 Decode Parameters";
 	case V4L2_CID_STATELESS_FWHT_PARAMS:			return "FWHT Stateless Parameters";
+	case V4L2_CID_STATELESS_VP8_FRAME:			return "VP8 Frame Parameters";
 	default:
 		return NULL;
 	}
@@ -1476,7 +1476,7 @@ void v4l2_ctrl_fill(u32 id, const char **name, enum v4l2_ctrl_type *type,
 	case V4L2_CID_STATELESS_H264_PRED_WEIGHTS:
 		*type = V4L2_CTRL_TYPE_H264_PRED_WEIGHTS;
 		break;
-	case V4L2_CID_MPEG_VIDEO_VP8_FRAME:
+	case V4L2_CID_STATELESS_VP8_FRAME:
 		*type = V4L2_CTRL_TYPE_VP8_FRAME;
 		break;
 	case V4L2_CID_MPEG_VIDEO_HEVC_SPS:
diff --git a/drivers/staging/media/hantro/hantro_drv.c b/drivers/staging/media/hantro/hantro_drv.c
index f1e7f0732e27..595e82a82728 100644
--- a/drivers/staging/media/hantro/hantro_drv.c
+++ b/drivers/staging/media/hantro/hantro_drv.c
@@ -299,7 +299,7 @@ static const struct hantro_ctrl controls[] = {
 	}, {
 		.codec = HANTRO_VP8_DECODER,
 		.cfg = {
-			.id = V4L2_CID_MPEG_VIDEO_VP8_FRAME,
+			.id = V4L2_CID_STATELESS_VP8_FRAME,
 		},
 	}, {
 		.codec = HANTRO_H264_DECODER,
diff --git a/drivers/staging/media/hantro/hantro_g1_vp8_dec.c b/drivers/staging/media/hantro/hantro_g1_vp8_dec.c
index 7d6270e26297..57002ba70176 100644
--- a/drivers/staging/media/hantro/hantro_g1_vp8_dec.c
+++ b/drivers/staging/media/hantro/hantro_g1_vp8_dec.c
@@ -10,7 +10,6 @@
  */
 
 #include <media/v4l2-mem2mem.h>
-#include <media/vp8-ctrls.h>
 
 #include "hantro_hw.h"
 #include "hantro.h"
@@ -437,7 +436,7 @@ void hantro_g1_vp8_dec_run(struct hantro_ctx *ctx)
 
 	hantro_start_prepare_run(ctx);
 
-	hdr = hantro_get_ctrl(ctx, V4L2_CID_MPEG_VIDEO_VP8_FRAME);
+	hdr = hantro_get_ctrl(ctx, V4L2_CID_STATELESS_VP8_FRAME);
 	if (WARN_ON(!hdr))
 		return;
 
diff --git a/drivers/staging/media/hantro/rk3399_vpu_hw_vp8_dec.c b/drivers/staging/media/hantro/rk3399_vpu_hw_vp8_dec.c
index caa1435008d4..8661a3cc1e6b 100644
--- a/drivers/staging/media/hantro/rk3399_vpu_hw_vp8_dec.c
+++ b/drivers/staging/media/hantro/rk3399_vpu_hw_vp8_dec.c
@@ -13,7 +13,6 @@
  */
 
 #include <media/v4l2-mem2mem.h>
-#include <media/vp8-ctrls.h>
 
 #include "hantro_hw.h"
 #include "hantro.h"
@@ -515,7 +514,7 @@ void rk3399_vpu_vp8_dec_run(struct hantro_ctx *ctx)
 
 	hantro_start_prepare_run(ctx);
 
-	hdr = hantro_get_ctrl(ctx, V4L2_CID_MPEG_VIDEO_VP8_FRAME);
+	hdr = hantro_get_ctrl(ctx, V4L2_CID_STATELESS_VP8_FRAME);
 	if (WARN_ON(!hdr))
 		return;
 
diff --git a/drivers/staging/media/sunxi/cedrus/cedrus.c b/drivers/staging/media/sunxi/cedrus/cedrus.c
index e233feb49c9c..92812d1a39d4 100644
--- a/drivers/staging/media/sunxi/cedrus/cedrus.c
+++ b/drivers/staging/media/sunxi/cedrus/cedrus.c
@@ -147,7 +147,7 @@ static const struct cedrus_control cedrus_controls[] = {
 	},
 	{
 		.cfg = {
-			.id	= V4L2_CID_MPEG_VIDEO_VP8_FRAME,
+			.id	= V4L2_CID_STATELESS_VP8_FRAME,
 		},
 		.codec		= CEDRUS_CODEC_VP8,
 	},
diff --git a/drivers/staging/media/sunxi/cedrus/cedrus_dec.c b/drivers/staging/media/sunxi/cedrus/cedrus_dec.c
index b0a0559cd0eb..d696b3ec70c0 100644
--- a/drivers/staging/media/sunxi/cedrus/cedrus_dec.c
+++ b/drivers/staging/media/sunxi/cedrus/cedrus_dec.c
@@ -72,7 +72,7 @@ void cedrus_device_run(void *priv)
 
 	case V4L2_PIX_FMT_VP8_FRAME:
 		run.vp8.frame_params = cedrus_find_control_data(ctx,
-			V4L2_CID_MPEG_VIDEO_VP8_FRAME);
+			V4L2_CID_STATELESS_VP8_FRAME);
 		break;
 
 	default:
diff --git a/drivers/staging/media/sunxi/cedrus/cedrus_vp8.c b/drivers/staging/media/sunxi/cedrus/cedrus_vp8.c
index 12b233d28db3..f4016684b32d 100644
--- a/drivers/staging/media/sunxi/cedrus/cedrus_vp8.c
+++ b/drivers/staging/media/sunxi/cedrus/cedrus_vp8.c
@@ -423,7 +423,8 @@ static const u8 prob_table_init[] = {
  * This table is a copy of k_mv_entropy_update_probs from the VP8
  * specification.
  *
- * FIXME: If any other driver uses it, move this table to media/vp8-ctrls.h
+ * FIXME: If any other driver uses it, we can consider moving
+ * this table so it can be shared.
  */
 static const u8 k_mv_entropy_update_probs[2][V4L2_VP8_MV_PROB_CNT] = {
 	{ 237, 246, 253, 253, 254, 254, 254, 254, 254,
diff --git a/include/media/v4l2-ctrls.h b/include/media/v4l2-ctrls.h
index aa06ff1695fe..dea65c92e86b 100644
--- a/include/media/v4l2-ctrls.h
+++ b/include/media/v4l2-ctrls.h
@@ -18,7 +18,6 @@
  * This will move to the public headers once this API is fully stable.
  */
 #include <media/mpeg2-ctrls.h>
-#include <media/vp8-ctrls.h>
 #include <media/hevc-ctrls.h>
 
 /* forward references */
diff --git a/include/media/vp8-ctrls.h b/include/media/vp8-ctrls.h
deleted file mode 100644
index d19bf25e836d..000000000000
--- a/include/media/vp8-ctrls.h
+++ /dev/null
@@ -1,210 +0,0 @@
-/* SPDX-License-Identifier: GPL-2.0 */
-/*
- * These are the VP8 state controls for use with stateless VP8
- * codec drivers.
- *
- * It turns out that these structs are not stable yet and will undergo
- * more changes. So keep them private until they are stable and ready to
- * become part of the official public API.
- */
-
-#ifndef _VP8_CTRLS_H_
-#define _VP8_CTRLS_H_
-
-#include <linux/types.h>
-
-#define V4L2_CID_MPEG_VIDEO_VP8_FRAME (V4L2_CID_CODEC_BASE + 2000)
-
-#define V4L2_VP8_SEGMENT_FLAG_ENABLED              0x01
-#define V4L2_VP8_SEGMENT_FLAG_UPDATE_MAP           0x02
-#define V4L2_VP8_SEGMENT_FLAG_UPDATE_FEATURE_DATA  0x04
-#define V4L2_VP8_SEGMENT_FLAG_DELTA_VALUE_MODE     0x08
-
-/**
- * struct v4l2_vp8_segment - VP8 segment-based adjustments parameters
- *
- * @quant_update: update values for the segment quantizer.
- * @lf_update: update values for the loop filter level.
- * @segment_probs: branch probabilities of the segment_id decoding tree.
- * @padding: padding field. Should be zeroed by applications.
- * @flags: see V4L2_VP8_SEGMENT_FLAG_{}.
- *
- * This structure contains segment-based adjustments related parameters.
- * See the 'update_segmentation()' part of the frame header syntax,
- * and section '9.3. Segment-Based Adjustments' of the VP8 specification
- * for more details.
- */
-struct v4l2_vp8_segment {
-	__s8 quant_update[4];
-	__s8 lf_update[4];
-	__u8 segment_probs[3];
-	__u8 padding;
-	__u32 flags;
-};
-
-#define V4L2_VP8_LF_ADJ_ENABLE	0x01
-#define V4L2_VP8_LF_DELTA_UPDATE	0x02
-#define V4L2_VP8_LF_FILTER_TYPE_SIMPLE	0x04
-
-/**
- * struct v4l2_vp8_loop_filter - VP8 loop filter parameters
- *
- * @ref_frm_delta: Reference frame signed delta values.
- * @mb_mode_delta: MB prediction mode signed delta values.
- * @sharpness_level: matches sharpness_level syntax element.
- * @level: matches loop_filter_level syntax element.
- * @padding: padding field. Should be zeroed by applications.
- * @flags: see V4L2_VP8_LF_FLAG_{}.
- *
- * This structure contains loop filter related parameters.
- * See the 'mb_lf_adjustments()' part of the frame header syntax,
- * and section '9.4. Loop Filter Type and Levels' of the VP8 specification
- * for more details.
- */
-struct v4l2_vp8_loop_filter {
-	__s8 ref_frm_delta[4];
-	__s8 mb_mode_delta[4];
-	__u8 sharpness_level;
-	__u8 level;
-	__u16 padding;
-	__u32 flags;
-};
-
-/**
- * struct v4l2_vp8_quantization - VP8 quantizattion indices
- *
- * @y_ac_qi: luma AC coefficient table index.
- * @y_dc_delta: luma DC delta vaue.
- * @y2_dc_delta: y2 block DC delta value.
- * @y2_ac_delta: y2 block AC delta value.
- * @uv_dc_delta: chroma DC delta value.
- * @uv_ac_delta: chroma AC delta value.
- * @padding: padding field. Should be zeroed by applications.
-
- * This structure contains the quantization indices present
- * in 'quant_indices()' part of the frame header syntax.
- * See section '9.6. Dequantization Indices' of the VP8 specification
- * for more details.
- */
-struct v4l2_vp8_quantization {
-	__u8 y_ac_qi;
-	__s8 y_dc_delta;
-	__s8 y2_dc_delta;
-	__s8 y2_ac_delta;
-	__s8 uv_dc_delta;
-	__s8 uv_ac_delta;
-	__u16 padding;
-};
-
-#define V4L2_VP8_COEFF_PROB_CNT 11
-#define V4L2_VP8_MV_PROB_CNT 19
-
-/**
- * struct v4l2_vp8_entropy - VP8 update probabilities
- *
- * @coeff_probs: coefficient probability update values.
- * @y_mode_probs: luma intra-prediction probabilities.
- * @uv_mode_probs: chroma intra-prediction probabilities.
- * @mv_probs: mv decoding probability.
- * @padding: padding field. Should be zeroed by applications.
- *
- * This structure contains the update probabilities present in
- * 'token_prob_update()' and 'mv_prob_update()' part of the frame header.
- * See section '17.2. Probability Updates' of the VP8 specification
- * for more details.
- */
-struct v4l2_vp8_entropy {
-	__u8 coeff_probs[4][8][3][V4L2_VP8_COEFF_PROB_CNT];
-	__u8 y_mode_probs[4];
-	__u8 uv_mode_probs[3];
-	__u8 mv_probs[2][V4L2_VP8_MV_PROB_CNT];
-	__u8 padding[3];
-};
-
-/**
- * struct v4l2_vp8_entropy_coder_state - VP8 boolean coder state
- *
- * @range: coder state value for "Range"
- * @value: coder state value for "Value"
- * @bit_count: number of bits left in range "Value".
- * @padding: padding field. Should be zeroed by applications.
- *
- * This structure contains the state for the boolean coder, as
- * explained in section '7. Boolean Entropy Decoder' of the VP8 specification.
- */
-struct v4l2_vp8_entropy_coder_state {
-	__u8 range;
-	__u8 value;
-	__u8 bit_count;
-	__u8 padding;
-};
-
-#define V4L2_VP8_FRAME_FLAG_KEY_FRAME		0x01
-#define V4L2_VP8_FRAME_FLAG_EXPERIMENTAL		0x02
-#define V4L2_VP8_FRAME_FLAG_SHOW_FRAME		0x04
-#define V4L2_VP8_FRAME_FLAG_MB_NO_SKIP_COEFF	0x08
-#define V4L2_VP8_FRAME_FLAG_SIGN_BIAS_GOLDEN	0x10
-#define V4L2_VP8_FRAME_FLAG_SIGN_BIAS_ALT	0x20
-
-#define V4L2_VP8_FRAME_IS_KEY_FRAME(hdr) \
-	(!!((hdr)->flags & V4L2_VP8_FRAME_FLAG_KEY_FRAME))
-
-/**
- * struct v4l2_vp8_frame - VP8 frame parameters
- *
- * @seg: segmentation parameters. See &v4l2_vp8_segment for more details
- * @lf: loop filter parameters. See &v4l2_vp8_loop_filter for more details
- * @quant: quantization parameters. See &v4l2_vp8_quantization for more details
- * @probs: probabilities. See &v4l2_vp9_probabilities for more details
- * @width: frame width.
- * @height: frame height.
- * @horizontal_scale: horizontal scaling factor.
- * @vertical_scale: vertical scaling factor.
- * @version: bitstream version.
- * @prob_skip_false: frame header syntax element.
- * @prob_intra: frame header syntax element.
- * @prob_last: frame header syntax element.
- * @prob_gf: frame header syntax element.
- * @num_dct_parts: number of DCT coefficients partitions.
- * @first_part_size: size of the first partition, i.e. the control partition.
- * @first_part_header_bits: size in bits of the first partition header portion.
- * @dct_part_sizes: DCT coefficients sizes.
- * @last_frame_ts: "last" reference buffer timestamp.
- * The timestamp refers to the timestamp field in struct v4l2_buffer.
- * Use v4l2_timeval_to_ns() to convert the struct timeval to a __u64.
- * @golden_frame_ts: "golden" reference buffer timestamp.
- * @alt_frame_ts: "alt" reference buffer timestamp.
- * @flags: see V4L2_VP8_FRAME_FLAG_{}.
- */
-struct v4l2_ctrl_vp8_frame {
-	struct v4l2_vp8_segment segment;
-	struct v4l2_vp8_loop_filter lf;
-	struct v4l2_vp8_quantization quant;
-	struct v4l2_vp8_entropy entropy;
-	struct v4l2_vp8_entropy_coder_state coder_state;
-
-	__u16 width;
-	__u16 height;
-
-	__u8 horizontal_scale;
-	__u8 vertical_scale;
-
-	__u8 version;
-	__u8 prob_skip_false;
-	__u8 prob_intra;
-	__u8 prob_last;
-	__u8 prob_gf;
-	__u8 num_dct_parts;
-
-	__u32 first_part_size;
-	__u32 first_part_header_bits;
-	__u32 dct_part_sizes[8];
-
-	__u64 last_frame_ts;
-	__u64 golden_frame_ts;
-	__u64 alt_frame_ts;
-
-	__u64 flags;
-};
-
-#endif
diff --git a/include/uapi/linux/v4l2-controls.h b/include/uapi/linux/v4l2-controls.h
index 6f8c08507bf4..49c7761aae82 100644
--- a/include/uapi/linux/v4l2-controls.h
+++ b/include/uapi/linux/v4l2-controls.h
@@ -1661,6 +1661,201 @@ struct v4l2_ctrl_fwht_params {
 	__u32 quantization;
 };
 
+/* Stateless VP8 control */
+
+#define V4L2_VP8_SEGMENT_FLAG_ENABLED              0x01
+#define V4L2_VP8_SEGMENT_FLAG_UPDATE_MAP           0x02
+#define V4L2_VP8_SEGMENT_FLAG_UPDATE_FEATURE_DATA  0x04
+#define V4L2_VP8_SEGMENT_FLAG_DELTA_VALUE_MODE     0x08
+
+/**
+ * struct v4l2_vp8_segment - VP8 segment-based adjustments parameters
+ *
+ * @quant_update: update values for the segment quantizer.
+ * @lf_update: update values for the loop filter level.
+ * @segment_probs: branch probabilities of the segment_id decoding tree.
+ * @padding: padding field. Should be zeroed by applications.
+ * @flags: see V4L2_VP8_SEGMENT_FLAG_{}.
+ *
+ * This structure contains segment-based adjustments related parameters.
+ * See the 'update_segmentation()' part of the frame header syntax,
+ * and section '9.3. Segment-Based Adjustments' of the VP8 specification
+ * for more details.
+ */
+struct v4l2_vp8_segment {
+	__s8 quant_update[4];
+	__s8 lf_update[4];
+	__u8 segment_probs[3];
+	__u8 padding;
+	__u32 flags;
+};
+
+#define V4L2_VP8_LF_ADJ_ENABLE	0x01
+#define V4L2_VP8_LF_DELTA_UPDATE	0x02
+#define V4L2_VP8_LF_FILTER_TYPE_SIMPLE	0x04
+
+/**
+ * struct v4l2_vp8_loop_filter - VP8 loop filter parameters
+ *
+ * @ref_frm_delta: Reference frame signed delta values.
+ * @mb_mode_delta: MB prediction mode signed delta values.
+ * @sharpness_level: matches sharpness_level syntax element.
+ * @level: matches loop_filter_level syntax element.
+ * @padding: padding field. Should be zeroed by applications.
+ * @flags: see V4L2_VP8_LF_FLAG_{}.
+ *
+ * This structure contains loop filter related parameters.
+ * See the 'mb_lf_adjustments()' part of the frame header syntax,
+ * and section '9.4. Loop Filter Type and Levels' of the VP8 specification
+ * for more details.
+ */
+struct v4l2_vp8_loop_filter {
+	__s8 ref_frm_delta[4];
+	__s8 mb_mode_delta[4];
+	__u8 sharpness_level;
+	__u8 level;
+	__u16 padding;
+	__u32 flags;
+};
+
+/**
+ * struct v4l2_vp8_quantization - VP8 quantizattion indices
+ *
+ * @y_ac_qi: luma AC coefficient table index.
+ * @y_dc_delta: luma DC delta vaue.
+ * @y2_dc_delta: y2 block DC delta value.
+ * @y2_ac_delta: y2 block AC delta value.
+ * @uv_dc_delta: chroma DC delta value.
+ * @uv_ac_delta: chroma AC delta value.
+ * @padding: padding field. Should be zeroed by applications.
+
+ * This structure contains the quantization indices present
+ * in 'quant_indices()' part of the frame header syntax.
+ * See section '9.6. Dequantization Indices' of the VP8 specification
+ * for more details.
+ */
+struct v4l2_vp8_quantization {
+	__u8 y_ac_qi;
+	__s8 y_dc_delta;
+	__s8 y2_dc_delta;
+	__s8 y2_ac_delta;
+	__s8 uv_dc_delta;
+	__s8 uv_ac_delta;
+	__u16 padding;
+};
+
+#define V4L2_VP8_COEFF_PROB_CNT 11
+#define V4L2_VP8_MV_PROB_CNT 19
+
+/**
+ * struct v4l2_vp8_entropy - VP8 update probabilities
+ *
+ * @coeff_probs: coefficient probability update values.
+ * @y_mode_probs: luma intra-prediction probabilities.
+ * @uv_mode_probs: chroma intra-prediction probabilities.
+ * @mv_probs: mv decoding probability.
+ * @padding: padding field. Should be zeroed by applications.
+ *
+ * This structure contains the update probabilities present in
+ * 'token_prob_update()' and 'mv_prob_update()' part of the frame header.
+ * See section '17.2. Probability Updates' of the VP8 specification
+ * for more details.
+ */
+struct v4l2_vp8_entropy {
+	__u8 coeff_probs[4][8][3][V4L2_VP8_COEFF_PROB_CNT];
+	__u8 y_mode_probs[4];
+	__u8 uv_mode_probs[3];
+	__u8 mv_probs[2][V4L2_VP8_MV_PROB_CNT];
+	__u8 padding[3];
+};
+
+/**
+ * struct v4l2_vp8_entropy_coder_state - VP8 boolean coder state
+ *
+ * @range: coder state value for "Range"
+ * @value: coder state value for "Value"
+ * @bit_count: number of bits left in range "Value".
+ * @padding: padding field. Should be zeroed by applications.
+ *
+ * This structure contains the state for the boolean coder, as
+ * explained in section '7. Boolean Entropy Decoder' of the VP8 specification.
+ */
+struct v4l2_vp8_entropy_coder_state {
+	__u8 range;
+	__u8 value;
+	__u8 bit_count;
+	__u8 padding;
+};
+
+#define V4L2_VP8_FRAME_FLAG_KEY_FRAME		0x01
+#define V4L2_VP8_FRAME_FLAG_EXPERIMENTAL		0x02
+#define V4L2_VP8_FRAME_FLAG_SHOW_FRAME		0x04
+#define V4L2_VP8_FRAME_FLAG_MB_NO_SKIP_COEFF	0x08
+#define V4L2_VP8_FRAME_FLAG_SIGN_BIAS_GOLDEN	0x10
+#define V4L2_VP8_FRAME_FLAG_SIGN_BIAS_ALT	0x20
+
+#define V4L2_VP8_FRAME_IS_KEY_FRAME(hdr) \
+	(!!((hdr)->flags & V4L2_VP8_FRAME_FLAG_KEY_FRAME))
+
+#define V4L2_CID_STATELESS_VP8_FRAME (V4L2_CID_CODEC_STATELESS_BASE + 200)
+/**
+ * struct v4l2_vp8_frame - VP8 frame parameters
+ *
+ * @seg: segmentation parameters. See &v4l2_vp8_segment for more details
+ * @lf: loop filter parameters. See &v4l2_vp8_loop_filter for more details
+ * @quant: quantization parameters. See &v4l2_vp8_quantization for more details
+ * @probs: probabilities. See &v4l2_vp9_probabilities for more details
+ * @width: frame width.
+ * @height: frame height.
+ * @horizontal_scale: horizontal scaling factor.
+ * @vertical_scale: vertical scaling factor.
+ * @version: bitstream version.
+ * @prob_skip_false: frame header syntax element.
+ * @prob_intra: frame header syntax element.
+ * @prob_last: frame header syntax element.
+ * @prob_gf: frame header syntax element.
+ * @num_dct_parts: number of DCT coefficients partitions.
+ * @first_part_size: size of the first partition, i.e. the control partition.
+ * @first_part_header_bits: size in bits of the first partition header portion.
+ * @dct_part_sizes: DCT coefficients sizes.
+ * @last_frame_ts: "last" reference buffer timestamp.
+ * The timestamp refers to the timestamp field in struct v4l2_buffer.
+ * Use v4l2_timeval_to_ns() to convert the struct timeval to a __u64.
+ * @golden_frame_ts: "golden" reference buffer timestamp.
+ * @alt_frame_ts: "alt" reference buffer timestamp.
+ * @flags: see V4L2_VP8_FRAME_FLAG_{}.
+ */
+struct v4l2_ctrl_vp8_frame {
+	struct v4l2_vp8_segment segment;
+	struct v4l2_vp8_loop_filter lf;
+	struct v4l2_vp8_quantization quant;
+	struct v4l2_vp8_entropy entropy;
+	struct v4l2_vp8_entropy_coder_state coder_state;
+
+	__u16 width;
+	__u16 height;
+
+	__u8 horizontal_scale;
+	__u8 vertical_scale;
+
+	__u8 version;
+	__u8 prob_skip_false;
+	__u8 prob_intra;
+	__u8 prob_last;
+	__u8 prob_gf;
+	__u8 num_dct_parts;
+
+	__u32 first_part_size;
+	__u32 first_part_header_bits;
+	__u32 dct_part_sizes[8];
+
+	__u64 last_frame_ts;
+	__u64 golden_frame_ts;
+	__u64 alt_frame_ts;
+
+	__u64 flags;
+};
+
 /* MPEG-compression definitions kept for backwards compatibility */
 #ifndef __KERNEL__
 #define V4L2_CTRL_CLASS_MPEG            V4L2_CTRL_CLASS_CODEC
diff --git a/include/uapi/linux/videodev2.h b/include/uapi/linux/videodev2.h
index 611b75df7f17..ec2af5ddf7d7 100644
--- a/include/uapi/linux/videodev2.h
+++ b/include/uapi/linux/videodev2.h
@@ -1738,6 +1738,7 @@ struct v4l2_ext_control {
 		struct v4l2_ctrl_h264_slice_params __user *p_h264_slice_params;
 		struct v4l2_ctrl_h264_decode_params __user *p_h264_decode_params;
 		struct v4l2_ctrl_fwht_params __user *p_fwht_params;
+		struct v4l2_ctrl_vp8_frame __user *p_vp8_frame;
 		void __user *ptr;
 	};
 } __attribute__ ((packed));
-- 
cgit v1.2.3-71-gd317


From e6a7d7c342cbd2ead32af36d3cfad51eb0a66d37 Mon Sep 17 00:00:00 2001
From: Ezequiel Garcia <ezequiel@collabora.com>
Date: Sat, 6 Mar 2021 20:07:49 +0100
Subject: media: uapi: vp8: Fix kernel-doc warnings

Fix following warnings:

./scripts/kernel-doc --none include/uapi/linux/v4l2-controls.h

        include/uapi/linux/v4l2-controls.h:1727: warning: bad line:
        include/uapi/linux/v4l2-controls.h:1853: warning: expecting prototype for struct v4l2_vp8_frame. Prototype was for struct v4l2_ctrl_vp8_frame instead
        include/uapi/linux/v4l2-controls.h:1853: warning: Function parameter or member 'segment' not described in 'v4l2_ctrl_vp8_frame'
        include/uapi/linux/v4l2-controls.h:1853: warning: Function parameter or member 'entropy' not described in 'v4l2_ctrl_vp8_frame'
        include/uapi/linux/v4l2-controls.h:1853: warning: Function parameter or member 'coder_state' not described in 'v4l2_ctrl_vp8_frame'

Reported-by: Mauro Carvalho Chehab <mchehab+huawei@kernel.org>
Signed-off-by: Ezequiel Garcia <ezequiel@collabora.com>
Signed-off-by: Hans Verkuil <hverkuil-cisco@xs4all.nl>
Signed-off-by: Mauro Carvalho Chehab <mchehab+huawei@kernel.org>
---
 include/uapi/linux/v4l2-controls.h | 9 +++++----
 1 file changed, 5 insertions(+), 4 deletions(-)

(limited to 'include/uapi/linux')

diff --git a/include/uapi/linux/v4l2-controls.h b/include/uapi/linux/v4l2-controls.h
index 49c7761aae82..f3376aafea65 100644
--- a/include/uapi/linux/v4l2-controls.h
+++ b/include/uapi/linux/v4l2-controls.h
@@ -1728,7 +1728,7 @@ struct v4l2_vp8_loop_filter {
  * @uv_dc_delta: chroma DC delta value.
  * @uv_ac_delta: chroma AC delta value.
  * @padding: padding field. Should be zeroed by applications.
-
+ *
  * This structure contains the quantization indices present
  * in 'quant_indices()' part of the frame header syntax.
  * See section '9.6. Dequantization Indices' of the VP8 specification
@@ -1799,12 +1799,13 @@ struct v4l2_vp8_entropy_coder_state {
 
 #define V4L2_CID_STATELESS_VP8_FRAME (V4L2_CID_CODEC_STATELESS_BASE + 200)
 /**
- * struct v4l2_vp8_frame - VP8 frame parameters
+ * struct v4l2_ctrl_vp8_frame - VP8 frame parameters
  *
- * @seg: segmentation parameters. See &v4l2_vp8_segment for more details
+ * @segment: segmentation parameters. See &v4l2_vp8_segment for more details
  * @lf: loop filter parameters. See &v4l2_vp8_loop_filter for more details
  * @quant: quantization parameters. See &v4l2_vp8_quantization for more details
- * @probs: probabilities. See &v4l2_vp9_probabilities for more details
+ * @entropy: update probabilities. See &v4l2_vp8_entropy for more details
+ * @coder_state: boolean coder state. See &v4l2_vp8_entropy_coder_state for more details
  * @width: frame width.
  * @height: frame height.
  * @horizontal_scale: horizontal scaling factor.
-- 
cgit v1.2.3-71-gd317


From f12b81e47f48940a6ec82ff308a7d97cd2307442 Mon Sep 17 00:00:00 2001
From: Hans Verkuil <hverkuil-cisco@xs4all.nl>
Date: Tue, 9 Mar 2021 12:48:20 +0100
Subject: media: core headers: fix kernel-doc warnings

This patch fixes the following kernel-doc warnings:

include/uapi/linux/videodev2.h:996: warning: Function parameter or member 'm' not described in 'v4l2_plane'
include/uapi/linux/videodev2.h:996: warning: Function parameter or member 'reserved' not described in 'v4l2_plane'
include/uapi/linux/videodev2.h:1057: warning: Function parameter or member 'm' not described in 'v4l2_buffer'
include/uapi/linux/videodev2.h:1057: warning: Function parameter or member 'reserved2' not described in 'v4l2_buffer'
include/uapi/linux/videodev2.h:1057: warning: Function parameter or member 'reserved' not described in 'v4l2_buffer'
include/uapi/linux/videodev2.h:1068: warning: Function parameter or member 'tv' not described in 'v4l2_timeval_to_ns'
include/uapi/linux/videodev2.h:1068: warning: Excess function parameter 'ts' description in 'v4l2_timeval_to_ns'
include/uapi/linux/videodev2.h:1138: warning: Function parameter or member 'reserved' not described in 'v4l2_exportbuffer'
include/uapi/linux/videodev2.h:2237: warning: Function parameter or member 'reserved' not described in 'v4l2_plane_pix_format'
include/uapi/linux/videodev2.h:2270: warning: Function parameter or member 'hsv_enc' not described in 'v4l2_pix_format_mplane'
include/uapi/linux/videodev2.h:2270: warning: Function parameter or member 'reserved' not described in 'v4l2_pix_format_mplane'
include/uapi/linux/videodev2.h:2281: warning: Function parameter or member 'reserved' not described in 'v4l2_sdr_format'
include/uapi/linux/videodev2.h:2315: warning: Function parameter or member 'fmt' not described in 'v4l2_format'

include/uapi/linux/v4l2-subdev.h:53: warning: Function parameter or member 'reserved' not described in 'v4l2_subdev_format'
include/uapi/linux/v4l2-subdev.h:66: warning: Function parameter or member 'reserved' not described in 'v4l2_subdev_crop'
include/uapi/linux/v4l2-subdev.h:89: warning: Function parameter or member 'reserved' not described in 'v4l2_subdev_mbus_code_enum'
include/uapi/linux/v4l2-subdev.h:108: warning: Function parameter or member 'min_width' not described in 'v4l2_subdev_frame_size_enum'
include/uapi/linux/v4l2-subdev.h:108: warning: Function parameter or member 'max_width' not described in 'v4l2_subdev_frame_size_enum'
include/uapi/linux/v4l2-subdev.h:108: warning: Function parameter or member 'min_height' not described in 'v4l2_subdev_frame_size_enum'
include/uapi/linux/v4l2-subdev.h:108: warning: Function parameter or member 'max_height' not described in 'v4l2_subdev_frame_size_enum'
include/uapi/linux/v4l2-subdev.h:108: warning: Function parameter or member 'reserved' not described in 'v4l2_subdev_frame_size_enum'
include/uapi/linux/v4l2-subdev.h:119: warning: Function parameter or member 'reserved' not described in 'v4l2_subdev_frame_interval'
include/uapi/linux/v4l2-subdev.h:140: warning: Function parameter or member 'reserved' not described in 'v4l2_subdev_frame_interval_enum'

include/uapi/linux/cec.h:406: warning: Function parameter or member 'raw' not described in 'cec_connector_info'
include/uapi/linux/cec.h:470: warning: Function parameter or member 'flags' not described in 'cec_event'

include/media/v4l2-h264.h:82: warning: Function parameter or member 'reflist' not described in 'v4l2_h264_build_p_ref_list'
include/media/v4l2-h264.h:82: warning: expecting prototype for v4l2_h264_build_b_ref_lists(). Prototype was for v4l2_h264_build_p_ref_list()
instead

include/media/cec.h:50: warning: Function parameter or member 'lock' not described in 'cec_devnode'

include/media/v4l2-jpeg.h:122: warning: Function parameter or member 'num_dht' not described in 'v4l2_jpeg_header'
include/media/v4l2-jpeg.h:122: warning: Function parameter or member 'num_dqt' not described in 'v4l2_jpeg_header'

Signed-off-by: Hans Verkuil <hverkuil-cisco@xs4all.nl>
Signed-off-by: Mauro Carvalho Chehab <mchehab+huawei@kernel.org>
---
 include/media/cec.h              |  2 +-
 include/media/v4l2-h264.h        |  6 +++---
 include/media/v4l2-jpeg.h        |  2 ++
 include/uapi/linux/cec.h         |  3 ++-
 include/uapi/linux/v4l2-subdev.h | 12 +++++++++++-
 include/uapi/linux/videodev2.h   | 15 ++++++++++++++-
 6 files changed, 33 insertions(+), 7 deletions(-)

(limited to 'include/uapi/linux')

diff --git a/include/media/cec.h b/include/media/cec.h
index cd35ae6b7560..208c9613c07e 100644
--- a/include/media/cec.h
+++ b/include/media/cec.h
@@ -28,8 +28,8 @@
  * @minor:	device node minor number
  * @registered:	the device was correctly registered
  * @unregistered: the device was unregistered
- * @fhs_lock:	lock to control access to the filehandle list
  * @fhs:	the list of open filehandles (cec_fh)
+ * @lock:	lock to control access to this structure
  *
  * This structure represents a cec-related device node.
  *
diff --git a/include/media/v4l2-h264.h b/include/media/v4l2-h264.h
index d2314f4d4490..4b1c71c935e0 100644
--- a/include/media/v4l2-h264.h
+++ b/include/media/v4l2-h264.h
@@ -66,11 +66,11 @@ v4l2_h264_build_b_ref_lists(const struct v4l2_h264_reflist_builder *builder,
 			    u8 *b0_reflist, u8 *b1_reflist);
 
 /**
- * v4l2_h264_build_b_ref_lists() - Build the P reference list
+ * v4l2_h264_build_p_ref_list() - Build the P reference list
  *
  * @builder: reference list builder context
- * @p_reflist: 16-bytes array used to store the P reference list. Each entry
- *	       is an index in the DPB
+ * @reflist: 16-bytes array used to store the P reference list. Each entry
+ *	     is an index in the DPB
  *
  * This functions builds the P reference lists. This procedure is describe in
  * section '8.2.4 Decoding process for reference picture lists construction'
diff --git a/include/media/v4l2-jpeg.h b/include/media/v4l2-jpeg.h
index ddba2a56c321..3a3344a97678 100644
--- a/include/media/v4l2-jpeg.h
+++ b/include/media/v4l2-jpeg.h
@@ -91,7 +91,9 @@ struct v4l2_jpeg_scan_header {
  * struct v4l2_jpeg_header - parsed JPEG header
  * @sof: pointer to frame header and size
  * @sos: pointer to scan header and size
+ * @num_dht: number of entries in @dht
  * @dht: pointers to huffman tables and sizes
+ * @num_dqt: number of entries in @dqt
  * @dqt: pointers to quantization tables and sizes
  * @frame: parsed frame header
  * @scan: pointer to parsed scan header, optional
diff --git a/include/uapi/linux/cec.h b/include/uapi/linux/cec.h
index 7d1a06c52469..dc8879d179fd 100644
--- a/include/uapi/linux/cec.h
+++ b/include/uapi/linux/cec.h
@@ -396,6 +396,7 @@ struct cec_drm_connector_info {
  * associated with the CEC adapter.
  * @type: connector type (if any)
  * @drm: drm connector info
+ * @raw: array to pad the union
  */
 struct cec_connector_info {
 	__u32 type;
@@ -453,7 +454,7 @@ struct cec_event_lost_msgs {
  * struct cec_event - CEC event structure
  * @ts: the timestamp of when the event was sent.
  * @event: the event.
- * array.
+ * @flags: event flags.
  * @state_change: the event payload for CEC_EVENT_STATE_CHANGE.
  * @lost_msgs: the event payload for CEC_EVENT_LOST_MSGS.
  * @raw: array to pad the union.
diff --git a/include/uapi/linux/v4l2-subdev.h b/include/uapi/linux/v4l2-subdev.h
index a38454d9e0f5..658106f5b5dc 100644
--- a/include/uapi/linux/v4l2-subdev.h
+++ b/include/uapi/linux/v4l2-subdev.h
@@ -44,6 +44,7 @@ enum v4l2_subdev_format_whence {
  * @which: format type (from enum v4l2_subdev_format_whence)
  * @pad: pad number, as reported by the media API
  * @format: media bus format (format code and frame size)
+ * @reserved: drivers and applications must zero this array
  */
 struct v4l2_subdev_format {
 	__u32 which;
@@ -57,6 +58,7 @@ struct v4l2_subdev_format {
  * @which: format type (from enum v4l2_subdev_format_whence)
  * @pad: pad number, as reported by the media API
  * @rect: pad crop rectangle boundaries
+ * @reserved: drivers and applications must zero this array
  */
 struct v4l2_subdev_crop {
 	__u32 which;
@@ -78,6 +80,7 @@ struct v4l2_subdev_crop {
  * @code: format code (MEDIA_BUS_FMT_ definitions)
  * @which: format type (from enum v4l2_subdev_format_whence)
  * @flags: flags set by the driver, (V4L2_SUBDEV_MBUS_CODE_*)
+ * @reserved: drivers and applications must zero this array
  */
 struct v4l2_subdev_mbus_code_enum {
 	__u32 pad;
@@ -90,10 +93,15 @@ struct v4l2_subdev_mbus_code_enum {
 
 /**
  * struct v4l2_subdev_frame_size_enum - Media bus format enumeration
- * @pad: pad number, as reported by the media API
  * @index: format index during enumeration
+ * @pad: pad number, as reported by the media API
  * @code: format code (MEDIA_BUS_FMT_ definitions)
+ * @min_width: minimum frame width, in pixels
+ * @max_width: maximum frame width, in pixels
+ * @min_height: minimum frame height, in pixels
+ * @max_height: maximum frame height, in pixels
  * @which: format type (from enum v4l2_subdev_format_whence)
+ * @reserved: drivers and applications must zero this array
  */
 struct v4l2_subdev_frame_size_enum {
 	__u32 index;
@@ -111,6 +119,7 @@ struct v4l2_subdev_frame_size_enum {
  * struct v4l2_subdev_frame_interval - Pad-level frame rate
  * @pad: pad number, as reported by the media API
  * @interval: frame interval in seconds
+ * @reserved: drivers and applications must zero this array
  */
 struct v4l2_subdev_frame_interval {
 	__u32 pad;
@@ -127,6 +136,7 @@ struct v4l2_subdev_frame_interval {
  * @height: frame height in pixels
  * @interval: frame interval in seconds
  * @which: format type (from enum v4l2_subdev_format_whence)
+ * @reserved: drivers and applications must zero this array
  */
 struct v4l2_subdev_frame_interval_enum {
 	__u32 index;
diff --git a/include/uapi/linux/videodev2.h b/include/uapi/linux/videodev2.h
index ec2af5ddf7d7..dc5dd89327d8 100644
--- a/include/uapi/linux/videodev2.h
+++ b/include/uapi/linux/videodev2.h
@@ -976,8 +976,10 @@ struct v4l2_requestbuffers {
  *			pointing to this plane
  * @fd:			when memory is V4L2_MEMORY_DMABUF, a userspace file
  *			descriptor associated with this plane
+ * @m:			union of @mem_offset, @userptr and @fd
  * @data_offset:	offset in the plane to the start of data; usually 0,
  *			unless there is a header in front of the data
+ * @reserved:		drivers and applications must zero this array
  *
  * Multi-planar buffers consist of one or more planes, e.g. an YCbCr buffer
  * with two planes can have one plane for Y, and another for interleaved CbCr
@@ -1019,10 +1021,14 @@ struct v4l2_plane {
  *		a userspace file descriptor associated with this buffer
  * @planes:	for multiplanar buffers; userspace pointer to the array of plane
  *		info structs for this buffer
+ * @m:		union of @offset, @userptr, @planes and @fd
  * @length:	size in bytes of the buffer (NOT its payload) for single-plane
  *		buffers (when type != *_MPLANE); number of elements in the
  *		planes array for multi-plane buffers
+ * @reserved2:	drivers and applications must zero this field
  * @request_fd: fd of the request that this buffer should use
+ * @reserved:	for backwards compatibility with applications that do not know
+ *		about @request_fd
  *
  * Contains data exchanged by application and driver using one of the Streaming
  * I/O methods.
@@ -1060,7 +1066,7 @@ struct v4l2_buffer {
 #ifndef __KERNEL__
 /**
  * v4l2_timeval_to_ns - Convert timeval to nanoseconds
- * @ts:		pointer to the timeval variable to be converted
+ * @tv:		pointer to the timeval variable to be converted
  *
  * Returns the scalar nanosecond representation of the timeval
  * parameter.
@@ -1121,6 +1127,7 @@ static inline __u64 v4l2_timeval_to_ns(const struct timeval *tv)
  * @flags:	flags for newly created file, currently only O_CLOEXEC is
  *		supported, refer to manual of open syscall for more details
  * @fd:		file descriptor associated with DMABUF (set by driver)
+ * @reserved:	drivers and applications must zero this array
  *
  * Contains data used for exporting a video buffer as DMABUF file descriptor.
  * The buffer is identified by a 'cookie' returned by VIDIOC_QUERYBUF
@@ -2233,6 +2240,7 @@ struct v4l2_mpeg_vbi_fmt_ivtv {
  *			this plane will be used
  * @bytesperline:	distance in bytes between the leftmost pixels in two
  *			adjacent lines
+ * @reserved:		drivers and applications must zero this array
  */
 struct v4l2_plane_pix_format {
 	__u32		sizeimage;
@@ -2251,8 +2259,10 @@ struct v4l2_plane_pix_format {
  * @num_planes:		number of planes for this format
  * @flags:		format flags (V4L2_PIX_FMT_FLAG_*)
  * @ycbcr_enc:		enum v4l2_ycbcr_encoding, Y'CbCr encoding
+ * @hsv_enc:		enum v4l2_hsv_encoding, HSV encoding
  * @quantization:	enum v4l2_quantization, colorspace quantization
  * @xfer_func:		enum v4l2_xfer_func, colorspace transfer function
+ * @reserved:		drivers and applications must zero this array
  */
 struct v4l2_pix_format_mplane {
 	__u32				width;
@@ -2277,6 +2287,7 @@ struct v4l2_pix_format_mplane {
  * struct v4l2_sdr_format - SDR format definition
  * @pixelformat:	little endian four character code (fourcc)
  * @buffersize:		maximum size in bytes required for data
+ * @reserved:		drivers and applications must zero this array
  */
 struct v4l2_sdr_format {
 	__u32				pixelformat;
@@ -2303,6 +2314,8 @@ struct v4l2_meta_format {
  * @vbi:	raw VBI capture or output parameters
  * @sliced:	sliced VBI capture or output parameters
  * @raw_data:	placeholder for future extensions and custom formats
+ * @fmt:	union of @pix, @pix_mp, @win, @vbi, @sliced, @sdr, @meta
+ *		and @raw_data
  */
 struct v4l2_format {
 	__u32	 type;
-- 
cgit v1.2.3-71-gd317


From c214e6dd5c9b4073151759502de374c2748cad64 Mon Sep 17 00:00:00 2001
From: Hans Verkuil <hverkuil-cisco@xs4all.nl>
Date: Tue, 9 Mar 2021 13:22:37 +0100
Subject: media: vpbe_osd.h/uvcvideo.h includes: fix trivial kernel-doc
 warnings

Fix these kernel-doc warnings:

include/media/davinci/vpbe_osd.h:77: warning: Enum value 'PIXFMT_YCBCRI' not described in enum 'osd_pix_format'
include/media/davinci/vpbe_osd.h:77: warning: Enum value 'PIXFMT_YCRCBI' not described in enum 'osd_pix_format'
include/media/davinci/vpbe_osd.h:77: warning: Excess enum value 'PIXFMT_YCrCbI' description in 'osd_pix_format'
include/media/davinci/vpbe_osd.h:77: warning: Excess enum value 'PIXFMT_YCbCrI' description in 'osd_pix_format'
include/media/davinci/vpbe_osd.h:232: warning: expecting prototype for enum davinci_cursor_v_width. Prototype was for enum
osd_cursor_v_width instead
include/uapi/linux/uvcvideo.h:98: warning: Function parameter or member 'ns' not described in 'uvc_meta_buf'
include/uapi/linux/uvcvideo.h:98: warning: Function parameter or member 'sof' not described in 'uvc_meta_buf'
include/uapi/linux/uvcvideo.h:98: warning: Function parameter or member 'length' not described in 'uvc_meta_buf'
include/uapi/linux/uvcvideo.h:98: warning: Function parameter or member 'flags' not described in 'uvc_meta_buf'
include/uapi/linux/uvcvideo.h:98: warning: Function parameter or member 'buf' not described in 'uvc_meta_buf'

Signed-off-by: Hans Verkuil <hverkuil-cisco@xs4all.nl>
Reviewed-by: Laurent Pinchart <laurent.pinchart@ideasonboard.com>
Signed-off-by: Mauro Carvalho Chehab <mchehab+huawei@kernel.org>
---
 include/media/davinci/vpbe_osd.h |  6 +++---
 include/uapi/linux/uvcvideo.h    | 10 +++++-----
 2 files changed, 8 insertions(+), 8 deletions(-)

(limited to 'include/uapi/linux')

diff --git a/include/media/davinci/vpbe_osd.h b/include/media/davinci/vpbe_osd.h
index e1b1c76aa50f..a4fc4f2a56fb 100644
--- a/include/media/davinci/vpbe_osd.h
+++ b/include/media/davinci/vpbe_osd.h
@@ -54,9 +54,9 @@ enum osd_win_layer {
  * @PIXFMT_4BPP: 4-bits-per-pixel bitmap
  * @PIXFMT_8BPP: 8-bits-per-pixel bitmap
  * @PIXFMT_RGB565: 16-bits-per-pixel RGB565
- * @PIXFMT_YCbCrI: YUV 4:2:2
+ * @PIXFMT_YCBCRI: YUV 4:2:2
  * @PIXFMT_RGB888: 24-bits-per-pixel RGB888
- * @PIXFMT_YCrCbI: YUV 4:2:2 with chroma swap
+ * @PIXFMT_YCRCBI: YUV 4:2:2 with chroma swap
  * @PIXFMT_NV12: YUV 4:2:0 planar
  * @PIXFMT_OSD_ATTR: OSD Attribute Window pixel format (4bpp)
  *
@@ -210,7 +210,7 @@ enum osd_cursor_h_width {
 };
 
 /**
- * enum davinci_cursor_v_width
+ * enum osd_cursor_v_width
  * @V_WIDTH_1: vertical line width is 1 line
  * @V_WIDTH_2: vertical line width is 2 lines
  * @V_WIDTH_4: vertical line width is 4 lines
diff --git a/include/uapi/linux/uvcvideo.h b/include/uapi/linux/uvcvideo.h
index f80f05b3c423..8288137387c0 100644
--- a/include/uapi/linux/uvcvideo.h
+++ b/include/uapi/linux/uvcvideo.h
@@ -76,11 +76,11 @@ struct uvc_xu_control_query {
 
 /**
  * struct uvc_meta_buf - metadata buffer building block
- * @ns		- system timestamp of the payload in nanoseconds
- * @sof		- USB Frame Number
- * @length	- length of the payload header
- * @flags	- payload header flags
- * @buf		- optional device-specific header data
+ * @ns: system timestamp of the payload in nanoseconds
+ * @sof: USB Frame Number
+ * @length: length of the payload header
+ * @flags: payload header flags
+ * @buf: optional device-specific header data
  *
  * UVC metadata nodes fill buffers with possibly multiple instances of this
  * struct. The first two fields are added by the driver, they can be used for
-- 
cgit v1.2.3-71-gd317


From 0376a51fbe5e11a59d6a5c55e57cc201da06dbe0 Mon Sep 17 00:00:00 2001
From: Mirela Rabulea <mirela.rabulea@nxp.com>
Date: Thu, 11 Mar 2021 01:28:47 +0100
Subject: media: v4l: Add packed YUV444 24bpp pixel format

The added format is V4L2_PIX_FMT_YUV24, this is a packed
YUV 4:4:4 format, with 8 bits for each component, 24 bits
per sample.

This format is used by the i.MX 8QuadMax and i.MX 8DualXPlus/8QuadXPlus
JPEG encoder/decoder.

Signed-off-by: Mirela Rabulea <mirela.rabulea@nxp.com>
Reviewed-by: Paul Kocialkowski <paul.kocialkowski@bootlin.com>
Signed-off-by: Hans Verkuil <hverkuil-cisco@xs4all.nl>
Signed-off-by: Mauro Carvalho Chehab <mchehab+huawei@kernel.org>
---
 Documentation/userspace-api/media/v4l/pixfmt-packed-yuv.rst | 10 ++++++++++
 drivers/media/v4l2-core/v4l2-ioctl.c                        |  1 +
 include/uapi/linux/videodev2.h                              |  1 +
 3 files changed, 12 insertions(+)

(limited to 'include/uapi/linux')

diff --git a/Documentation/userspace-api/media/v4l/pixfmt-packed-yuv.rst b/Documentation/userspace-api/media/v4l/pixfmt-packed-yuv.rst
index 560fd2ead8ca..65520c3af7cf 100644
--- a/Documentation/userspace-api/media/v4l/pixfmt-packed-yuv.rst
+++ b/Documentation/userspace-api/media/v4l/pixfmt-packed-yuv.rst
@@ -220,6 +220,16 @@ the second byte and Y'\ :sub:`7-0` in the third byte.
       - Y'\ :sub:`7-0`
       - X\ :sub:`7-0`
 
+    * .. _V4L2-PIX-FMT-YUV24:
+
+      - ``V4L2_PIX_FMT_YUV24``
+      - 'YUV3'
+
+      - Y'\ :sub:`7-0`
+      - Cb\ :sub:`7-0`
+      - Cr\ :sub:`7-0`
+      - -\
+
 .. note::
 
     - The alpha component is expected to contain a meaningful value that can be
diff --git a/drivers/media/v4l2-core/v4l2-ioctl.c b/drivers/media/v4l2-core/v4l2-ioctl.c
index 31d1342e61e8..9a160283b3ae 100644
--- a/drivers/media/v4l2-core/v4l2-ioctl.c
+++ b/drivers/media/v4l2-core/v4l2-ioctl.c
@@ -1304,6 +1304,7 @@ static void v4l_fill_fmtdesc(struct v4l2_fmtdesc *fmt)
 	case V4L2_PIX_FMT_YUV444:	descr = "16-bit A/XYUV 4-4-4-4"; break;
 	case V4L2_PIX_FMT_YUV555:	descr = "16-bit A/XYUV 1-5-5-5"; break;
 	case V4L2_PIX_FMT_YUV565:	descr = "16-bit YUV 5-6-5"; break;
+	case V4L2_PIX_FMT_YUV24:	descr = "24-bit YUV 4:4:4 8-8-8"; break;
 	case V4L2_PIX_FMT_YUV32:	descr = "32-bit A/XYUV 8-8-8-8"; break;
 	case V4L2_PIX_FMT_AYUV32:	descr = "32-bit AYUV 8-8-8-8"; break;
 	case V4L2_PIX_FMT_XYUV32:	descr = "32-bit XYUV 8-8-8-8"; break;
diff --git a/include/uapi/linux/videodev2.h b/include/uapi/linux/videodev2.h
index dc5dd89327d8..8d15f6ccc4b4 100644
--- a/include/uapi/linux/videodev2.h
+++ b/include/uapi/linux/videodev2.h
@@ -586,6 +586,7 @@ struct v4l2_pix_format {
 #define V4L2_PIX_FMT_YUV444  v4l2_fourcc('Y', '4', '4', '4') /* 16  xxxxyyyy uuuuvvvv */
 #define V4L2_PIX_FMT_YUV555  v4l2_fourcc('Y', 'U', 'V', 'O') /* 16  YUV-5-5-5     */
 #define V4L2_PIX_FMT_YUV565  v4l2_fourcc('Y', 'U', 'V', 'P') /* 16  YUV-5-6-5     */
+#define V4L2_PIX_FMT_YUV24   v4l2_fourcc('Y', 'U', 'V', '3') /* 24  YUV-8-8-8     */
 #define V4L2_PIX_FMT_YUV32   v4l2_fourcc('Y', 'U', 'V', '4') /* 32  YUV-8-8-8-8   */
 #define V4L2_PIX_FMT_AYUV32  v4l2_fourcc('A', 'Y', 'U', 'V') /* 32  AYUV-8-8-8-8  */
 #define V4L2_PIX_FMT_XYUV32  v4l2_fourcc('X', 'Y', 'U', 'V') /* 32  XYUV-8-8-8-8  */
-- 
cgit v1.2.3-71-gd317


From 98291d6d8c2f5864b7dfc1950ece4a7563597f13 Mon Sep 17 00:00:00 2001
From: Eric Huang <jinhuieric.huang@amd.com>
Date: Tue, 12 May 2020 15:25:38 -0400
Subject: drm/amdkfd: add new flag for uncached GPU mapping

The macro is for memory mapped by GPU as uncached.

Signed-off-by: Eric Huang <jinhuieric.huang@amd.com>
Reviewed-by: Oak Zeng <Oak.Zeng@amd.com>
Signed-off-by: Alex Deucher <alexander.deucher@amd.com>
---
 include/uapi/linux/kfd_ioctl.h | 1 +
 1 file changed, 1 insertion(+)

(limited to 'include/uapi/linux')

diff --git a/include/uapi/linux/kfd_ioctl.h b/include/uapi/linux/kfd_ioctl.h
index 695b606da4b1..18449f746097 100644
--- a/include/uapi/linux/kfd_ioctl.h
+++ b/include/uapi/linux/kfd_ioctl.h
@@ -351,6 +351,7 @@ struct kfd_ioctl_acquire_vm_args {
 #define KFD_IOC_ALLOC_MEM_FLAGS_NO_SUBSTITUTE	(1 << 28)
 #define KFD_IOC_ALLOC_MEM_FLAGS_AQL_QUEUE_MEM	(1 << 27)
 #define KFD_IOC_ALLOC_MEM_FLAGS_COHERENT	(1 << 26)
+#define KFD_IOC_ALLOC_MEM_FLAGS_UNCACHED	(1 << 25)
 
 /* Allocate memory for later SVM (shared virtual memory) mapping.
  *
-- 
cgit v1.2.3-71-gd317


From 8c44390d8872ebf3a28558b59a0074df39b3da8f Mon Sep 17 00:00:00 2001
From: Felix Kuehling <Felix.Kuehling@amd.com>
Date: Tue, 9 Mar 2021 15:03:28 -0500
Subject: drm/amdkfd: Bump KFD API version

Indicate the availability reliable SRAM EDC state in the new bit
in the device properties.

Proposed userspace changes:
https://github.com/RadeonOpenCompute/ROCT-Thunk-Interface/commit/7cdd63475c36bb9f49bb960f90f9a8cdb7e80a21

Signed-off-by: Felix Kuehling <Felix.Kuehling@amd.com>
Reviewed-by: Kent Russell <kent.russell@amd.com>
Signed-off-by: Alex Deucher <alexander.deucher@amd.com>
---
 include/uapi/linux/kfd_ioctl.h | 3 ++-
 1 file changed, 2 insertions(+), 1 deletion(-)

(limited to 'include/uapi/linux')

diff --git a/include/uapi/linux/kfd_ioctl.h b/include/uapi/linux/kfd_ioctl.h
index 18449f746097..bf5e7d7846dd 100644
--- a/include/uapi/linux/kfd_ioctl.h
+++ b/include/uapi/linux/kfd_ioctl.h
@@ -29,9 +29,10 @@
 /*
  * - 1.1 - initial version
  * - 1.3 - Add SMI events support
+ * - 1.4 - Indicate new SRAM EDC bit in device properties
  */
 #define KFD_IOCTL_MAJOR_VERSION 1
-#define KFD_IOCTL_MINOR_VERSION 3
+#define KFD_IOCTL_MINOR_VERSION 4
 
 struct kfd_ioctl_get_version_args {
 	__u32 major_version;	/* from KFD */
-- 
cgit v1.2.3-71-gd317


From 25651f2df7833c5554f51652007862a405aac830 Mon Sep 17 00:00:00 2001
From: Geert Uytterhoeven <geert@linux-m68k.org>
Date: Mon, 22 Mar 2021 15:17:48 +0100
Subject: uapi: map_to_7segment: Remove licence boilerplate

Remove the license boilerplate (containing an obsolete address), because
we now have the SPDX header.

Signed-off-by: Geert Uytterhoeven <geert@linux-m68k.org>
Link: https://lore.kernel.org/r/20210322141748.1062733-1-geert@linux-m68k.org
Signed-off-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
---
 include/uapi/linux/map_to_7segment.h | 14 --------------
 1 file changed, 14 deletions(-)

(limited to 'include/uapi/linux')

diff --git a/include/uapi/linux/map_to_7segment.h b/include/uapi/linux/map_to_7segment.h
index 8b02088f96e3..04c8b55812e7 100644
--- a/include/uapi/linux/map_to_7segment.h
+++ b/include/uapi/linux/map_to_7segment.h
@@ -1,20 +1,6 @@
 /* SPDX-License-Identifier: GPL-2.0+ WITH Linux-syscall-note */
 /*
  * Copyright (c) 2005 Henk Vergonet <Henk.Vergonet@gmail.com>
- *
- * This program is free software; you can redistribute it and/or
- * modify it under the terms of the GNU General Public License as
- * published by the Free Software Foundation; either version 2 of
- * the License, or (at your option) any later version.
- *
- * This program is distributed in the hope that it will be useful,
- * but WITHOUT ANY WARRANTY; without even the implied warranty of
- * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
- * GNU General Public License for more details.
- *
- * You should have received a copy of the GNU General Public License
- * along with this program; if not, write to the Free Software
- * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
  */
 
 #ifndef MAP_TO_7SEGMENT_H
-- 
cgit v1.2.3-71-gd317


From 432ff1e91694e4c55a5bf6bc0574f4c254970232 Mon Sep 17 00:00:00 2001
From: Marco Ballesio <balejs@google.com>
Date: Mon, 15 Mar 2021 18:16:28 -0700
Subject: binder: BINDER_FREEZE ioctl

Frozen tasks can't process binder transactions, so a way is required to
inform transmitting ends of communication failures due to the frozen
state of their receiving counterparts. Additionally, races are possible
between transitions to frozen state and binder transactions enqueued to
a specific process.

Implement BINDER_FREEZE ioctl for user space to inform the binder driver
about the intention to freeze or unfreeze a process. When the ioctl is
called, block the caller until any pending binder transactions toward
the target process are flushed. Return an error to transactions to
processes marked as frozen.

Co-developed-by: Todd Kjos <tkjos@google.com>
Acked-by: Todd Kjos <tkjos@google.com>
Signed-off-by: Marco Ballesio <balejs@google.com>
Signed-off-by: Todd Kjos <tkjos@google.com>
Signed-off-by: Li Li <dualli@google.com>
Link: https://lore.kernel.org/r/20210316011630.1121213-2-dualli@chromium.org
Signed-off-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
---
 drivers/android/binder.c            | 139 +++++++++++++++++++++++++++++++++---
 drivers/android/binder_internal.h   |  12 ++++
 include/uapi/linux/android/binder.h |  13 ++++
 3 files changed, 154 insertions(+), 10 deletions(-)

(limited to 'include/uapi/linux')

diff --git a/drivers/android/binder.c b/drivers/android/binder.c
index c119736ca56a..b93ca53bb90f 100644
--- a/drivers/android/binder.c
+++ b/drivers/android/binder.c
@@ -1506,6 +1506,12 @@ static void binder_free_transaction(struct binder_transaction *t)
 
 	if (target_proc) {
 		binder_inner_proc_lock(target_proc);
+		target_proc->outstanding_txns--;
+		if (target_proc->outstanding_txns < 0)
+			pr_warn("%s: Unexpected outstanding_txns %d\n",
+				__func__, target_proc->outstanding_txns);
+		if (!target_proc->outstanding_txns && target_proc->is_frozen)
+			wake_up_interruptible_all(&target_proc->freeze_wait);
 		if (t->buffer)
 			t->buffer->transaction = NULL;
 		binder_inner_proc_unlock(target_proc);
@@ -2331,10 +2337,11 @@ static int binder_fixup_parent(struct binder_transaction *t,
  * If the @thread parameter is not NULL, the transaction is always queued
  * to the waitlist of that specific thread.
  *
- * Return:	true if the transactions was successfully queued
- *		false if the target process or thread is dead
+ * Return:	0 if the transaction was successfully queued
+ *		BR_DEAD_REPLY if the target process or thread is dead
+ *		BR_FROZEN_REPLY if the target process or thread is frozen
  */
-static bool binder_proc_transaction(struct binder_transaction *t,
+static int binder_proc_transaction(struct binder_transaction *t,
 				    struct binder_proc *proc,
 				    struct binder_thread *thread)
 {
@@ -2354,10 +2361,11 @@ static bool binder_proc_transaction(struct binder_transaction *t,
 
 	binder_inner_proc_lock(proc);
 
-	if (proc->is_dead || (thread && thread->is_dead)) {
+	if ((proc->is_frozen && !oneway) || proc->is_dead ||
+			(thread && thread->is_dead)) {
 		binder_inner_proc_unlock(proc);
 		binder_node_unlock(node);
-		return false;
+		return proc->is_frozen ? BR_FROZEN_REPLY : BR_DEAD_REPLY;
 	}
 
 	if (!thread && !pending_async)
@@ -2373,10 +2381,11 @@ static bool binder_proc_transaction(struct binder_transaction *t,
 	if (!pending_async)
 		binder_wakeup_thread_ilocked(proc, thread, !oneway /* sync */);
 
+	proc->outstanding_txns++;
 	binder_inner_proc_unlock(proc);
 	binder_node_unlock(node);
 
-	return true;
+	return 0;
 }
 
 /**
@@ -3013,13 +3022,16 @@ static void binder_transaction(struct binder_proc *proc,
 	if (reply) {
 		binder_enqueue_thread_work(thread, tcomplete);
 		binder_inner_proc_lock(target_proc);
-		if (target_thread->is_dead) {
+		if (target_thread->is_dead || target_proc->is_frozen) {
+			return_error = target_thread->is_dead ?
+				BR_DEAD_REPLY : BR_FROZEN_REPLY;
 			binder_inner_proc_unlock(target_proc);
 			goto err_dead_proc_or_thread;
 		}
 		BUG_ON(t->buffer->async_transaction != 0);
 		binder_pop_transaction_ilocked(target_thread, in_reply_to);
 		binder_enqueue_thread_work_ilocked(target_thread, &t->work);
+		target_proc->outstanding_txns++;
 		binder_inner_proc_unlock(target_proc);
 		wake_up_interruptible_sync(&target_thread->wait);
 		binder_free_transaction(in_reply_to);
@@ -3038,7 +3050,9 @@ static void binder_transaction(struct binder_proc *proc,
 		t->from_parent = thread->transaction_stack;
 		thread->transaction_stack = t;
 		binder_inner_proc_unlock(proc);
-		if (!binder_proc_transaction(t, target_proc, target_thread)) {
+		return_error = binder_proc_transaction(t,
+				target_proc, target_thread);
+		if (return_error) {
 			binder_inner_proc_lock(proc);
 			binder_pop_transaction_ilocked(thread, t);
 			binder_inner_proc_unlock(proc);
@@ -3048,7 +3062,8 @@ static void binder_transaction(struct binder_proc *proc,
 		BUG_ON(target_node == NULL);
 		BUG_ON(t->buffer->async_transaction != 1);
 		binder_enqueue_thread_work(thread, tcomplete);
-		if (!binder_proc_transaction(t, target_proc, NULL))
+		return_error = binder_proc_transaction(t, target_proc, NULL);
+		if (return_error)
 			goto err_dead_proc_or_thread;
 	}
 	if (target_thread)
@@ -3065,7 +3080,6 @@ static void binder_transaction(struct binder_proc *proc,
 	return;
 
 err_dead_proc_or_thread:
-	return_error = BR_DEAD_REPLY;
 	return_error_line = __LINE__;
 	binder_dequeue_work(proc, tcomplete);
 err_translate_failed:
@@ -4298,6 +4312,9 @@ static void binder_free_proc(struct binder_proc *proc)
 
 	BUG_ON(!list_empty(&proc->todo));
 	BUG_ON(!list_empty(&proc->delivered_death));
+	if (proc->outstanding_txns)
+		pr_warn("%s: Unexpected outstanding_txns %d\n",
+			__func__, proc->outstanding_txns);
 	device = container_of(proc->context, struct binder_device, context);
 	if (refcount_dec_and_test(&device->ref)) {
 		kfree(proc->context->name);
@@ -4359,6 +4376,7 @@ static int binder_thread_release(struct binder_proc *proc,
 			     (t->to_thread == thread) ? "in" : "out");
 
 		if (t->to_thread == thread) {
+			thread->proc->outstanding_txns--;
 			t->to_proc = NULL;
 			t->to_thread = NULL;
 			if (t->buffer) {
@@ -4609,6 +4627,45 @@ static int binder_ioctl_get_node_debug_info(struct binder_proc *proc,
 	return 0;
 }
 
+static int binder_ioctl_freeze(struct binder_freeze_info *info,
+			       struct binder_proc *target_proc)
+{
+	int ret = 0;
+
+	if (!info->enable) {
+		binder_inner_proc_lock(target_proc);
+		target_proc->is_frozen = false;
+		binder_inner_proc_unlock(target_proc);
+		return 0;
+	}
+
+	/*
+	 * Freezing the target. Prevent new transactions by
+	 * setting frozen state. If timeout specified, wait
+	 * for transactions to drain.
+	 */
+	binder_inner_proc_lock(target_proc);
+	target_proc->is_frozen = true;
+	binder_inner_proc_unlock(target_proc);
+
+	if (info->timeout_ms > 0)
+		ret = wait_event_interruptible_timeout(
+			target_proc->freeze_wait,
+			(!target_proc->outstanding_txns),
+			msecs_to_jiffies(info->timeout_ms));
+
+	if (!ret && target_proc->outstanding_txns)
+		ret = -EAGAIN;
+
+	if (ret < 0) {
+		binder_inner_proc_lock(target_proc);
+		target_proc->is_frozen = false;
+		binder_inner_proc_unlock(target_proc);
+	}
+
+	return ret;
+}
+
 static long binder_ioctl(struct file *filp, unsigned int cmd, unsigned long arg)
 {
 	int ret;
@@ -4727,6 +4784,66 @@ static long binder_ioctl(struct file *filp, unsigned int cmd, unsigned long arg)
 		}
 		break;
 	}
+	case BINDER_FREEZE: {
+		struct binder_freeze_info info;
+		struct binder_proc **target_procs = NULL, *target_proc;
+		int target_procs_count = 0, i = 0;
+
+		ret = 0;
+
+		if (copy_from_user(&info, ubuf, sizeof(info))) {
+			ret = -EFAULT;
+			goto err;
+		}
+
+		mutex_lock(&binder_procs_lock);
+		hlist_for_each_entry(target_proc, &binder_procs, proc_node) {
+			if (target_proc->pid == info.pid)
+				target_procs_count++;
+		}
+
+		if (target_procs_count == 0) {
+			mutex_unlock(&binder_procs_lock);
+			ret = -EINVAL;
+			goto err;
+		}
+
+		target_procs = kcalloc(target_procs_count,
+				       sizeof(struct binder_proc *),
+				       GFP_KERNEL);
+
+		if (!target_procs) {
+			mutex_unlock(&binder_procs_lock);
+			ret = -ENOMEM;
+			goto err;
+		}
+
+		hlist_for_each_entry(target_proc, &binder_procs, proc_node) {
+			if (target_proc->pid != info.pid)
+				continue;
+
+			binder_inner_proc_lock(target_proc);
+			target_proc->tmp_ref++;
+			binder_inner_proc_unlock(target_proc);
+
+			target_procs[i++] = target_proc;
+		}
+		mutex_unlock(&binder_procs_lock);
+
+		for (i = 0; i < target_procs_count; i++) {
+			if (ret >= 0)
+				ret = binder_ioctl_freeze(&info,
+							  target_procs[i]);
+
+			binder_proc_dec_tmpref(target_procs[i]);
+		}
+
+		kfree(target_procs);
+
+		if (ret < 0)
+			goto err;
+		break;
+	}
 	default:
 		ret = -EINVAL;
 		goto err;
@@ -4823,6 +4940,7 @@ static int binder_open(struct inode *nodp, struct file *filp)
 	get_task_struct(current->group_leader);
 	proc->tsk = current->group_leader;
 	INIT_LIST_HEAD(&proc->todo);
+	init_waitqueue_head(&proc->freeze_wait);
 	proc->default_priority = task_nice(current);
 	/* binderfs stashes devices in i_private */
 	if (is_binderfs_device(nodp)) {
@@ -5035,6 +5153,7 @@ static void binder_deferred_release(struct binder_proc *proc)
 	proc->tmp_ref++;
 
 	proc->is_dead = true;
+	proc->is_frozen = false;
 	threads = 0;
 	active_transactions = 0;
 	while ((n = rb_first(&proc->threads))) {
diff --git a/drivers/android/binder_internal.h b/drivers/android/binder_internal.h
index 6cd79011e35d..e6a53e98c6da 100644
--- a/drivers/android/binder_internal.h
+++ b/drivers/android/binder_internal.h
@@ -367,9 +367,18 @@ struct binder_ref {
  *                        (protected by binder_deferred_lock)
  * @deferred_work:        bitmap of deferred work to perform
  *                        (protected by binder_deferred_lock)
+ * @outstanding_txns:     number of transactions to be transmitted before
+ *                        processes in freeze_wait are woken up
+ *                        (protected by @inner_lock)
  * @is_dead:              process is dead and awaiting free
  *                        when outstanding transactions are cleaned up
  *                        (protected by @inner_lock)
+ * @is_frozen:            process is frozen and unable to service
+ *                        binder transactions
+ *                        (protected by @inner_lock)
+ * @freeze_wait:          waitqueue of processes waiting for all outstanding
+ *                        transactions to be processed
+ *                        (protected by @inner_lock)
  * @todo:                 list of work for this process
  *                        (protected by @inner_lock)
  * @stats:                per-process binder statistics
@@ -410,7 +419,10 @@ struct binder_proc {
 	struct task_struct *tsk;
 	struct hlist_node deferred_work_node;
 	int deferred_work;
+	int outstanding_txns;
 	bool is_dead;
+	bool is_frozen;
+	wait_queue_head_t freeze_wait;
 
 	struct list_head todo;
 	struct binder_stats stats;
diff --git a/include/uapi/linux/android/binder.h b/include/uapi/linux/android/binder.h
index ec84ad106568..7eb5b818b3c1 100644
--- a/include/uapi/linux/android/binder.h
+++ b/include/uapi/linux/android/binder.h
@@ -217,6 +217,12 @@ struct binder_node_info_for_ref {
 	__u32            reserved3;
 };
 
+struct binder_freeze_info {
+	__u32            pid;
+	__u32            enable;
+	__u32            timeout_ms;
+};
+
 #define BINDER_WRITE_READ		_IOWR('b', 1, struct binder_write_read)
 #define BINDER_SET_IDLE_TIMEOUT		_IOW('b', 3, __s64)
 #define BINDER_SET_MAX_THREADS		_IOW('b', 5, __u32)
@@ -227,6 +233,7 @@ struct binder_node_info_for_ref {
 #define BINDER_GET_NODE_DEBUG_INFO	_IOWR('b', 11, struct binder_node_debug_info)
 #define BINDER_GET_NODE_INFO_FOR_REF	_IOWR('b', 12, struct binder_node_info_for_ref)
 #define BINDER_SET_CONTEXT_MGR_EXT	_IOW('b', 13, struct flat_binder_object)
+#define BINDER_FREEZE			_IOW('b', 14, struct binder_freeze_info)
 
 /*
  * NOTE: Two special error codes you should check for when calling
@@ -408,6 +415,12 @@ enum binder_driver_return_protocol {
 	 * The last transaction (either a bcTRANSACTION or
 	 * a bcATTEMPT_ACQUIRE) failed (e.g. out of memory).  No parameters.
 	 */
+
+	BR_FROZEN_REPLY = _IO('r', 18),
+	/*
+	 * The target of the last transaction (either a bcTRANSACTION or
+	 * a bcATTEMPT_ACQUIRE) is frozen.  No parameters.
+	 */
 };
 
 enum binder_driver_command_protocol {
-- 
cgit v1.2.3-71-gd317


From ae28c1be1e54f2eda1c8b4469c4652e8a24056ed Mon Sep 17 00:00:00 2001
From: Marco Ballesio <balejs@google.com>
Date: Mon, 15 Mar 2021 18:16:30 -0700
Subject: binder: BINDER_GET_FROZEN_INFO ioctl

User space needs to know if binder transactions occurred to frozen
processes. Introduce a new BINDER_GET_FROZEN ioctl and keep track of
transactions occurring to frozen proceses.

Signed-off-by: Marco Ballesio <balejs@google.com>
Signed-off-by: Li Li <dualli@google.com>
Acked-by: Todd Kjos <tkjos@google.com>
Link: https://lore.kernel.org/r/20210316011630.1121213-4-dualli@chromium.org
Signed-off-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
---
 drivers/android/binder.c            | 55 +++++++++++++++++++++++++++++++++++++
 drivers/android/binder_internal.h   |  6 ++++
 include/uapi/linux/android/binder.h |  7 +++++
 3 files changed, 68 insertions(+)

(limited to 'include/uapi/linux')

diff --git a/drivers/android/binder.c b/drivers/android/binder.c
index fe16c455a76e..e1a484ab0366 100644
--- a/drivers/android/binder.c
+++ b/drivers/android/binder.c
@@ -2360,6 +2360,10 @@ static int binder_proc_transaction(struct binder_transaction *t,
 	}
 
 	binder_inner_proc_lock(proc);
+	if (proc->is_frozen) {
+		proc->sync_recv |= !oneway;
+		proc->async_recv |= oneway;
+	}
 
 	if ((proc->is_frozen && !oneway) || proc->is_dead ||
 			(thread && thread->is_dead)) {
@@ -4634,6 +4638,8 @@ static int binder_ioctl_freeze(struct binder_freeze_info *info,
 
 	if (!info->enable) {
 		binder_inner_proc_lock(target_proc);
+		target_proc->sync_recv = false;
+		target_proc->async_recv = false;
 		target_proc->is_frozen = false;
 		binder_inner_proc_unlock(target_proc);
 		return 0;
@@ -4645,6 +4651,8 @@ static int binder_ioctl_freeze(struct binder_freeze_info *info,
 	 * for transactions to drain.
 	 */
 	binder_inner_proc_lock(target_proc);
+	target_proc->sync_recv = false;
+	target_proc->async_recv = false;
 	target_proc->is_frozen = true;
 	binder_inner_proc_unlock(target_proc);
 
@@ -4666,6 +4674,33 @@ static int binder_ioctl_freeze(struct binder_freeze_info *info,
 	return ret;
 }
 
+static int binder_ioctl_get_freezer_info(
+				struct binder_frozen_status_info *info)
+{
+	struct binder_proc *target_proc;
+	bool found = false;
+
+	info->sync_recv = 0;
+	info->async_recv = 0;
+
+	mutex_lock(&binder_procs_lock);
+	hlist_for_each_entry(target_proc, &binder_procs, proc_node) {
+		if (target_proc->pid == info->pid) {
+			found = true;
+			binder_inner_proc_lock(target_proc);
+			info->sync_recv |= target_proc->sync_recv;
+			info->async_recv |= target_proc->async_recv;
+			binder_inner_proc_unlock(target_proc);
+		}
+	}
+	mutex_unlock(&binder_procs_lock);
+
+	if (!found)
+		return -EINVAL;
+
+	return 0;
+}
+
 static long binder_ioctl(struct file *filp, unsigned int cmd, unsigned long arg)
 {
 	int ret;
@@ -4844,6 +4879,24 @@ static long binder_ioctl(struct file *filp, unsigned int cmd, unsigned long arg)
 			goto err;
 		break;
 	}
+	case BINDER_GET_FROZEN_INFO: {
+		struct binder_frozen_status_info info;
+
+		if (copy_from_user(&info, ubuf, sizeof(info))) {
+			ret = -EFAULT;
+			goto err;
+		}
+
+		ret = binder_ioctl_get_freezer_info(&info);
+		if (ret < 0)
+			goto err;
+
+		if (copy_to_user(ubuf, &info, sizeof(info))) {
+			ret = -EFAULT;
+			goto err;
+		}
+		break;
+	}
 	default:
 		ret = -EINVAL;
 		goto err;
@@ -5154,6 +5207,8 @@ static void binder_deferred_release(struct binder_proc *proc)
 
 	proc->is_dead = true;
 	proc->is_frozen = false;
+	proc->sync_recv = false;
+	proc->async_recv = false;
 	threads = 0;
 	active_transactions = 0;
 	while ((n = rb_first(&proc->threads))) {
diff --git a/drivers/android/binder_internal.h b/drivers/android/binder_internal.h
index e6a53e98c6da..2872a7de68e1 100644
--- a/drivers/android/binder_internal.h
+++ b/drivers/android/binder_internal.h
@@ -376,6 +376,10 @@ struct binder_ref {
  * @is_frozen:            process is frozen and unable to service
  *                        binder transactions
  *                        (protected by @inner_lock)
+ * @sync_recv:            process received sync transactions since last frozen
+ *                        (protected by @inner_lock)
+ * @async_recv:           process received async transactions since last frozen
+ *                        (protected by @inner_lock)
  * @freeze_wait:          waitqueue of processes waiting for all outstanding
  *                        transactions to be processed
  *                        (protected by @inner_lock)
@@ -422,6 +426,8 @@ struct binder_proc {
 	int outstanding_txns;
 	bool is_dead;
 	bool is_frozen;
+	bool sync_recv;
+	bool async_recv;
 	wait_queue_head_t freeze_wait;
 
 	struct list_head todo;
diff --git a/include/uapi/linux/android/binder.h b/include/uapi/linux/android/binder.h
index 7eb5b818b3c1..156070d18c4f 100644
--- a/include/uapi/linux/android/binder.h
+++ b/include/uapi/linux/android/binder.h
@@ -223,6 +223,12 @@ struct binder_freeze_info {
 	__u32            timeout_ms;
 };
 
+struct binder_frozen_status_info {
+	__u32            pid;
+	__u32            sync_recv;
+	__u32            async_recv;
+};
+
 #define BINDER_WRITE_READ		_IOWR('b', 1, struct binder_write_read)
 #define BINDER_SET_IDLE_TIMEOUT		_IOW('b', 3, __s64)
 #define BINDER_SET_MAX_THREADS		_IOW('b', 5, __u32)
@@ -234,6 +240,7 @@ struct binder_freeze_info {
 #define BINDER_GET_NODE_INFO_FOR_REF	_IOWR('b', 12, struct binder_node_info_for_ref)
 #define BINDER_SET_CONTEXT_MGR_EXT	_IOW('b', 13, struct flat_binder_object)
 #define BINDER_FREEZE			_IOW('b', 14, struct binder_freeze_info)
+#define BINDER_GET_FROZEN_INFO		_IOWR('b', 15, struct binder_frozen_status_info)
 
 /*
  * NOTE: Two special error codes you should check for when calling
-- 
cgit v1.2.3-71-gd317


From 3c85a8b81cc89c712c4f44cb1a4d5547e472fb1f Mon Sep 17 00:00:00 2001
From: Cooper Lees <me@cooperlees.com>
Date: Tue, 23 Mar 2021 20:47:38 -0700
Subject: Add Open Routing Protocol ID to `rtnetlink.h`

- The Open Routing (Open/R) network protocol netlink handler uses ID 99
- Will also add to `/etc/iproute2/rt_protos` once this is accepted
- For more information: https://github.com/facebook/openr
Signed-off-by: From: Cooper Lees <me@cooperlees.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 include/uapi/linux/rtnetlink.h | 1 +
 1 file changed, 1 insertion(+)

(limited to 'include/uapi/linux')

diff --git a/include/uapi/linux/rtnetlink.h b/include/uapi/linux/rtnetlink.h
index d35953bc7d53..5888492a5257 100644
--- a/include/uapi/linux/rtnetlink.h
+++ b/include/uapi/linux/rtnetlink.h
@@ -290,6 +290,7 @@ enum {
 #define RTPROT_MROUTED		17	/* Multicast daemon */
 #define RTPROT_KEEPALIVED	18	/* Keepalived daemon */
 #define RTPROT_BABEL		42	/* Babel daemon */
+#define RTPROT_OPENR		99	/* Open Routing (Open/R) Routes */
 #define RTPROT_BGP		186	/* BGP Routes */
 #define RTPROT_ISIS		187	/* ISIS Routes */
 #define RTPROT_OSPF		188	/* OSPF Routes */
-- 
cgit v1.2.3-71-gd317


From e43accba9b071dcd106b5e7643b1b106a158cbb1 Mon Sep 17 00:00:00 2001
From: Ido Schimmel <idosch@nvidia.com>
Date: Wed, 24 Mar 2021 21:43:32 +0200
Subject: psample: Fix user API breakage

Cited commit added a new attribute before the existing group reference
count attribute, thereby changing its value and breaking existing
applications on new kernels.

Before:

 # psample -l
 libpsample ERROR psample_group_foreach: failed to recv message: Operation not supported

After:

 # psample -l
 Group Num       Refcount        Group Seq
 1               1               0

Fix by restoring the value of the old attribute and remove the
misleading comments from the enumerator to avoid future bugs.

Cc: stable@vger.kernel.org
Fixes: d8bed686ab96 ("net: psample: Add tunnel support")
Signed-off-by: Ido Schimmel <idosch@nvidia.com>
Reported-by: Adiel Bidani <adielb@nvidia.com>
Reviewed-by: Jiri Pirko <jiri@nvidia.com>
Reviewed-by: Petr Machata <petrm@nvidia.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 include/uapi/linux/psample.h | 5 +----
 1 file changed, 1 insertion(+), 4 deletions(-)

(limited to 'include/uapi/linux')

diff --git a/include/uapi/linux/psample.h b/include/uapi/linux/psample.h
index aea26ab1431c..bff5032c98df 100644
--- a/include/uapi/linux/psample.h
+++ b/include/uapi/linux/psample.h
@@ -3,7 +3,6 @@
 #define __UAPI_PSAMPLE_H
 
 enum {
-	/* sampled packet metadata */
 	PSAMPLE_ATTR_IIFINDEX,
 	PSAMPLE_ATTR_OIFINDEX,
 	PSAMPLE_ATTR_ORIGSIZE,
@@ -11,10 +10,8 @@ enum {
 	PSAMPLE_ATTR_GROUP_SEQ,
 	PSAMPLE_ATTR_SAMPLE_RATE,
 	PSAMPLE_ATTR_DATA,
-	PSAMPLE_ATTR_TUNNEL,
-
-	/* commands attributes */
 	PSAMPLE_ATTR_GROUP_REFCOUNT,
+	PSAMPLE_ATTR_TUNNEL,
 
 	__PSAMPLE_ATTR_MAX
 };
-- 
cgit v1.2.3-71-gd317


From ed3038158e7b58dcf966cb7ec4ae98d152a5b794 Mon Sep 17 00:00:00 2001
From: Jakub Kicinski <kuba@kernel.org>
Date: Wed, 24 Mar 2021 18:11:55 -0700
Subject: ethtool: fec: fix typo in kdoc

s/porte/the port/

Signed-off-by: Jakub Kicinski <kuba@kernel.org>
Reviewed-by: Andrew Lunn <andrew@lunn.ch>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 include/uapi/linux/ethtool.h | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

(limited to 'include/uapi/linux')

diff --git a/include/uapi/linux/ethtool.h b/include/uapi/linux/ethtool.h
index cde753bb2093..1433d6278018 100644
--- a/include/uapi/linux/ethtool.h
+++ b/include/uapi/linux/ethtool.h
@@ -1378,7 +1378,7 @@ struct ethtool_per_queue_op {
 /**
  * struct ethtool_fecparam - Ethernet forward error correction(fec) parameters
  * @cmd: Command number = %ETHTOOL_GFECPARAM or %ETHTOOL_SFECPARAM
- * @active_fec: FEC mode which is active on porte
+ * @active_fec: FEC mode which is active on the port
  * @fec: Bitmask of supported/configured FEC modes
  * @rsvd: Reserved for future extensions. i.e FEC bypass feature.
  *
-- 
cgit v1.2.3-71-gd317


From 408386817a9d32c88c9ac528749e9999d0e3f6a1 Mon Sep 17 00:00:00 2001
From: Jakub Kicinski <kuba@kernel.org>
Date: Wed, 24 Mar 2021 18:11:56 -0700
Subject: ethtool: fec: remove long structure description

Digging through the mailing list archive @autoneg was part
of the first version of the RFC, this left over comment was
pointed out twice in review but wasn't removed.

The sentence is an exact copy-paste from pauseparam.

Signed-off-by: Jakub Kicinski <kuba@kernel.org>
Reviewed-by: Andrew Lunn <andrew@lunn.ch>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 include/uapi/linux/ethtool.h | 4 ----
 1 file changed, 4 deletions(-)

(limited to 'include/uapi/linux')

diff --git a/include/uapi/linux/ethtool.h b/include/uapi/linux/ethtool.h
index 1433d6278018..36bf435d232c 100644
--- a/include/uapi/linux/ethtool.h
+++ b/include/uapi/linux/ethtool.h
@@ -1381,10 +1381,6 @@ struct ethtool_per_queue_op {
  * @active_fec: FEC mode which is active on the port
  * @fec: Bitmask of supported/configured FEC modes
  * @rsvd: Reserved for future extensions. i.e FEC bypass feature.
- *
- * Drivers should reject a non-zero setting of @autoneg when
- * autoneogotiation is disabled (or not supported) for the link.
- *
  */
 struct ethtool_fecparam {
 	__u32   cmd;
-- 
cgit v1.2.3-71-gd317


From 240e114411e74d2ee8121643e0c67717eb7c6982 Mon Sep 17 00:00:00 2001
From: Jakub Kicinski <kuba@kernel.org>
Date: Wed, 24 Mar 2021 18:11:57 -0700
Subject: ethtool: fec: sanitize ethtool_fecparam->reserved

struct ethtool_fecparam::reserved is never looked at by the core.
Make sure it's actually 0. Unfortunately we can't return an error
because old ethtool doesn't zero-initialize the structure for SET.
On GET we can be more verbose, there are no in tree (ab)users.

Fix up the kdoc on the structure. Remove the mention of FEC
bypass. Seems like a niche thing to configure in the first
place.

Signed-off-by: Jakub Kicinski <kuba@kernel.org>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 include/uapi/linux/ethtool.h | 2 +-
 net/ethtool/ioctl.c          | 5 +++++
 2 files changed, 6 insertions(+), 1 deletion(-)

(limited to 'include/uapi/linux')

diff --git a/include/uapi/linux/ethtool.h b/include/uapi/linux/ethtool.h
index 36bf435d232c..9e2682a67460 100644
--- a/include/uapi/linux/ethtool.h
+++ b/include/uapi/linux/ethtool.h
@@ -1380,7 +1380,7 @@ struct ethtool_per_queue_op {
  * @cmd: Command number = %ETHTOOL_GFECPARAM or %ETHTOOL_SFECPARAM
  * @active_fec: FEC mode which is active on the port
  * @fec: Bitmask of supported/configured FEC modes
- * @rsvd: Reserved for future extensions. i.e FEC bypass feature.
+ * @reserved: Reserved for future extensions, ignore on GET, write 0 for SET.
  */
 struct ethtool_fecparam {
 	__u32   cmd;
diff --git a/net/ethtool/ioctl.c b/net/ethtool/ioctl.c
index 0788cc3b3114..be3549023d89 100644
--- a/net/ethtool/ioctl.c
+++ b/net/ethtool/ioctl.c
@@ -2568,6 +2568,9 @@ static int ethtool_get_fecparam(struct net_device *dev, void __user *useraddr)
 	if (rc)
 		return rc;
 
+	if (WARN_ON_ONCE(fecparam.reserved))
+		fecparam.reserved = 0;
+
 	if (copy_to_user(useraddr, &fecparam, sizeof(fecparam)))
 		return -EFAULT;
 	return 0;
@@ -2583,6 +2586,8 @@ static int ethtool_set_fecparam(struct net_device *dev, void __user *useraddr)
 	if (copy_from_user(&fecparam, useraddr, sizeof(fecparam)))
 		return -EFAULT;
 
+	fecparam.reserved = 0;
+
 	return dev->ethtool_ops->set_fecparam(dev, &fecparam);
 }
 
-- 
cgit v1.2.3-71-gd317


From d3b37fc805d9ef697451730ebdfc7e35e6c2ace8 Mon Sep 17 00:00:00 2001
From: Jakub Kicinski <kuba@kernel.org>
Date: Wed, 24 Mar 2021 18:11:58 -0700
Subject: ethtool: fec: sanitize ethtool_fecparam->active_fec

struct ethtool_fecparam::active_fec is a GET-only field,
all in-tree drivers correctly ignore it on SET. Clear
the field on SET to avoid any confusion. Again, we can't
reject non-zero now since ethtool user space does not
zero-init the param correctly.

Signed-off-by: Jakub Kicinski <kuba@kernel.org>
Reviewed-by: Andrew Lunn <andrew@lunn.ch>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 include/uapi/linux/ethtool.h | 2 +-
 net/ethtool/ioctl.c          | 1 +
 2 files changed, 2 insertions(+), 1 deletion(-)

(limited to 'include/uapi/linux')

diff --git a/include/uapi/linux/ethtool.h b/include/uapi/linux/ethtool.h
index 9e2682a67460..517b68c5fcec 100644
--- a/include/uapi/linux/ethtool.h
+++ b/include/uapi/linux/ethtool.h
@@ -1378,7 +1378,7 @@ struct ethtool_per_queue_op {
 /**
  * struct ethtool_fecparam - Ethernet forward error correction(fec) parameters
  * @cmd: Command number = %ETHTOOL_GFECPARAM or %ETHTOOL_SFECPARAM
- * @active_fec: FEC mode which is active on the port
+ * @active_fec: FEC mode which is active on the port, GET only.
  * @fec: Bitmask of supported/configured FEC modes
  * @reserved: Reserved for future extensions, ignore on GET, write 0 for SET.
  */
diff --git a/net/ethtool/ioctl.c b/net/ethtool/ioctl.c
index be3549023d89..237ffe5440ef 100644
--- a/net/ethtool/ioctl.c
+++ b/net/ethtool/ioctl.c
@@ -2586,6 +2586,7 @@ static int ethtool_set_fecparam(struct net_device *dev, void __user *useraddr)
 	if (copy_from_user(&fecparam, useraddr, sizeof(fecparam)))
 		return -EFAULT;
 
+	fecparam.active_fec = 0;
 	fecparam.reserved = 0;
 
 	return dev->ethtool_ops->set_fecparam(dev, &fecparam);
-- 
cgit v1.2.3-71-gd317


From 6dbf94b264e62641d521975d0eddbeef36bacf3c Mon Sep 17 00:00:00 2001
From: Jakub Kicinski <kuba@kernel.org>
Date: Wed, 24 Mar 2021 18:12:00 -0700
Subject: ethtool: clarify the ethtool FEC interface

The definition of the FEC driver interface is quite unclear.
Improve the documentation.

This is based on current driver and user space code, as well
as the discussions about the interface:

RFC v1 (24 Oct 2016): https://lore.kernel.org/netdev/1477363849-36517-1-git-send-email-vidya@cumulusnetworks.com/
 - this version has the autoneg field
 - no active_fec field
 - none vs off confusion is already present

RFC v2 (10 Feb 2017): https://lore.kernel.org/netdev/1486727004-11316-1-git-send-email-vidya@cumulusnetworks.com/
 - autoneg removed
 - active_fec added

v1 (10 Feb 2017): https://lore.kernel.org/netdev/1486751311-42019-1-git-send-email-vidya@cumulusnetworks.com/
 - no changes in the code

v1 (24 Jun 2017):  https://lore.kernel.org/netdev/1498331985-8525-1-git-send-email-roopa@cumulusnetworks.com/
 - include in tree user

v2 (27 Jul 2017): https://lore.kernel.org/netdev/1501199248-24695-1-git-send-email-roopa@cumulusnetworks.com/

Signed-off-by: Jakub Kicinski <kuba@kernel.org>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 include/uapi/linux/ethtool.h | 37 ++++++++++++++++++++++++++++++-------
 1 file changed, 30 insertions(+), 7 deletions(-)

(limited to 'include/uapi/linux')

diff --git a/include/uapi/linux/ethtool.h b/include/uapi/linux/ethtool.h
index 517b68c5fcec..f6ef7d42c7a1 100644
--- a/include/uapi/linux/ethtool.h
+++ b/include/uapi/linux/ethtool.h
@@ -1376,11 +1376,29 @@ struct ethtool_per_queue_op {
 };
 
 /**
- * struct ethtool_fecparam - Ethernet forward error correction(fec) parameters
+ * struct ethtool_fecparam - Ethernet Forward Error Correction parameters
  * @cmd: Command number = %ETHTOOL_GFECPARAM or %ETHTOOL_SFECPARAM
- * @active_fec: FEC mode which is active on the port, GET only.
- * @fec: Bitmask of supported/configured FEC modes
+ * @active_fec: FEC mode which is active on the port, single bit set, GET only.
+ * @fec: Bitmask of configured FEC modes.
  * @reserved: Reserved for future extensions, ignore on GET, write 0 for SET.
+ *
+ * FEC modes supported by the device can be read via %ETHTOOL_GLINKSETTINGS.
+ * FEC settings are configured by link autonegotiation whenever it's enabled.
+ * With autoneg on %ETHTOOL_GFECPARAM can be used to read the current mode.
+ *
+ * When autoneg is disabled %ETHTOOL_SFECPARAM controls the FEC settings.
+ * It is recommended that drivers only accept a single bit set in @fec.
+ * When multiple bits are set in @fec drivers may pick mode in an implementation
+ * dependent way. Drivers should reject mixing %ETHTOOL_FEC_AUTO_BIT with other
+ * FEC modes, because it's unclear whether in this case other modes constrain
+ * AUTO or are independent choices.
+ * Drivers must reject SET requests if they support none of the requested modes.
+ *
+ * If device does not support FEC drivers may use %ETHTOOL_FEC_NONE instead
+ * of returning %EOPNOTSUPP from %ETHTOOL_GFECPARAM.
+ *
+ * See enum ethtool_fec_config_bits for definition of valid bits for both
+ * @fec and @active_fec.
  */
 struct ethtool_fecparam {
 	__u32   cmd;
@@ -1392,11 +1410,16 @@ struct ethtool_fecparam {
 
 /**
  * enum ethtool_fec_config_bits - flags definition of ethtool_fec_configuration
- * @ETHTOOL_FEC_NONE: FEC mode configuration is not supported
- * @ETHTOOL_FEC_AUTO: Default/Best FEC mode provided by driver
+ * @ETHTOOL_FEC_NONE: FEC mode configuration is not supported. Should not
+ *		      be used together with other bits. GET only.
+ * @ETHTOOL_FEC_AUTO: Select default/best FEC mode automatically, usually based
+ *		      link mode and SFP parameters read from module's EEPROM.
+ *		      This bit does _not_ mean autonegotiation.
  * @ETHTOOL_FEC_OFF: No FEC Mode
- * @ETHTOOL_FEC_RS: Reed-Solomon Forward Error Detection mode
- * @ETHTOOL_FEC_BASER: Base-R/Reed-Solomon Forward Error Detection mode
+ * @ETHTOOL_FEC_RS: Reed-Solomon FEC Mode
+ * @ETHTOOL_FEC_BASER: Base-R/Reed-Solomon FEC Mode
+ * @ETHTOOL_FEC_LLRS: Low Latency Reed Solomon FEC Mode (25G/50G Ethernet
+ *		      Consortium)
  */
 enum ethtool_fec_config_bits {
 	ETHTOOL_FEC_NONE_BIT,
-- 
cgit v1.2.3-71-gd317


From 8b638081bd4520f63db1defc660666ec5f65bc15 Mon Sep 17 00:00:00 2001
From: Mikulas Patocka <mpatocka@redhat.com>
Date: Fri, 12 Mar 2021 09:07:30 -0500
Subject: dm ioctl: return UUID in DM_LIST_DEVICES_CMD result

When LVM needs to find a device with a particular UUID it needs to ask for
UUID for each device. This patch returns UUID directly in the list of
devices, so that LVM doesn't have to query all the devices with an ioctl.
The UUID is returned if the flag DM_UUID_FLAG is set in the parameters.

Returning UUID is done in backward-compatible way. There's one unused
32-bit word value after the event number. This patch sets the bit
DM_NAME_LIST_FLAG_HAS_UUID if UUID is present and
DM_NAME_LIST_FLAG_DOESNT_HAVE_UUID if it isn't (if none of these bits is
set, then we have an old kernel that doesn't support returning UUIDs). The
UUID is stored after this word. The 'next' value is updated to point after
the UUID, so that old version of libdevmapper will skip the UUID without
attempting to interpret it.

Signed-off-by: Mikulas Patocka <mpatocka@redhat.com>
Signed-off-by: Mike Snitzer <snitzer@redhat.com>
---
 drivers/md/dm-ioctl.c         | 20 +++++++++++++++++---
 include/uapi/linux/dm-ioctl.h | 18 ++++++++++++++++--
 2 files changed, 33 insertions(+), 5 deletions(-)

(limited to 'include/uapi/linux')

diff --git a/drivers/md/dm-ioctl.c b/drivers/md/dm-ioctl.c
index 272de8772d52..0812ac6e9d70 100644
--- a/drivers/md/dm-ioctl.c
+++ b/drivers/md/dm-ioctl.c
@@ -558,7 +558,9 @@ static int list_devices(struct file *filp, struct dm_ioctl *param, size_t param_
 	for (n = rb_first(&name_rb_tree); n; n = rb_next(n)) {
 		hc = container_of(n, struct hash_cell, name_node);
 		needed += align_val(offsetof(struct dm_name_list, name) + strlen(hc->name) + 1);
-		needed += align_val(sizeof(uint32_t));
+		needed += align_val(sizeof(uint32_t) * 2);
+		if (param->flags & DM_UUID_FLAG && hc->uuid)
+			needed += align_val(strlen(hc->uuid) + 1);
 	}
 
 	/*
@@ -577,6 +579,7 @@ static int list_devices(struct file *filp, struct dm_ioctl *param, size_t param_
 	 * Now loop through filling out the names.
 	 */
 	for (n = rb_first(&name_rb_tree); n; n = rb_next(n)) {
+		void *uuid_ptr;
 		hc = container_of(n, struct hash_cell, name_node);
 		if (old_nl)
 			old_nl->next = (uint32_t) ((void *) nl -
@@ -588,8 +591,19 @@ static int list_devices(struct file *filp, struct dm_ioctl *param, size_t param_
 
 		old_nl = nl;
 		event_nr = align_ptr(nl->name + strlen(hc->name) + 1);
-		*event_nr = dm_get_event_nr(hc->md);
-		nl = align_ptr(event_nr + 1);
+		event_nr[0] = dm_get_event_nr(hc->md);
+		event_nr[1] = 0;
+		uuid_ptr = align_ptr(event_nr + 2);
+		if (param->flags & DM_UUID_FLAG) {
+			if (hc->uuid) {
+				event_nr[1] |= DM_NAME_LIST_FLAG_HAS_UUID;
+				strcpy(uuid_ptr, hc->uuid);
+				uuid_ptr = align_ptr(uuid_ptr + strlen(hc->uuid) + 1);
+			} else {
+				event_nr[1] |= DM_NAME_LIST_FLAG_DOESNT_HAVE_UUID;
+			}
+		}
+		nl = uuid_ptr;
 	}
 	/*
 	 * If mismatch happens, security may be compromised due to buffer
diff --git a/include/uapi/linux/dm-ioctl.h b/include/uapi/linux/dm-ioctl.h
index fcff6669137b..e5c6e458bdf7 100644
--- a/include/uapi/linux/dm-ioctl.h
+++ b/include/uapi/linux/dm-ioctl.h
@@ -193,8 +193,22 @@ struct dm_name_list {
 	__u32 next;		/* offset to the next record from
 				   the _start_ of this */
 	char name[0];
+
+	/*
+	 * The following members can be accessed by taking a pointer that
+	 * points immediately after the terminating zero character in "name"
+	 * and aligning this pointer to next 8-byte boundary.
+	 * Uuid is present if the flag DM_NAME_LIST_FLAG_HAS_UUID is set.
+	 *
+	 * __u32 event_nr;
+	 * __u32 flags;
+	 * char uuid[0];
+	 */
 };
 
+#define DM_NAME_LIST_FLAG_HAS_UUID		1
+#define DM_NAME_LIST_FLAG_DOESNT_HAVE_UUID	2
+
 /*
  * Used to retrieve the target versions
  */
@@ -272,9 +286,9 @@ enum {
 #define DM_DEV_SET_GEOMETRY	_IOWR(DM_IOCTL, DM_DEV_SET_GEOMETRY_CMD, struct dm_ioctl)
 
 #define DM_VERSION_MAJOR	4
-#define DM_VERSION_MINOR	44
+#define DM_VERSION_MINOR	45
 #define DM_VERSION_PATCHLEVEL	0
-#define DM_VERSION_EXTRA	"-ioctl (2021-02-01)"
+#define DM_VERSION_EXTRA	"-ioctl (2021-03-22)"
 
 /* Status bits */
 #define DM_READONLY_FLAG	(1 << 0) /* In/Out */
-- 
cgit v1.2.3-71-gd317


From ad1cd7856d870e5861ef80fbf3e4b0d68bb82a69 Mon Sep 17 00:00:00 2001
From: Jakub Kicinski <kuba@kernel.org>
Date: Fri, 26 Mar 2021 13:22:21 -0700
Subject: ethtool: fec: add note about reuse of reserved

struct ethtool_fecparam::reserved can't be used in SET, because
ethtool user space doesn't zero-initialize the structure.
Make this clear.

Suggested-by: Andrew Lunn <andrew@lunn.ch>
Signed-off-by: Jakub Kicinski <kuba@kernel.org>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 include/uapi/linux/ethtool.h | 4 ++++
 1 file changed, 4 insertions(+)

(limited to 'include/uapi/linux')

diff --git a/include/uapi/linux/ethtool.h b/include/uapi/linux/ethtool.h
index f6ef7d42c7a1..9a47c3efd8ca 100644
--- a/include/uapi/linux/ethtool.h
+++ b/include/uapi/linux/ethtool.h
@@ -1382,6 +1382,10 @@ struct ethtool_per_queue_op {
  * @fec: Bitmask of configured FEC modes.
  * @reserved: Reserved for future extensions, ignore on GET, write 0 for SET.
  *
+ * Note that @reserved was never validated on input and ethtool user space
+ * left it uninitialized when calling SET. Hence going forward it can only be
+ * used to return a value to userspace with GET.
+ *
  * FEC modes supported by the device can be read via %ETHTOOL_GLINKSETTINGS.
  * FEC settings are configured by link autonegotiation whenever it's enabled.
  * With autoneg on %ETHTOOL_GFECPARAM can be used to read the current mode.
-- 
cgit v1.2.3-71-gd317


From d04feecaf1543e538e856166e494daebe808d1fe Mon Sep 17 00:00:00 2001
From: Jakub Kicinski <kuba@kernel.org>
Date: Fri, 26 Mar 2021 13:22:23 -0700
Subject: ethtool: document the enum values not defines

kdoc does not have good support for documenting defines,
and we can't abuse the enum documentation because it
generates warnings.

Signed-off-by: Jakub Kicinski <kuba@kernel.org>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 include/uapi/linux/ethtool.h | 20 ++++++++++----------
 1 file changed, 10 insertions(+), 10 deletions(-)

(limited to 'include/uapi/linux')

diff --git a/include/uapi/linux/ethtool.h b/include/uapi/linux/ethtool.h
index 9a47c3efd8ca..868b513d4f54 100644
--- a/include/uapi/linux/ethtool.h
+++ b/include/uapi/linux/ethtool.h
@@ -1414,16 +1414,16 @@ struct ethtool_fecparam {
 
 /**
  * enum ethtool_fec_config_bits - flags definition of ethtool_fec_configuration
- * @ETHTOOL_FEC_NONE: FEC mode configuration is not supported. Should not
- *		      be used together with other bits. GET only.
- * @ETHTOOL_FEC_AUTO: Select default/best FEC mode automatically, usually based
- *		      link mode and SFP parameters read from module's EEPROM.
- *		      This bit does _not_ mean autonegotiation.
- * @ETHTOOL_FEC_OFF: No FEC Mode
- * @ETHTOOL_FEC_RS: Reed-Solomon FEC Mode
- * @ETHTOOL_FEC_BASER: Base-R/Reed-Solomon FEC Mode
- * @ETHTOOL_FEC_LLRS: Low Latency Reed Solomon FEC Mode (25G/50G Ethernet
- *		      Consortium)
+ * @ETHTOOL_FEC_NONE_BIT: FEC mode configuration is not supported. Should not
+ *			be used together with other bits. GET only.
+ * @ETHTOOL_FEC_AUTO_BIT: Select default/best FEC mode automatically, usually
+ *			based link mode and SFP parameters read from module's
+ *			EEPROM. This bit does _not_ mean autonegotiation.
+ * @ETHTOOL_FEC_OFF_BIT: No FEC Mode
+ * @ETHTOOL_FEC_RS_BIT: Reed-Solomon FEC Mode
+ * @ETHTOOL_FEC_BASER_BIT: Base-R/Reed-Solomon FEC Mode
+ * @ETHTOOL_FEC_LLRS_BIT: Low Latency Reed Solomon FEC Mode (25G/50G Ethernet
+ *			Consortium)
  */
 enum ethtool_fec_config_bits {
 	ETHTOOL_FEC_NONE_BIT,
-- 
cgit v1.2.3-71-gd317


From e6ac2450d6dee3121cd8bbf2907b78a68a8a353d Mon Sep 17 00:00:00 2001
From: Martin KaFai Lau <kafai@fb.com>
Date: Wed, 24 Mar 2021 18:51:42 -0700
Subject: bpf: Support bpf program calling kernel function

This patch adds support to BPF verifier to allow bpf program calling
kernel function directly.

The use case included in this set is to allow bpf-tcp-cc to directly
call some tcp-cc helper functions (e.g. "tcp_cong_avoid_ai()").  Those
functions have already been used by some kernel tcp-cc implementations.

This set will also allow the bpf-tcp-cc program to directly call the
kernel tcp-cc implementation,  For example, a bpf_dctcp may only want to
implement its own dctcp_cwnd_event() and reuse other dctcp_*() directly
from the kernel tcp_dctcp.c instead of reimplementing (or
copy-and-pasting) them.

The tcp-cc kernel functions mentioned above will be white listed
for the struct_ops bpf-tcp-cc programs to use in a later patch.
The white listed functions are not bounded to a fixed ABI contract.
Those functions have already been used by the existing kernel tcp-cc.
If any of them has changed, both in-tree and out-of-tree kernel tcp-cc
implementations have to be changed.  The same goes for the struct_ops
bpf-tcp-cc programs which have to be adjusted accordingly.

This patch is to make the required changes in the bpf verifier.

First change is in btf.c, it adds a case in "btf_check_func_arg_match()".
When the passed in "btf->kernel_btf == true", it means matching the
verifier regs' states with a kernel function.  This will handle the
PTR_TO_BTF_ID reg.  It also maps PTR_TO_SOCK_COMMON, PTR_TO_SOCKET,
and PTR_TO_TCP_SOCK to its kernel's btf_id.

In the later libbpf patch, the insn calling a kernel function will
look like:

insn->code == (BPF_JMP | BPF_CALL)
insn->src_reg == BPF_PSEUDO_KFUNC_CALL /* <- new in this patch */
insn->imm == func_btf_id /* btf_id of the running kernel */

[ For the future calling function-in-kernel-module support, an array
  of module btf_fds can be passed at the load time and insn->off
  can be used to index into this array. ]

At the early stage of verifier, the verifier will collect all kernel
function calls into "struct bpf_kfunc_desc".  Those
descriptors are stored in "prog->aux->kfunc_tab" and will
be available to the JIT.  Since this "add" operation is similar
to the current "add_subprog()" and looking for the same insn->code,
they are done together in the new "add_subprog_and_kfunc()".

In the "do_check()" stage, the new "check_kfunc_call()" is added
to verify the kernel function call instruction:
1. Ensure the kernel function can be used by a particular BPF_PROG_TYPE.
   A new bpf_verifier_ops "check_kfunc_call" is added to do that.
   The bpf-tcp-cc struct_ops program will implement this function in
   a later patch.
2. Call "btf_check_kfunc_args_match()" to ensure the regs can be
   used as the args of a kernel function.
3. Mark the regs' type, subreg_def, and zext_dst.

At the later do_misc_fixups() stage, the new fixup_kfunc_call()
will replace the insn->imm with the function address (relative
to __bpf_call_base).  If needed, the jit can find the btf_func_model
by calling the new bpf_jit_find_kfunc_model(prog, insn).
With the imm set to the function address, "bpftool prog dump xlated"
will be able to display the kernel function calls the same way as
it displays other bpf helper calls.

gpl_compatible program is required to call kernel function.

This feature currently requires JIT.

The verifier selftests are adjusted because of the changes in
the verbose log in add_subprog_and_kfunc().

Signed-off-by: Martin KaFai Lau <kafai@fb.com>
Signed-off-by: Alexei Starovoitov <ast@kernel.org>
Link: https://lore.kernel.org/bpf/20210325015142.1544736-1-kafai@fb.com
---
 arch/x86/net/bpf_jit_comp.c                      |   5 +
 include/linux/bpf.h                              |  24 ++
 include/linux/btf.h                              |   1 +
 include/linux/filter.h                           |   1 +
 include/uapi/linux/bpf.h                         |   4 +
 kernel/bpf/btf.c                                 |  65 +++-
 kernel/bpf/core.c                                |  18 +-
 kernel/bpf/disasm.c                              |  13 +-
 kernel/bpf/syscall.c                             |   1 +
 kernel/bpf/verifier.c                            | 368 +++++++++++++++++++++--
 tools/include/uapi/linux/bpf.h                   |   4 +
 tools/testing/selftests/bpf/verifier/calls.c     |  12 +-
 tools/testing/selftests/bpf/verifier/dead_code.c |  10 +-
 13 files changed, 480 insertions(+), 46 deletions(-)

(limited to 'include/uapi/linux')

diff --git a/arch/x86/net/bpf_jit_comp.c b/arch/x86/net/bpf_jit_comp.c
index b35fc8023884..9eead60f0301 100644
--- a/arch/x86/net/bpf_jit_comp.c
+++ b/arch/x86/net/bpf_jit_comp.c
@@ -2346,3 +2346,8 @@ out:
 					   tmp : orig_prog);
 	return prog;
 }
+
+bool bpf_jit_supports_kfunc_call(void)
+{
+	return true;
+}
diff --git a/include/linux/bpf.h b/include/linux/bpf.h
index eaae618a90b5..b5b7967e3ff3 100644
--- a/include/linux/bpf.h
+++ b/include/linux/bpf.h
@@ -427,6 +427,7 @@ enum bpf_reg_type {
 	PTR_TO_PERCPU_BTF_ID,	 /* reg points to a percpu kernel variable */
 	PTR_TO_FUNC,		 /* reg points to a bpf program function */
 	PTR_TO_MAP_KEY,		 /* reg points to a map element key */
+	__BPF_REG_TYPE_MAX,
 };
 
 /* The information passed from prog-specific *_is_valid_access
@@ -480,6 +481,7 @@ struct bpf_verifier_ops {
 				 const struct btf_type *t, int off, int size,
 				 enum bpf_access_type atype,
 				 u32 *next_btf_id);
+	bool (*check_kfunc_call)(u32 kfunc_btf_id);
 };
 
 struct bpf_prog_offload_ops {
@@ -796,6 +798,8 @@ struct btf_mod_pair {
 	struct module *module;
 };
 
+struct bpf_kfunc_desc_tab;
+
 struct bpf_prog_aux {
 	atomic64_t refcnt;
 	u32 used_map_cnt;
@@ -832,6 +836,7 @@ struct bpf_prog_aux {
 	struct bpf_prog **func;
 	void *jit_data; /* JIT specific data. arch dependent */
 	struct bpf_jit_poke_descriptor *poke_tab;
+	struct bpf_kfunc_desc_tab *kfunc_tab;
 	u32 size_poke_tab;
 	struct bpf_ksym ksym;
 	const struct bpf_prog_ops *ops;
@@ -1547,6 +1552,9 @@ int btf_distill_func_proto(struct bpf_verifier_log *log,
 struct bpf_reg_state;
 int btf_check_subprog_arg_match(struct bpf_verifier_env *env, int subprog,
 				struct bpf_reg_state *regs);
+int btf_check_kfunc_arg_match(struct bpf_verifier_env *env,
+			      const struct btf *btf, u32 func_id,
+			      struct bpf_reg_state *regs);
 int btf_prepare_func_args(struct bpf_verifier_env *env, int subprog,
 			  struct bpf_reg_state *reg);
 int btf_check_type_match(struct bpf_verifier_log *log, const struct bpf_prog *prog,
@@ -1557,6 +1565,10 @@ struct bpf_link *bpf_link_by_id(u32 id);
 
 const struct bpf_func_proto *bpf_base_func_proto(enum bpf_func_id func_id);
 void bpf_task_storage_free(struct task_struct *task);
+bool bpf_prog_has_kfunc_call(const struct bpf_prog *prog);
+const struct btf_func_model *
+bpf_jit_find_kfunc_model(const struct bpf_prog *prog,
+			 const struct bpf_insn *insn);
 #else /* !CONFIG_BPF_SYSCALL */
 static inline struct bpf_prog *bpf_prog_get(u32 ufd)
 {
@@ -1737,6 +1749,18 @@ bpf_base_func_proto(enum bpf_func_id func_id)
 static inline void bpf_task_storage_free(struct task_struct *task)
 {
 }
+
+static inline bool bpf_prog_has_kfunc_call(const struct bpf_prog *prog)
+{
+	return false;
+}
+
+static inline const struct btf_func_model *
+bpf_jit_find_kfunc_model(const struct bpf_prog *prog,
+			 const struct bpf_insn *insn)
+{
+	return NULL;
+}
 #endif /* CONFIG_BPF_SYSCALL */
 
 void __bpf_free_used_btfs(struct bpf_prog_aux *aux,
diff --git a/include/linux/btf.h b/include/linux/btf.h
index 8a05687a4ee2..3bac66e0183a 100644
--- a/include/linux/btf.h
+++ b/include/linux/btf.h
@@ -110,6 +110,7 @@ const struct btf_type *btf_type_resolve_func_ptr(const struct btf *btf,
 const struct btf_type *
 btf_resolve_size(const struct btf *btf, const struct btf_type *type,
 		 u32 *type_size);
+const char *btf_type_str(const struct btf_type *t);
 
 #define for_each_member(i, struct_type, member)			\
 	for (i = 0, member = btf_type_member(struct_type);	\
diff --git a/include/linux/filter.h b/include/linux/filter.h
index 0d9c710eb050..eecfd82db648 100644
--- a/include/linux/filter.h
+++ b/include/linux/filter.h
@@ -918,6 +918,7 @@ u64 __bpf_call_base(u64 r1, u64 r2, u64 r3, u64 r4, u64 r5);
 struct bpf_prog *bpf_int_jit_compile(struct bpf_prog *prog);
 void bpf_jit_compile(struct bpf_prog *prog);
 bool bpf_jit_needs_zext(void);
+bool bpf_jit_supports_kfunc_call(void);
 bool bpf_helper_changes_pkt_data(void *func);
 
 static inline bool bpf_dump_raw_ok(const struct cred *cred)
diff --git a/include/uapi/linux/bpf.h b/include/uapi/linux/bpf.h
index 008edc1dc8c1..598716742593 100644
--- a/include/uapi/linux/bpf.h
+++ b/include/uapi/linux/bpf.h
@@ -1117,6 +1117,10 @@ enum bpf_link_type {
  * offset to another bpf function
  */
 #define BPF_PSEUDO_CALL		1
+/* when bpf_call->src_reg == BPF_PSEUDO_KFUNC_CALL,
+ * bpf_call->imm == btf_id of a BTF_KIND_FUNC in the running kernel
+ */
+#define BPF_PSEUDO_KFUNC_CALL	2
 
 /* flags for BPF_MAP_UPDATE_ELEM command */
 enum {
diff --git a/kernel/bpf/btf.c b/kernel/bpf/btf.c
index 3c489adacf3b..ec8afc4bc560 100644
--- a/kernel/bpf/btf.c
+++ b/kernel/bpf/btf.c
@@ -283,7 +283,7 @@ static const char * const btf_kind_str[NR_BTF_KINDS] = {
 	[BTF_KIND_FLOAT]	= "FLOAT",
 };
 
-static const char *btf_type_str(const struct btf_type *t)
+const char *btf_type_str(const struct btf_type *t)
 {
 	return btf_kind_str[BTF_INFO_KIND(t->info)];
 }
@@ -5362,6 +5362,14 @@ int btf_check_type_match(struct bpf_verifier_log *log, const struct bpf_prog *pr
 	return btf_check_func_type_match(log, btf1, t1, btf2, t2);
 }
 
+static u32 *reg2btf_ids[__BPF_REG_TYPE_MAX] = {
+#ifdef CONFIG_NET
+	[PTR_TO_SOCKET] = &btf_sock_ids[BTF_SOCK_TYPE_SOCK],
+	[PTR_TO_SOCK_COMMON] = &btf_sock_ids[BTF_SOCK_TYPE_SOCK_COMMON],
+	[PTR_TO_TCP_SOCK] = &btf_sock_ids[BTF_SOCK_TYPE_TCP],
+#endif
+};
+
 static int btf_check_func_arg_match(struct bpf_verifier_env *env,
 				    const struct btf *btf, u32 func_id,
 				    struct bpf_reg_state *regs,
@@ -5371,12 +5379,12 @@ static int btf_check_func_arg_match(struct bpf_verifier_env *env,
 	const char *func_name, *ref_tname;
 	const struct btf_type *t, *ref_t;
 	const struct btf_param *args;
-	u32 i, nargs;
+	u32 i, nargs, ref_id;
 
 	t = btf_type_by_id(btf, func_id);
 	if (!t || !btf_type_is_func(t)) {
 		/* These checks were already done by the verifier while loading
-		 * struct bpf_func_info
+		 * struct bpf_func_info or in add_kfunc_call().
 		 */
 		bpf_log(log, "BTF of func_id %u doesn't point to KIND_FUNC\n",
 			func_id);
@@ -5418,9 +5426,49 @@ static int btf_check_func_arg_match(struct bpf_verifier_env *env,
 			return -EINVAL;
 		}
 
-		ref_t = btf_type_skip_modifiers(btf, t->type, NULL);
+		ref_t = btf_type_skip_modifiers(btf, t->type, &ref_id);
 		ref_tname = btf_name_by_offset(btf, ref_t->name_off);
-		if (btf_get_prog_ctx_type(log, btf, t, env->prog->type, i)) {
+		if (btf_is_kernel(btf)) {
+			const struct btf_type *reg_ref_t;
+			const struct btf *reg_btf;
+			const char *reg_ref_tname;
+			u32 reg_ref_id;
+
+			if (!btf_type_is_struct(ref_t)) {
+				bpf_log(log, "kernel function %s args#%d pointer type %s %s is not supported\n",
+					func_name, i, btf_type_str(ref_t),
+					ref_tname);
+				return -EINVAL;
+			}
+
+			if (reg->type == PTR_TO_BTF_ID) {
+				reg_btf = reg->btf;
+				reg_ref_id = reg->btf_id;
+			} else if (reg2btf_ids[reg->type]) {
+				reg_btf = btf_vmlinux;
+				reg_ref_id = *reg2btf_ids[reg->type];
+			} else {
+				bpf_log(log, "kernel function %s args#%d expected pointer to %s %s but R%d is not a pointer to btf_id\n",
+					func_name, i,
+					btf_type_str(ref_t), ref_tname, regno);
+				return -EINVAL;
+			}
+
+			reg_ref_t = btf_type_skip_modifiers(reg_btf, reg_ref_id,
+							    &reg_ref_id);
+			reg_ref_tname = btf_name_by_offset(reg_btf,
+							   reg_ref_t->name_off);
+			if (!btf_struct_ids_match(log, reg_btf, reg_ref_id,
+						  reg->off, btf, ref_id)) {
+				bpf_log(log, "kernel function %s args#%d expected pointer to %s %s but R%d has a pointer to %s %s\n",
+					func_name, i,
+					btf_type_str(ref_t), ref_tname,
+					regno, btf_type_str(reg_ref_t),
+					reg_ref_tname);
+				return -EINVAL;
+			}
+		} else if (btf_get_prog_ctx_type(log, btf, t,
+						 env->prog->type, i)) {
 			/* If function expects ctx type in BTF check that caller
 			 * is passing PTR_TO_CTX.
 			 */
@@ -5493,6 +5541,13 @@ int btf_check_subprog_arg_match(struct bpf_verifier_env *env, int subprog,
 	return err;
 }
 
+int btf_check_kfunc_arg_match(struct bpf_verifier_env *env,
+			      const struct btf *btf, u32 func_id,
+			      struct bpf_reg_state *regs)
+{
+	return btf_check_func_arg_match(env, btf, func_id, regs, false);
+}
+
 /* Convert BTF of a function into bpf_reg_state if possible
  * Returns:
  * EFAULT - there is a verifier bug. Abort verification.
diff --git a/kernel/bpf/core.c b/kernel/bpf/core.c
index a35eb3d7b126..f5423251c118 100644
--- a/kernel/bpf/core.c
+++ b/kernel/bpf/core.c
@@ -159,6 +159,9 @@ void bpf_prog_jit_attempt_done(struct bpf_prog *prog)
 		kvfree(prog->aux->jited_linfo);
 		prog->aux->jited_linfo = NULL;
 	}
+
+	kfree(prog->aux->kfunc_tab);
+	prog->aux->kfunc_tab = NULL;
 }
 
 /* The jit engine is responsible to provide an array
@@ -1840,9 +1843,15 @@ struct bpf_prog *bpf_prog_select_runtime(struct bpf_prog *fp, int *err)
 	/* In case of BPF to BPF calls, verifier did all the prep
 	 * work with regards to JITing, etc.
 	 */
+	bool jit_needed = false;
+
 	if (fp->bpf_func)
 		goto finalize;
 
+	if (IS_ENABLED(CONFIG_BPF_JIT_ALWAYS_ON) ||
+	    bpf_prog_has_kfunc_call(fp))
+		jit_needed = true;
+
 	bpf_prog_select_func(fp);
 
 	/* eBPF JITs can rewrite the program in case constant
@@ -1858,12 +1867,10 @@ struct bpf_prog *bpf_prog_select_runtime(struct bpf_prog *fp, int *err)
 
 		fp = bpf_int_jit_compile(fp);
 		bpf_prog_jit_attempt_done(fp);
-#ifdef CONFIG_BPF_JIT_ALWAYS_ON
-		if (!fp->jited) {
+		if (!fp->jited && jit_needed) {
 			*err = -ENOTSUPP;
 			return fp;
 		}
-#endif
 	} else {
 		*err = bpf_prog_offload_compile(fp);
 		if (*err)
@@ -2343,6 +2350,11 @@ bool __weak bpf_jit_needs_zext(void)
 	return false;
 }
 
+bool __weak bpf_jit_supports_kfunc_call(void)
+{
+	return false;
+}
+
 /* To execute LD_ABS/LD_IND instructions __bpf_prog_run() may call
  * skb_copy_bits(), so provide a weak definition of it for NET-less config.
  */
diff --git a/kernel/bpf/disasm.c b/kernel/bpf/disasm.c
index 3acc7e0b6916..dad821c8ecd0 100644
--- a/kernel/bpf/disasm.c
+++ b/kernel/bpf/disasm.c
@@ -19,16 +19,23 @@ static const char *__func_get_name(const struct bpf_insn_cbs *cbs,
 {
 	BUILD_BUG_ON(ARRAY_SIZE(func_id_str) != __BPF_FUNC_MAX_ID);
 
-	if (insn->src_reg != BPF_PSEUDO_CALL &&
+	if (!insn->src_reg &&
 	    insn->imm >= 0 && insn->imm < __BPF_FUNC_MAX_ID &&
 	    func_id_str[insn->imm])
 		return func_id_str[insn->imm];
 
-	if (cbs && cbs->cb_call)
-		return cbs->cb_call(cbs->private_data, insn);
+	if (cbs && cbs->cb_call) {
+		const char *res;
+
+		res = cbs->cb_call(cbs->private_data, insn);
+		if (res)
+			return res;
+	}
 
 	if (insn->src_reg == BPF_PSEUDO_CALL)
 		snprintf(buff, len, "%+d", insn->imm);
+	else if (insn->src_reg == BPF_PSEUDO_KFUNC_CALL)
+		snprintf(buff, len, "kernel-function");
 
 	return buff;
 }
diff --git a/kernel/bpf/syscall.c b/kernel/bpf/syscall.c
index eaf85bf51c5a..9603de81811a 100644
--- a/kernel/bpf/syscall.c
+++ b/kernel/bpf/syscall.c
@@ -1696,6 +1696,7 @@ static void __bpf_prog_put_noref(struct bpf_prog *prog, bool deferred)
 	btf_put(prog->aux->btf);
 	kvfree(prog->aux->jited_linfo);
 	kvfree(prog->aux->linfo);
+	kfree(prog->aux->kfunc_tab);
 	if (prog->aux->attach_btf)
 		btf_put(prog->aux->attach_btf);
 
diff --git a/kernel/bpf/verifier.c b/kernel/bpf/verifier.c
index b31e62daafbd..852541a435ef 100644
--- a/kernel/bpf/verifier.c
+++ b/kernel/bpf/verifier.c
@@ -234,6 +234,12 @@ static bool bpf_pseudo_call(const struct bpf_insn *insn)
 	       insn->src_reg == BPF_PSEUDO_CALL;
 }
 
+static bool bpf_pseudo_kfunc_call(const struct bpf_insn *insn)
+{
+	return insn->code == (BPF_JMP | BPF_CALL) &&
+	       insn->src_reg == BPF_PSEUDO_KFUNC_CALL;
+}
+
 static bool bpf_pseudo_func(const struct bpf_insn *insn)
 {
 	return insn->code == (BPF_LD | BPF_IMM | BPF_DW) &&
@@ -1554,47 +1560,205 @@ static int add_subprog(struct bpf_verifier_env *env, int off)
 		verbose(env, "too many subprograms\n");
 		return -E2BIG;
 	}
+	/* determine subprog starts. The end is one before the next starts */
 	env->subprog_info[env->subprog_cnt++].start = off;
 	sort(env->subprog_info, env->subprog_cnt,
 	     sizeof(env->subprog_info[0]), cmp_subprogs, NULL);
 	return env->subprog_cnt - 1;
 }
 
-static int check_subprogs(struct bpf_verifier_env *env)
+struct bpf_kfunc_desc {
+	struct btf_func_model func_model;
+	u32 func_id;
+	s32 imm;
+};
+
+#define MAX_KFUNC_DESCS 256
+struct bpf_kfunc_desc_tab {
+	struct bpf_kfunc_desc descs[MAX_KFUNC_DESCS];
+	u32 nr_descs;
+};
+
+static int kfunc_desc_cmp_by_id(const void *a, const void *b)
+{
+	const struct bpf_kfunc_desc *d0 = a;
+	const struct bpf_kfunc_desc *d1 = b;
+
+	/* func_id is not greater than BTF_MAX_TYPE */
+	return d0->func_id - d1->func_id;
+}
+
+static const struct bpf_kfunc_desc *
+find_kfunc_desc(const struct bpf_prog *prog, u32 func_id)
+{
+	struct bpf_kfunc_desc desc = {
+		.func_id = func_id,
+	};
+	struct bpf_kfunc_desc_tab *tab;
+
+	tab = prog->aux->kfunc_tab;
+	return bsearch(&desc, tab->descs, tab->nr_descs,
+		       sizeof(tab->descs[0]), kfunc_desc_cmp_by_id);
+}
+
+static int add_kfunc_call(struct bpf_verifier_env *env, u32 func_id)
+{
+	const struct btf_type *func, *func_proto;
+	struct bpf_kfunc_desc_tab *tab;
+	struct bpf_prog_aux *prog_aux;
+	struct bpf_kfunc_desc *desc;
+	const char *func_name;
+	unsigned long addr;
+	int err;
+
+	prog_aux = env->prog->aux;
+	tab = prog_aux->kfunc_tab;
+	if (!tab) {
+		if (!btf_vmlinux) {
+			verbose(env, "calling kernel function is not supported without CONFIG_DEBUG_INFO_BTF\n");
+			return -ENOTSUPP;
+		}
+
+		if (!env->prog->jit_requested) {
+			verbose(env, "JIT is required for calling kernel function\n");
+			return -ENOTSUPP;
+		}
+
+		if (!bpf_jit_supports_kfunc_call()) {
+			verbose(env, "JIT does not support calling kernel function\n");
+			return -ENOTSUPP;
+		}
+
+		if (!env->prog->gpl_compatible) {
+			verbose(env, "cannot call kernel function from non-GPL compatible program\n");
+			return -EINVAL;
+		}
+
+		tab = kzalloc(sizeof(*tab), GFP_KERNEL);
+		if (!tab)
+			return -ENOMEM;
+		prog_aux->kfunc_tab = tab;
+	}
+
+	if (find_kfunc_desc(env->prog, func_id))
+		return 0;
+
+	if (tab->nr_descs == MAX_KFUNC_DESCS) {
+		verbose(env, "too many different kernel function calls\n");
+		return -E2BIG;
+	}
+
+	func = btf_type_by_id(btf_vmlinux, func_id);
+	if (!func || !btf_type_is_func(func)) {
+		verbose(env, "kernel btf_id %u is not a function\n",
+			func_id);
+		return -EINVAL;
+	}
+	func_proto = btf_type_by_id(btf_vmlinux, func->type);
+	if (!func_proto || !btf_type_is_func_proto(func_proto)) {
+		verbose(env, "kernel function btf_id %u does not have a valid func_proto\n",
+			func_id);
+		return -EINVAL;
+	}
+
+	func_name = btf_name_by_offset(btf_vmlinux, func->name_off);
+	addr = kallsyms_lookup_name(func_name);
+	if (!addr) {
+		verbose(env, "cannot find address for kernel function %s\n",
+			func_name);
+		return -EINVAL;
+	}
+
+	desc = &tab->descs[tab->nr_descs++];
+	desc->func_id = func_id;
+	desc->imm = BPF_CAST_CALL(addr) - __bpf_call_base;
+	err = btf_distill_func_proto(&env->log, btf_vmlinux,
+				     func_proto, func_name,
+				     &desc->func_model);
+	if (!err)
+		sort(tab->descs, tab->nr_descs, sizeof(tab->descs[0]),
+		     kfunc_desc_cmp_by_id, NULL);
+	return err;
+}
+
+static int kfunc_desc_cmp_by_imm(const void *a, const void *b)
+{
+	const struct bpf_kfunc_desc *d0 = a;
+	const struct bpf_kfunc_desc *d1 = b;
+
+	if (d0->imm > d1->imm)
+		return 1;
+	else if (d0->imm < d1->imm)
+		return -1;
+	return 0;
+}
+
+static void sort_kfunc_descs_by_imm(struct bpf_prog *prog)
+{
+	struct bpf_kfunc_desc_tab *tab;
+
+	tab = prog->aux->kfunc_tab;
+	if (!tab)
+		return;
+
+	sort(tab->descs, tab->nr_descs, sizeof(tab->descs[0]),
+	     kfunc_desc_cmp_by_imm, NULL);
+}
+
+bool bpf_prog_has_kfunc_call(const struct bpf_prog *prog)
+{
+	return !!prog->aux->kfunc_tab;
+}
+
+const struct btf_func_model *
+bpf_jit_find_kfunc_model(const struct bpf_prog *prog,
+			 const struct bpf_insn *insn)
+{
+	const struct bpf_kfunc_desc desc = {
+		.imm = insn->imm,
+	};
+	const struct bpf_kfunc_desc *res;
+	struct bpf_kfunc_desc_tab *tab;
+
+	tab = prog->aux->kfunc_tab;
+	res = bsearch(&desc, tab->descs, tab->nr_descs,
+		      sizeof(tab->descs[0]), kfunc_desc_cmp_by_imm);
+
+	return res ? &res->func_model : NULL;
+}
+
+static int add_subprog_and_kfunc(struct bpf_verifier_env *env)
 {
-	int i, ret, subprog_start, subprog_end, off, cur_subprog = 0;
 	struct bpf_subprog_info *subprog = env->subprog_info;
 	struct bpf_insn *insn = env->prog->insnsi;
-	int insn_cnt = env->prog->len;
+	int i, ret, insn_cnt = env->prog->len;
 
 	/* Add entry function. */
 	ret = add_subprog(env, 0);
-	if (ret < 0)
+	if (ret)
 		return ret;
 
-	/* determine subprog starts. The end is one before the next starts */
-	for (i = 0; i < insn_cnt; i++) {
-		if (bpf_pseudo_func(insn + i)) {
-			if (!env->bpf_capable) {
-				verbose(env,
-					"function pointers are allowed for CAP_BPF and CAP_SYS_ADMIN\n");
-				return -EPERM;
-			}
-			ret = add_subprog(env, i + insn[i].imm + 1);
-			if (ret < 0)
-				return ret;
-			/* remember subprog */
-			insn[i + 1].imm = ret;
-			continue;
-		}
-		if (!bpf_pseudo_call(insn + i))
+	for (i = 0; i < insn_cnt; i++, insn++) {
+		if (!bpf_pseudo_func(insn) && !bpf_pseudo_call(insn) &&
+		    !bpf_pseudo_kfunc_call(insn))
 			continue;
+
 		if (!env->bpf_capable) {
-			verbose(env,
-				"function calls to other bpf functions are allowed for CAP_BPF and CAP_SYS_ADMIN\n");
+			verbose(env, "loading/calling other bpf or kernel functions are allowed for CAP_BPF and CAP_SYS_ADMIN\n");
 			return -EPERM;
 		}
-		ret = add_subprog(env, i + insn[i].imm + 1);
+
+		if (bpf_pseudo_func(insn)) {
+			ret = add_subprog(env, i + insn->imm + 1);
+			if (ret >= 0)
+				/* remember subprog */
+				insn[1].imm = ret;
+		} else if (bpf_pseudo_call(insn)) {
+			ret = add_subprog(env, i + insn->imm + 1);
+		} else {
+			ret = add_kfunc_call(env, insn->imm);
+		}
+
 		if (ret < 0)
 			return ret;
 	}
@@ -1608,6 +1772,16 @@ static int check_subprogs(struct bpf_verifier_env *env)
 		for (i = 0; i < env->subprog_cnt; i++)
 			verbose(env, "func#%d @%d\n", i, subprog[i].start);
 
+	return 0;
+}
+
+static int check_subprogs(struct bpf_verifier_env *env)
+{
+	int i, subprog_start, subprog_end, off, cur_subprog = 0;
+	struct bpf_subprog_info *subprog = env->subprog_info;
+	struct bpf_insn *insn = env->prog->insnsi;
+	int insn_cnt = env->prog->len;
+
 	/* now check that all jumps are within the same subprog */
 	subprog_start = subprog[cur_subprog].start;
 	subprog_end = subprog[cur_subprog + 1].start;
@@ -1916,6 +2090,17 @@ static int get_prev_insn_idx(struct bpf_verifier_state *st, int i,
 	return i;
 }
 
+static const char *disasm_kfunc_name(void *data, const struct bpf_insn *insn)
+{
+	const struct btf_type *func;
+
+	if (insn->src_reg != BPF_PSEUDO_KFUNC_CALL)
+		return NULL;
+
+	func = btf_type_by_id(btf_vmlinux, insn->imm);
+	return btf_name_by_offset(btf_vmlinux, func->name_off);
+}
+
 /* For given verifier state backtrack_insn() is called from the last insn to
  * the first insn. Its purpose is to compute a bitmask of registers and
  * stack slots that needs precision in the parent verifier state.
@@ -1924,6 +2109,7 @@ static int backtrack_insn(struct bpf_verifier_env *env, int idx,
 			  u32 *reg_mask, u64 *stack_mask)
 {
 	const struct bpf_insn_cbs cbs = {
+		.cb_call	= disasm_kfunc_name,
 		.cb_print	= verbose,
 		.private_data	= env,
 	};
@@ -5960,6 +6146,98 @@ static int check_helper_call(struct bpf_verifier_env *env, struct bpf_insn *insn
 	return 0;
 }
 
+/* mark_btf_func_reg_size() is used when the reg size is determined by
+ * the BTF func_proto's return value size and argument.
+ */
+static void mark_btf_func_reg_size(struct bpf_verifier_env *env, u32 regno,
+				   size_t reg_size)
+{
+	struct bpf_reg_state *reg = &cur_regs(env)[regno];
+
+	if (regno == BPF_REG_0) {
+		/* Function return value */
+		reg->live |= REG_LIVE_WRITTEN;
+		reg->subreg_def = reg_size == sizeof(u64) ?
+			DEF_NOT_SUBREG : env->insn_idx + 1;
+	} else {
+		/* Function argument */
+		if (reg_size == sizeof(u64)) {
+			mark_insn_zext(env, reg);
+			mark_reg_read(env, reg, reg->parent, REG_LIVE_READ64);
+		} else {
+			mark_reg_read(env, reg, reg->parent, REG_LIVE_READ32);
+		}
+	}
+}
+
+static int check_kfunc_call(struct bpf_verifier_env *env, struct bpf_insn *insn)
+{
+	const struct btf_type *t, *func, *func_proto, *ptr_type;
+	struct bpf_reg_state *regs = cur_regs(env);
+	const char *func_name, *ptr_type_name;
+	u32 i, nargs, func_id, ptr_type_id;
+	const struct btf_param *args;
+	int err;
+
+	func_id = insn->imm;
+	func = btf_type_by_id(btf_vmlinux, func_id);
+	func_name = btf_name_by_offset(btf_vmlinux, func->name_off);
+	func_proto = btf_type_by_id(btf_vmlinux, func->type);
+
+	if (!env->ops->check_kfunc_call ||
+	    !env->ops->check_kfunc_call(func_id)) {
+		verbose(env, "calling kernel function %s is not allowed\n",
+			func_name);
+		return -EACCES;
+	}
+
+	/* Check the arguments */
+	err = btf_check_kfunc_arg_match(env, btf_vmlinux, func_id, regs);
+	if (err)
+		return err;
+
+	for (i = 0; i < CALLER_SAVED_REGS; i++)
+		mark_reg_not_init(env, regs, caller_saved[i]);
+
+	/* Check return type */
+	t = btf_type_skip_modifiers(btf_vmlinux, func_proto->type, NULL);
+	if (btf_type_is_scalar(t)) {
+		mark_reg_unknown(env, regs, BPF_REG_0);
+		mark_btf_func_reg_size(env, BPF_REG_0, t->size);
+	} else if (btf_type_is_ptr(t)) {
+		ptr_type = btf_type_skip_modifiers(btf_vmlinux, t->type,
+						   &ptr_type_id);
+		if (!btf_type_is_struct(ptr_type)) {
+			ptr_type_name = btf_name_by_offset(btf_vmlinux,
+							   ptr_type->name_off);
+			verbose(env, "kernel function %s returns pointer type %s %s is not supported\n",
+				func_name, btf_type_str(ptr_type),
+				ptr_type_name);
+			return -EINVAL;
+		}
+		mark_reg_known_zero(env, regs, BPF_REG_0);
+		regs[BPF_REG_0].btf = btf_vmlinux;
+		regs[BPF_REG_0].type = PTR_TO_BTF_ID;
+		regs[BPF_REG_0].btf_id = ptr_type_id;
+		mark_btf_func_reg_size(env, BPF_REG_0, sizeof(void *));
+	} /* else { add_kfunc_call() ensures it is btf_type_is_void(t) } */
+
+	nargs = btf_type_vlen(func_proto);
+	args = (const struct btf_param *)(func_proto + 1);
+	for (i = 0; i < nargs; i++) {
+		u32 regno = i + 1;
+
+		t = btf_type_skip_modifiers(btf_vmlinux, args[i].type, NULL);
+		if (btf_type_is_ptr(t))
+			mark_btf_func_reg_size(env, regno, sizeof(void *));
+		else
+			/* scalar. ensured by btf_check_kfunc_arg_match() */
+			mark_btf_func_reg_size(env, regno, t->size);
+	}
+
+	return 0;
+}
+
 static bool signed_add_overflows(s64 a, s64 b)
 {
 	/* Do the add in u64, where overflow is well-defined */
@@ -10162,6 +10440,7 @@ static int do_check(struct bpf_verifier_env *env)
 
 		if (env->log.level & BPF_LOG_LEVEL) {
 			const struct bpf_insn_cbs cbs = {
+				.cb_call	= disasm_kfunc_name,
 				.cb_print	= verbose,
 				.private_data	= env,
 			};
@@ -10309,7 +10588,8 @@ static int do_check(struct bpf_verifier_env *env)
 				if (BPF_SRC(insn->code) != BPF_K ||
 				    insn->off != 0 ||
 				    (insn->src_reg != BPF_REG_0 &&
-				     insn->src_reg != BPF_PSEUDO_CALL) ||
+				     insn->src_reg != BPF_PSEUDO_CALL &&
+				     insn->src_reg != BPF_PSEUDO_KFUNC_CALL) ||
 				    insn->dst_reg != BPF_REG_0 ||
 				    class == BPF_JMP32) {
 					verbose(env, "BPF_CALL uses reserved fields\n");
@@ -10324,6 +10604,8 @@ static int do_check(struct bpf_verifier_env *env)
 				}
 				if (insn->src_reg == BPF_PSEUDO_CALL)
 					err = check_func_call(env, insn, &env->insn_idx);
+				else if (insn->src_reg == BPF_PSEUDO_KFUNC_CALL)
+					err = check_kfunc_call(env, insn);
 				else
 					err = check_helper_call(env, insn, &env->insn_idx);
 				if (err)
@@ -11634,6 +11916,7 @@ static int jit_subprogs(struct bpf_verifier_env *env)
 		func[i]->aux->name[0] = 'F';
 		func[i]->aux->stack_depth = env->subprog_info[i].stack_depth;
 		func[i]->jit_requested = 1;
+		func[i]->aux->kfunc_tab = prog->aux->kfunc_tab;
 		func[i]->aux->linfo = prog->aux->linfo;
 		func[i]->aux->nr_linfo = prog->aux->nr_linfo;
 		func[i]->aux->jited_linfo = prog->aux->jited_linfo;
@@ -11773,6 +12056,7 @@ static int fixup_call_args(struct bpf_verifier_env *env)
 #ifndef CONFIG_BPF_JIT_ALWAYS_ON
 	struct bpf_prog *prog = env->prog;
 	struct bpf_insn *insn = prog->insnsi;
+	bool has_kfunc_call = bpf_prog_has_kfunc_call(prog);
 	int i, depth;
 #endif
 	int err = 0;
@@ -11786,6 +12070,10 @@ static int fixup_call_args(struct bpf_verifier_env *env)
 			return err;
 	}
 #ifndef CONFIG_BPF_JIT_ALWAYS_ON
+	if (has_kfunc_call) {
+		verbose(env, "calling kernel functions are not allowed in non-JITed programs\n");
+		return -EINVAL;
+	}
 	if (env->subprog_cnt > 1 && env->prog->aux->tail_call_reachable) {
 		/* When JIT fails the progs with bpf2bpf calls and tail_calls
 		 * have to be rejected, since interpreter doesn't support them yet.
@@ -11814,6 +12102,26 @@ static int fixup_call_args(struct bpf_verifier_env *env)
 	return err;
 }
 
+static int fixup_kfunc_call(struct bpf_verifier_env *env,
+			    struct bpf_insn *insn)
+{
+	const struct bpf_kfunc_desc *desc;
+
+	/* insn->imm has the btf func_id. Replace it with
+	 * an address (relative to __bpf_base_call).
+	 */
+	desc = find_kfunc_desc(env->prog, insn->imm);
+	if (!desc) {
+		verbose(env, "verifier internal error: kernel function descriptor not found for func_id %u\n",
+			insn->imm);
+		return -EFAULT;
+	}
+
+	insn->imm = desc->imm;
+
+	return 0;
+}
+
 /* Do various post-verification rewrites in a single program pass.
  * These rewrites simplify JIT and interpreter implementations.
  */
@@ -11949,6 +12257,12 @@ static int do_misc_fixups(struct bpf_verifier_env *env)
 			continue;
 		if (insn->src_reg == BPF_PSEUDO_CALL)
 			continue;
+		if (insn->src_reg == BPF_PSEUDO_KFUNC_CALL) {
+			ret = fixup_kfunc_call(env, insn);
+			if (ret)
+				return ret;
+			continue;
+		}
 
 		if (insn->imm == BPF_FUNC_get_route_realm)
 			prog->dst_needed = 1;
@@ -12178,6 +12492,8 @@ patch_call_imm:
 		}
 	}
 
+	sort_kfunc_descs_by_imm(env->prog);
+
 	return 0;
 }
 
@@ -12883,6 +13199,10 @@ int bpf_check(struct bpf_prog **prog, union bpf_attr *attr,
 	if (!env->explored_states)
 		goto skip_full_check;
 
+	ret = add_subprog_and_kfunc(env);
+	if (ret < 0)
+		goto skip_full_check;
+
 	ret = check_subprogs(env);
 	if (ret < 0)
 		goto skip_full_check;
diff --git a/tools/include/uapi/linux/bpf.h b/tools/include/uapi/linux/bpf.h
index 2d3036e292a9..ab9f2233607c 100644
--- a/tools/include/uapi/linux/bpf.h
+++ b/tools/include/uapi/linux/bpf.h
@@ -1117,6 +1117,10 @@ enum bpf_link_type {
  * offset to another bpf function
  */
 #define BPF_PSEUDO_CALL		1
+/* when bpf_call->src_reg == BPF_PSEUDO_KFUNC_CALL,
+ * bpf_call->imm == btf_id of a BTF_KIND_FUNC in the running kernel
+ */
+#define BPF_PSEUDO_KFUNC_CALL	2
 
 /* flags for BPF_MAP_UPDATE_ELEM command */
 enum {
diff --git a/tools/testing/selftests/bpf/verifier/calls.c b/tools/testing/selftests/bpf/verifier/calls.c
index eb888c8479c3..336a749673d1 100644
--- a/tools/testing/selftests/bpf/verifier/calls.c
+++ b/tools/testing/selftests/bpf/verifier/calls.c
@@ -19,7 +19,7 @@
 	BPF_MOV64_IMM(BPF_REG_0, 2),
 	BPF_EXIT_INSN(),
 	},
-	.errstr_unpriv = "function calls to other bpf functions are allowed for",
+	.errstr_unpriv = "loading/calling other bpf or kernel functions are allowed for",
 	.result_unpriv = REJECT,
 	.result = ACCEPT,
 	.retval = 1,
@@ -136,7 +136,7 @@
 {
 	"calls: wrong src reg",
 	.insns = {
-	BPF_RAW_INSN(BPF_JMP | BPF_CALL, 0, 2, 0, 0),
+	BPF_RAW_INSN(BPF_JMP | BPF_CALL, 0, 3, 0, 0),
 	BPF_MOV64_IMM(BPF_REG_0, 1),
 	BPF_EXIT_INSN(),
 	},
@@ -397,7 +397,7 @@
 	BPF_MOV64_IMM(BPF_REG_0, 1),
 	BPF_EXIT_INSN(),
 	},
-	.errstr_unpriv = "function calls to other bpf functions are allowed for",
+	.errstr_unpriv = "loading/calling other bpf or kernel functions are allowed for",
 	.fixup_map_hash_48b = { 3 },
 	.result_unpriv = REJECT,
 	.result = ACCEPT,
@@ -1977,7 +1977,7 @@
 	BPF_EXIT_INSN(),
 	},
 	.prog_type = BPF_PROG_TYPE_SOCKET_FILTER,
-	.errstr_unpriv = "function calls to other bpf functions are allowed for",
+	.errstr_unpriv = "loading/calling other bpf or kernel functions are allowed for",
 	.result_unpriv = REJECT,
 	.result = ACCEPT,
 },
@@ -2003,7 +2003,7 @@
 	BPF_EXIT_INSN(),
 	},
 	.prog_type = BPF_PROG_TYPE_SOCKET_FILTER,
-	.errstr_unpriv = "function calls to other bpf functions are allowed for",
+	.errstr_unpriv = "loading/calling other bpf or kernel functions are allowed for",
 	.errstr = "!read_ok",
 	.result = REJECT,
 },
@@ -2028,7 +2028,7 @@
 	BPF_EXIT_INSN(),
 	},
 	.prog_type = BPF_PROG_TYPE_SOCKET_FILTER,
-	.errstr_unpriv = "function calls to other bpf functions are allowed for",
+	.errstr_unpriv = "loading/calling other bpf or kernel functions are allowed for",
 	.errstr = "!read_ok",
 	.result = REJECT,
 },
diff --git a/tools/testing/selftests/bpf/verifier/dead_code.c b/tools/testing/selftests/bpf/verifier/dead_code.c
index 5cf361d8eb1c..17fe33a75034 100644
--- a/tools/testing/selftests/bpf/verifier/dead_code.c
+++ b/tools/testing/selftests/bpf/verifier/dead_code.c
@@ -85,7 +85,7 @@
 	BPF_MOV64_IMM(BPF_REG_0, 12),
 	BPF_EXIT_INSN(),
 	},
-	.errstr_unpriv = "function calls to other bpf functions are allowed for",
+	.errstr_unpriv = "loading/calling other bpf or kernel functions are allowed for",
 	.result_unpriv = REJECT,
 	.result = ACCEPT,
 	.retval = 7,
@@ -103,7 +103,7 @@
 	BPF_MOV64_IMM(BPF_REG_0, 12),
 	BPF_EXIT_INSN(),
 	},
-	.errstr_unpriv = "function calls to other bpf functions are allowed for",
+	.errstr_unpriv = "loading/calling other bpf or kernel functions are allowed for",
 	.result_unpriv = REJECT,
 	.result = ACCEPT,
 	.retval = 7,
@@ -121,7 +121,7 @@
 	BPF_RAW_INSN(BPF_JMP | BPF_CALL, 0, 1, 0, -5),
 	BPF_EXIT_INSN(),
 	},
-	.errstr_unpriv = "function calls to other bpf functions are allowed for",
+	.errstr_unpriv = "loading/calling other bpf or kernel functions are allowed for",
 	.result_unpriv = REJECT,
 	.result = ACCEPT,
 	.retval = 7,
@@ -137,7 +137,7 @@
 	BPF_MOV64_REG(BPF_REG_0, BPF_REG_1),
 	BPF_EXIT_INSN(),
 	},
-	.errstr_unpriv = "function calls to other bpf functions are allowed for",
+	.errstr_unpriv = "loading/calling other bpf or kernel functions are allowed for",
 	.result_unpriv = REJECT,
 	.result = ACCEPT,
 	.retval = 2,
@@ -152,7 +152,7 @@
 	BPF_MOV64_REG(BPF_REG_0, BPF_REG_1),
 	BPF_EXIT_INSN(),
 	},
-	.errstr_unpriv = "function calls to other bpf functions are allowed for",
+	.errstr_unpriv = "loading/calling other bpf or kernel functions are allowed for",
 	.result_unpriv = REJECT,
 	.result = ACCEPT,
 	.retval = 2,
-- 
cgit v1.2.3-71-gd317


From f5076c6ba02e8e24c61c40bbf48078929bc0fc79 Mon Sep 17 00:00:00 2001
From: Marc Kleine-Budde <mkl@pengutronix.de>
Date: Mon, 22 Mar 2021 10:44:54 +0100
Subject: can: uapi: can.h: mark union inside struct can_frame packed

In commit ea7800565a12 ("can: add optional DLC element to Classical
CAN frame structure") the struct can_frame::can_dlc was put into an
anonymous union with another u8 variable.

For various reasons some members in struct can_frame and canfd_frame
including the first 8 byes of data are expected to have the same
memory layout. This is enforced by a BUILD_BUG_ON check in af_can.c.

Since the above mentioned commit this check fails on ARM kernels
compiled with the ARM OABI (which means CONFIG_AEABI not set). In this
case -mabi=apcs-gnu is passed to the compiler, which leads to a
structure size boundary of 32, instead of 8 compared to CONFIG_AEABI
enabled. This means the the union in struct can_frame takes 4 bytes
instead of the expected 1.

Rong Chen illustrates the problem with pahole in the ARM OABI case:

| struct can_frame {
|          canid_t                    can_id;               /* 0     4 */
|          union {
|                  __u8               len;                  /* 4     1 */
|                  __u8               can_dlc;              /* 4     1 */
|          };                                               /* 4     4 */
|          __u8                       __pad;                /* 8     1 */
|          __u8                       __res0;               /* 9     1 */
|          __u8                       len8_dlc;             /* 10    1 */
|
|          /* XXX 5 bytes hole, try to pack */
|
|          __u8                       data[8]
| __attribute__((__aligned__(8))); /*    16     8 */
|
|          /* size: 24, cachelines: 1, members: 6 */
|          /* sum members: 19, holes: 1, sum holes: 5 */
|          /* forced alignments: 1, forced holes: 1, sum forced holes: 5 */
|          /* last cacheline: 24 bytes */
| } __attribute__((__aligned__(8)));

Marking the anonymous union as __attribute__((packed)) fixes the
BUILD_BUG_ON problem on these compilers.

Fixes: ea7800565a12 ("can: add optional DLC element to Classical CAN frame structure")
Reported-by: kernel test robot <lkp@intel.com>
Suggested-by: Rong Chen <rong.a.chen@intel.com>
Link: https://lore.kernel.org/linux-can/2c82ec23-3551-61b5-1bd8-178c3407ee83@hartkopp.net/
Link: https://lore.kernel.org/r/20210325125850.1620-3-socketcan@hartkopp.net
Signed-off-by: Oliver Hartkopp <socketcan@hartkopp.net>
Signed-off-by: Marc Kleine-Budde <mkl@pengutronix.de>
---
 include/uapi/linux/can.h | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

(limited to 'include/uapi/linux')

diff --git a/include/uapi/linux/can.h b/include/uapi/linux/can.h
index f75238ac6dce..c7535352fef6 100644
--- a/include/uapi/linux/can.h
+++ b/include/uapi/linux/can.h
@@ -113,7 +113,7 @@ struct can_frame {
 		 */
 		__u8 len;
 		__u8 can_dlc; /* deprecated */
-	};
+	} __attribute__((packed)); /* disable padding added in some ABIs */
 	__u8 __pad; /* padding */
 	__u8 __res0; /* reserved / padding */
 	__u8 len8_dlc; /* optional DLC for 8 byte payload length (9 .. 15) */
-- 
cgit v1.2.3-71-gd317


From 2b246b2569cd2ac6ff700d0dce56b8bae29b1842 Mon Sep 17 00:00:00 2001
From: Andreas Roeseler <andreas.a.roeseler@gmail.com>
Date: Mon, 29 Mar 2021 18:45:15 -0700
Subject: icmp: add support for RFC 8335 PROBE

Add definitions for PROBE ICMP types and codes.

Add AFI definitions for IP and IPV6 as specified by IANA

Add a struct to represent the additional header when probing by IP
address (ctype == 3) for use in parsing incoming PROBE messages

Add a struct to represent the entire Interface Identification Object
(IIO) section of an incoming PROBE packet

Signed-off-by: Andreas Roeseler <andreas.a.roeseler@gmail.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 include/uapi/linux/icmp.h | 42 ++++++++++++++++++++++++++++++++++++++++++
 1 file changed, 42 insertions(+)

(limited to 'include/uapi/linux')

diff --git a/include/uapi/linux/icmp.h b/include/uapi/linux/icmp.h
index fb169a50895e..222325d1d80e 100644
--- a/include/uapi/linux/icmp.h
+++ b/include/uapi/linux/icmp.h
@@ -20,6 +20,9 @@
 
 #include <linux/types.h>
 #include <asm/byteorder.h>
+#include <linux/in.h>
+#include <linux/if.h>
+#include <linux/in6.h>
 
 #define ICMP_ECHOREPLY		0	/* Echo Reply			*/
 #define ICMP_DEST_UNREACH	3	/* Destination Unreachable	*/
@@ -66,6 +69,23 @@
 #define ICMP_EXC_TTL		0	/* TTL count exceeded		*/
 #define ICMP_EXC_FRAGTIME	1	/* Fragment Reass time exceeded	*/
 
+/* Codes for EXT_ECHO (PROBE) */
+#define ICMP_EXT_ECHO		42
+#define ICMP_EXT_ECHOREPLY	43
+#define ICMP_EXT_MAL_QUERY	1	/* Malformed Query */
+#define ICMP_EXT_NO_IF		2	/* No such Interface */
+#define ICMP_EXT_NO_TABLE_ENT	3	/* No such Table Entry */
+#define ICMP_EXT_MULT_IFS	4	/* Multiple Interfaces Satisfy Query */
+
+/* Constants for EXT_ECHO (PROBE) */
+#define EXT_ECHOREPLY_ACTIVE	(1 << 2)/* active bit in reply message */
+#define EXT_ECHOREPLY_IPV4	(1 << 1)/* ipv4 bit in reply message */
+#define EXT_ECHOREPLY_IPV6	1	/* ipv6 bit in reply message */
+#define EXT_ECHO_CTYPE_NAME	1
+#define EXT_ECHO_CTYPE_INDEX	2
+#define EXT_ECHO_CTYPE_ADDR	3
+#define ICMP_AFI_IP		1	/* Address Family Identifier for ipv4 */
+#define ICMP_AFI_IP6		2	/* Address Family Identifier for ipv6 */
 
 struct icmphdr {
   __u8		type;
@@ -118,4 +138,26 @@ struct icmp_extobj_hdr {
 	__u8		class_type;
 };
 
+/* RFC 8335: 2.1 Header for c-type 3 payload */
+struct icmp_ext_echo_ctype3_hdr {
+	__be16		afi;
+	__u8		addrlen;
+	__u8		reserved;
+};
+
+/* RFC 8335: 2.1 Interface Identification Object */
+struct icmp_ext_echo_iio {
+	struct icmp_extobj_hdr extobj_hdr;
+	union {
+		char name[IFNAMSIZ];
+		__be32 ifindex;
+		struct {
+			struct icmp_ext_echo_ctype3_hdr ctype3_hdr;
+			union {
+				struct in_addr	ipv4_addr;
+				struct in6_addr	ipv6_addr;
+			} ip_addr;
+		} addr;
+	} ident;
+};
 #endif /* _UAPI_LINUX_ICMP_H */
-- 
cgit v1.2.3-71-gd317


From 750f4fc2a12f6632b5aa04526bf57fa06bfe8467 Mon Sep 17 00:00:00 2001
From: Andreas Roeseler <andreas.a.roeseler@gmail.com>
Date: Mon, 29 Mar 2021 18:45:21 -0700
Subject: ICMPV6: add support for RFC 8335 PROBE

Add definitions for the ICMPV6 type of Extended Echo Request and
Extended Echo Reply, as defined by sections 2 and 3 of RFC 8335.

Signed-off-by: Andreas Roeseler <andreas.a.roeseler@gmail.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 include/uapi/linux/icmpv6.h | 3 +++
 1 file changed, 3 insertions(+)

(limited to 'include/uapi/linux')

diff --git a/include/uapi/linux/icmpv6.h b/include/uapi/linux/icmpv6.h
index 0564fd7ccde4..ecaece3af38d 100644
--- a/include/uapi/linux/icmpv6.h
+++ b/include/uapi/linux/icmpv6.h
@@ -140,6 +140,9 @@ struct icmp6hdr {
 #define ICMPV6_UNK_OPTION		2
 #define ICMPV6_HDR_INCOMP		3
 
+/* Codes for EXT_ECHO (PROBE) */
+#define ICMPV6_EXT_ECHO_REQUEST		160
+#define ICMPV6_EXT_ECHO_REPLY		161
 /*
  *	constants for (set|get)sockopt
  */
-- 
cgit v1.2.3-71-gd317


From c0a744dcaa29e9537e8607ae9c965ad936124a4d Mon Sep 17 00:00:00 2001
From: "Gustavo A. R. Silva" <gustavoars@kernel.org>
Date: Tue, 23 Mar 2021 17:48:58 -0500
Subject: UAPI: nfsfh.h: Replace one-element array with flexible-array member
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

There is a regular need in the kernel to provide a way to declare having
a dynamically sized set of trailing elements in a structure. Kernel code
should always use “flexible array members”[1] for these cases. The older
style of one-element or zero-length arrays should no longer be used[2].

Use an anonymous union with a couple of anonymous structs in order to
keep userspace unchanged:

$ pahole -C nfs_fhbase_new fs/nfsd/nfsfh.o
struct nfs_fhbase_new {
        union {
                struct {
                        __u8       fb_version_aux;       /*     0     1 */
                        __u8       fb_auth_type_aux;     /*     1     1 */
                        __u8       fb_fsid_type_aux;     /*     2     1 */
                        __u8       fb_fileid_type_aux;   /*     3     1 */
                        __u32      fb_auth[1];           /*     4     4 */
                };                                       /*     0     8 */
                struct {
                        __u8       fb_version;           /*     0     1 */
                        __u8       fb_auth_type;         /*     1     1 */
                        __u8       fb_fsid_type;         /*     2     1 */
                        __u8       fb_fileid_type;       /*     3     1 */
                        __u32      fb_auth_flex[0];      /*     4     0 */
                };                                       /*     0     4 */
        };                                               /*     0     8 */

        /* size: 8, cachelines: 1, members: 1 */
        /* last cacheline: 8 bytes */
};

Also, this helps with the ongoing efforts to enable -Warray-bounds by
fixing the following warnings:

fs/nfsd/nfsfh.c: In function ‘nfsd_set_fh_dentry’:
fs/nfsd/nfsfh.c:191:41: warning: array subscript 1 is above array bounds of ‘__u32[1]’ {aka ‘unsigned int[1]’} [-Warray-bounds]
  191 |        ntohl((__force __be32)fh->fh_fsid[1])));
      |                              ~~~~~~~~~~~^~~
./include/linux/kdev_t.h:12:46: note: in definition of macro ‘MKDEV’
   12 | #define MKDEV(ma,mi) (((ma) << MINORBITS) | (mi))
      |                                              ^~
./include/uapi/linux/byteorder/little_endian.h:40:26: note: in expansion of macro ‘__swab32’
   40 | #define __be32_to_cpu(x) __swab32((__force __u32)(__be32)(x))
      |                          ^~~~~~~~
./include/linux/byteorder/generic.h:136:21: note: in expansion of macro ‘__be32_to_cpu’
  136 | #define ___ntohl(x) __be32_to_cpu(x)
      |                     ^~~~~~~~~~~~~
./include/linux/byteorder/generic.h:140:18: note: in expansion of macro ‘___ntohl’
  140 | #define ntohl(x) ___ntohl(x)
      |                  ^~~~~~~~
fs/nfsd/nfsfh.c:191:8: note: in expansion of macro ‘ntohl’
  191 |        ntohl((__force __be32)fh->fh_fsid[1])));
      |        ^~~~~
fs/nfsd/nfsfh.c:192:32: warning: array subscript 2 is above array bounds of ‘__u32[1]’ {aka ‘unsigned int[1]’} [-Warray-bounds]
  192 |    fh->fh_fsid[1] = fh->fh_fsid[2];
      |                     ~~~~~~~~~~~^~~
fs/nfsd/nfsfh.c:192:15: warning: array subscript 1 is above array bounds of ‘__u32[1]’ {aka ‘unsigned int[1]’} [-Warray-bounds]
  192 |    fh->fh_fsid[1] = fh->fh_fsid[2];
      |    ~~~~~~~~~~~^~~

[1] https://en.wikipedia.org/wiki/Flexible_array_member
[2] https://www.kernel.org/doc/html/v5.10/process/deprecated.html#zero-length-and-one-element-arrays

Link: https://github.com/KSPP/linux/issues/79
Link: https://github.com/KSPP/linux/issues/109
Signed-off-by: Gustavo A. R. Silva <gustavoars@kernel.org>
Signed-off-by: Chuck Lever <chuck.lever@oracle.com>
---
 include/uapi/linux/nfsd/nfsfh.h | 27 +++++++++++++++++++--------
 1 file changed, 19 insertions(+), 8 deletions(-)

(limited to 'include/uapi/linux')

diff --git a/include/uapi/linux/nfsd/nfsfh.h b/include/uapi/linux/nfsd/nfsfh.h
index ff0ca88b1c8f..427294dd56a1 100644
--- a/include/uapi/linux/nfsd/nfsfh.h
+++ b/include/uapi/linux/nfsd/nfsfh.h
@@ -64,13 +64,24 @@ struct nfs_fhbase_old {
  *   in include/linux/exportfs.h for currently registered values.
  */
 struct nfs_fhbase_new {
-	__u8		fb_version;	/* == 1, even => nfs_fhbase_old */
-	__u8		fb_auth_type;
-	__u8		fb_fsid_type;
-	__u8		fb_fileid_type;
-	__u32		fb_auth[1];
-/*	__u32		fb_fsid[0]; floating */
-/*	__u32		fb_fileid[0]; floating */
+	union {
+		struct {
+			__u8		fb_version_aux;	/* == 1, even => nfs_fhbase_old */
+			__u8		fb_auth_type_aux;
+			__u8		fb_fsid_type_aux;
+			__u8		fb_fileid_type_aux;
+			__u32		fb_auth[1];
+			/*	__u32		fb_fsid[0]; floating */
+			/*	__u32		fb_fileid[0]; floating */
+		};
+		struct {
+			__u8		fb_version;	/* == 1, even => nfs_fhbase_old */
+			__u8		fb_auth_type;
+			__u8		fb_fsid_type;
+			__u8		fb_fileid_type;
+			__u32		fb_auth_flex[]; /* flexible-array member */
+		};
+	};
 };
 
 struct knfsd_fh {
@@ -97,7 +108,7 @@ struct knfsd_fh {
 #define	fh_fsid_type		fh_base.fh_new.fb_fsid_type
 #define	fh_auth_type		fh_base.fh_new.fb_auth_type
 #define	fh_fileid_type		fh_base.fh_new.fb_fileid_type
-#define	fh_fsid			fh_base.fh_new.fb_auth
+#define	fh_fsid			fh_base.fh_new.fb_auth_flex
 
 /* Do not use, provided for userspace compatiblity. */
 #define	fh_auth			fh_base.fh_new.fb_auth
-- 
cgit v1.2.3-71-gd317


From 1e5d1f69d9fb8ea0679f9e85915e8e7fdacfbe7a Mon Sep 17 00:00:00 2001
From: Jakub Kicinski <kuba@kernel.org>
Date: Mon, 29 Mar 2021 20:59:52 -0700
Subject: ethtool: support FEC settings over netlink

Add FEC API to netlink.

This is not a 1-to-1 conversion.

FEC settings already depend on link modes to tell user which
modes are supported. Take this further an use link modes for
manual configuration. Old struct ethtool_fecparam is still
used to talk to the drivers, so we need to translate back
and forth. We can revisit the internal API if number of FEC
encodings starts to grow.

Enforce only one active FEC bit (by using a bit position
rather than another mask).

Signed-off-by: Jakub Kicinski <kuba@kernel.org>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 Documentation/networking/ethtool-netlink.rst |  62 ++++++-
 include/uapi/linux/ethtool_netlink.h         |  17 ++
 net/ethtool/Makefile                         |   2 +-
 net/ethtool/fec.c                            | 238 +++++++++++++++++++++++++++
 net/ethtool/netlink.c                        |  19 +++
 net/ethtool/netlink.h                        |   4 +
 6 files changed, 339 insertions(+), 3 deletions(-)
 create mode 100644 net/ethtool/fec.c

(limited to 'include/uapi/linux')

diff --git a/Documentation/networking/ethtool-netlink.rst b/Documentation/networking/ethtool-netlink.rst
index 05073482db05..4bdb4298f178 100644
--- a/Documentation/networking/ethtool-netlink.rst
+++ b/Documentation/networking/ethtool-netlink.rst
@@ -208,6 +208,8 @@ Userspace to kernel:
   ``ETHTOOL_MSG_CABLE_TEST_ACT``        action start cable test
   ``ETHTOOL_MSG_CABLE_TEST_TDR_ACT``    action start raw TDR cable test
   ``ETHTOOL_MSG_TUNNEL_INFO_GET``       get tunnel offload info
+  ``ETHTOOL_MSG_FEC_GET``               get FEC settings
+  ``ETHTOOL_MSG_FEC_SET``               set FEC settings
   ===================================== ================================
 
 Kernel to userspace:
@@ -242,6 +244,8 @@ Kernel to userspace:
   ``ETHTOOL_MSG_CABLE_TEST_NTF``        Cable test results
   ``ETHTOOL_MSG_CABLE_TEST_TDR_NTF``    Cable test TDR results
   ``ETHTOOL_MSG_TUNNEL_INFO_GET_REPLY`` tunnel offload info
+  ``ETHTOOL_MSG_FEC_GET_REPLY``         FEC settings
+  ``ETHTOOL_MSG_FEC_NTF``               FEC settings
   ===================================== =================================
 
 ``GET`` requests are sent by userspace applications to retrieve device
@@ -1280,6 +1284,60 @@ Kernel response contents:
 For UDP tunnel table empty ``ETHTOOL_A_TUNNEL_UDP_TABLE_TYPES`` indicates that
 the table contains static entries, hard-coded by the NIC.
 
+FEC_GET
+=======
+
+Gets FEC configuration and state like ``ETHTOOL_GFECPARAM`` ioctl request.
+
+Request contents:
+
+  =====================================  ======  ==========================
+  ``ETHTOOL_A_FEC_HEADER``               nested  request header
+  =====================================  ======  ==========================
+
+Kernel response contents:
+
+  =====================================  ======  ==========================
+  ``ETHTOOL_A_FEC_HEADER``               nested  request header
+  ``ETHTOOL_A_FEC_MODES``                bitset  configured modes
+  ``ETHTOOL_A_FEC_AUTO``                 bool    FEC mode auto selection
+  ``ETHTOOL_A_FEC_ACTIVE``               u32     index of active FEC mode
+  =====================================  ======  ==========================
+
+``ETHTOOL_A_FEC_ACTIVE`` is the bit index of the FEC link mode currently
+active on the interface. This attribute may not be present if device does
+not support FEC.
+
+``ETHTOOL_A_FEC_MODES`` and ``ETHTOOL_A_FEC_AUTO`` are only meaningful when
+autonegotiation is disabled. If ``ETHTOOL_A_FEC_AUTO`` is non-zero driver will
+select the FEC mode automatically based on the parameters of the SFP module.
+This is equivalent to the ``ETHTOOL_FEC_AUTO`` bit of the ioctl interface.
+``ETHTOOL_A_FEC_MODES`` carry the current FEC configuration using link mode
+bits (rather than old ``ETHTOOL_FEC_*`` bits).
+
+FEC_SET
+=======
+
+Sets FEC parameters like ``ETHTOOL_SFECPARAM`` ioctl request.
+
+Request contents:
+
+  =====================================  ======  ==========================
+  ``ETHTOOL_A_FEC_HEADER``               nested  request header
+  ``ETHTOOL_A_FEC_MODES``                bitset  configured modes
+  ``ETHTOOL_A_FEC_AUTO``                 bool    FEC mode auto selection
+  =====================================  ======  ==========================
+
+``FEC_SET`` is only meaningful when autonegotiation is disabled. Otherwise
+FEC mode is selected as part of autonegotiation.
+
+``ETHTOOL_A_FEC_MODES`` selects which FEC mode should be used. It's recommended
+to set only one bit, if multiple bits are set driver may choose between them
+in an implementation specific way.
+
+``ETHTOOL_A_FEC_AUTO`` requests the driver to choose FEC mode based on SFP
+module parameters. This does not mean autonegotiation.
+
 Request translation
 ===================
 
@@ -1373,8 +1431,8 @@ are netlink only.
                                       ``ETHTOOL_MSG_LINKMODES_SET``
   ``ETHTOOL_PHY_GTUNABLE``            n/a
   ``ETHTOOL_PHY_STUNABLE``            n/a
-  ``ETHTOOL_GFECPARAM``               n/a
-  ``ETHTOOL_SFECPARAM``               n/a
+  ``ETHTOOL_GFECPARAM``               ``ETHTOOL_MSG_FEC_GET``
+  ``ETHTOOL_SFECPARAM``               ``ETHTOOL_MSG_FEC_SET``
   n/a                                 ''ETHTOOL_MSG_CABLE_TEST_ACT''
   n/a                                 ''ETHTOOL_MSG_CABLE_TEST_TDR_ACT''
   n/a                                 ``ETHTOOL_MSG_TUNNEL_INFO_GET``
diff --git a/include/uapi/linux/ethtool_netlink.h b/include/uapi/linux/ethtool_netlink.h
index a286635ac9b8..7f1bdb5b31ba 100644
--- a/include/uapi/linux/ethtool_netlink.h
+++ b/include/uapi/linux/ethtool_netlink.h
@@ -42,6 +42,8 @@ enum {
 	ETHTOOL_MSG_CABLE_TEST_ACT,
 	ETHTOOL_MSG_CABLE_TEST_TDR_ACT,
 	ETHTOOL_MSG_TUNNEL_INFO_GET,
+	ETHTOOL_MSG_FEC_GET,
+	ETHTOOL_MSG_FEC_SET,
 
 	/* add new constants above here */
 	__ETHTOOL_MSG_USER_CNT,
@@ -80,6 +82,8 @@ enum {
 	ETHTOOL_MSG_CABLE_TEST_NTF,
 	ETHTOOL_MSG_CABLE_TEST_TDR_NTF,
 	ETHTOOL_MSG_TUNNEL_INFO_GET_REPLY,
+	ETHTOOL_MSG_FEC_GET_REPLY,
+	ETHTOOL_MSG_FEC_NTF,
 
 	/* add new constants above here */
 	__ETHTOOL_MSG_KERNEL_CNT,
@@ -629,6 +633,19 @@ enum {
 	ETHTOOL_A_TUNNEL_INFO_MAX = (__ETHTOOL_A_TUNNEL_INFO_CNT - 1)
 };
 
+/* FEC */
+
+enum {
+	ETHTOOL_A_FEC_UNSPEC,
+	ETHTOOL_A_FEC_HEADER,				/* nest - _A_HEADER_* */
+	ETHTOOL_A_FEC_MODES,				/* bitset */
+	ETHTOOL_A_FEC_AUTO,				/* u8 */
+	ETHTOOL_A_FEC_ACTIVE,				/* u32 */
+
+	__ETHTOOL_A_FEC_CNT,
+	ETHTOOL_A_FEC_MAX = (__ETHTOOL_A_FEC_CNT - 1)
+};
+
 /* generic netlink info */
 #define ETHTOOL_GENL_NAME "ethtool"
 #define ETHTOOL_GENL_VERSION 1
diff --git a/net/ethtool/Makefile b/net/ethtool/Makefile
index 7a849ff22dad..c2dc9033a8f7 100644
--- a/net/ethtool/Makefile
+++ b/net/ethtool/Makefile
@@ -7,4 +7,4 @@ obj-$(CONFIG_ETHTOOL_NETLINK)	+= ethtool_nl.o
 ethtool_nl-y	:= netlink.o bitset.o strset.o linkinfo.o linkmodes.o \
 		   linkstate.o debug.o wol.o features.o privflags.o rings.o \
 		   channels.o coalesce.o pause.o eee.o tsinfo.o cabletest.o \
-		   tunnels.o
+		   tunnels.o fec.o
diff --git a/net/ethtool/fec.c b/net/ethtool/fec.c
new file mode 100644
index 000000000000..31454b9188bd
--- /dev/null
+++ b/net/ethtool/fec.c
@@ -0,0 +1,238 @@
+// SPDX-License-Identifier: GPL-2.0-only
+
+#include "netlink.h"
+#include "common.h"
+#include "bitset.h"
+
+struct fec_req_info {
+	struct ethnl_req_info		base;
+};
+
+struct fec_reply_data {
+	struct ethnl_reply_data		base;
+	__ETHTOOL_DECLARE_LINK_MODE_MASK(fec_link_modes);
+	u32 active_fec;
+	u8 fec_auto;
+};
+
+#define FEC_REPDATA(__reply_base) \
+	container_of(__reply_base, struct fec_reply_data, base)
+
+#define ETHTOOL_FEC_MASK	((ETHTOOL_FEC_LLRS << 1) - 1)
+
+const struct nla_policy ethnl_fec_get_policy[ETHTOOL_A_FEC_HEADER + 1] = {
+	[ETHTOOL_A_FEC_HEADER]	= NLA_POLICY_NESTED(ethnl_header_policy),
+};
+
+static void
+ethtool_fec_to_link_modes(u32 fec, unsigned long *link_modes, u8 *fec_auto)
+{
+	if (fec_auto)
+		*fec_auto = !!(fec & ETHTOOL_FEC_AUTO);
+
+	if (fec & ETHTOOL_FEC_OFF)
+		__set_bit(ETHTOOL_LINK_MODE_FEC_NONE_BIT, link_modes);
+	if (fec & ETHTOOL_FEC_RS)
+		__set_bit(ETHTOOL_LINK_MODE_FEC_RS_BIT, link_modes);
+	if (fec & ETHTOOL_FEC_BASER)
+		__set_bit(ETHTOOL_LINK_MODE_FEC_BASER_BIT, link_modes);
+	if (fec & ETHTOOL_FEC_LLRS)
+		__set_bit(ETHTOOL_LINK_MODE_FEC_LLRS_BIT, link_modes);
+}
+
+static int
+ethtool_link_modes_to_fecparam(struct ethtool_fecparam *fec,
+			       unsigned long *link_modes, u8 fec_auto)
+{
+	memset(fec, 0, sizeof(*fec));
+
+	if (fec_auto)
+		fec->fec |= ETHTOOL_FEC_AUTO;
+
+	if (__test_and_clear_bit(ETHTOOL_LINK_MODE_FEC_NONE_BIT, link_modes))
+		fec->fec |= ETHTOOL_FEC_OFF;
+	if (__test_and_clear_bit(ETHTOOL_LINK_MODE_FEC_RS_BIT, link_modes))
+		fec->fec |= ETHTOOL_FEC_RS;
+	if (__test_and_clear_bit(ETHTOOL_LINK_MODE_FEC_BASER_BIT, link_modes))
+		fec->fec |= ETHTOOL_FEC_BASER;
+	if (__test_and_clear_bit(ETHTOOL_LINK_MODE_FEC_LLRS_BIT, link_modes))
+		fec->fec |= ETHTOOL_FEC_LLRS;
+
+	if (!bitmap_empty(link_modes, __ETHTOOL_LINK_MODE_MASK_NBITS))
+		return -EINVAL;
+
+	return 0;
+}
+
+static int fec_prepare_data(const struct ethnl_req_info *req_base,
+			    struct ethnl_reply_data *reply_base,
+			    struct genl_info *info)
+{
+	__ETHTOOL_DECLARE_LINK_MODE_MASK(active_fec_modes) = {};
+	struct fec_reply_data *data = FEC_REPDATA(reply_base);
+	struct net_device *dev = reply_base->dev;
+	struct ethtool_fecparam fec = {};
+	int ret;
+
+	if (!dev->ethtool_ops->get_fecparam)
+		return -EOPNOTSUPP;
+	ret = ethnl_ops_begin(dev);
+	if (ret < 0)
+		return ret;
+	ret = dev->ethtool_ops->get_fecparam(dev, &fec);
+	ethnl_ops_complete(dev);
+	if (ret)
+		return ret;
+
+	WARN_ON_ONCE(fec.reserved);
+
+	ethtool_fec_to_link_modes(fec.fec, data->fec_link_modes,
+				  &data->fec_auto);
+
+	ethtool_fec_to_link_modes(fec.active_fec, active_fec_modes, NULL);
+	data->active_fec = find_first_bit(active_fec_modes,
+					  __ETHTOOL_LINK_MODE_MASK_NBITS);
+	/* Don't report attr if no FEC mode set. Note that
+	 * ethtool_fecparam_to_link_modes() ignores NONE and AUTO.
+	 */
+	if (data->active_fec == __ETHTOOL_LINK_MODE_MASK_NBITS)
+		data->active_fec = 0;
+
+	return 0;
+}
+
+static int fec_reply_size(const struct ethnl_req_info *req_base,
+			  const struct ethnl_reply_data *reply_base)
+{
+	bool compact = req_base->flags & ETHTOOL_FLAG_COMPACT_BITSETS;
+	const struct fec_reply_data *data = FEC_REPDATA(reply_base);
+	int len = 0;
+	int ret;
+
+	ret = ethnl_bitset_size(data->fec_link_modes, NULL,
+				__ETHTOOL_LINK_MODE_MASK_NBITS,
+				link_mode_names, compact);
+	if (ret < 0)
+		return ret;
+	len += ret;
+
+	len += nla_total_size(sizeof(u8)) +	/* _FEC_AUTO */
+	       nla_total_size(sizeof(u32));	/* _FEC_ACTIVE */
+
+	return len;
+}
+
+static int fec_fill_reply(struct sk_buff *skb,
+			  const struct ethnl_req_info *req_base,
+			  const struct ethnl_reply_data *reply_base)
+{
+	bool compact = req_base->flags & ETHTOOL_FLAG_COMPACT_BITSETS;
+	const struct fec_reply_data *data = FEC_REPDATA(reply_base);
+	int ret;
+
+	ret = ethnl_put_bitset(skb, ETHTOOL_A_FEC_MODES,
+			       data->fec_link_modes, NULL,
+			       __ETHTOOL_LINK_MODE_MASK_NBITS,
+			       link_mode_names, compact);
+	if (ret < 0)
+		return ret;
+
+	if (nla_put_u8(skb, ETHTOOL_A_FEC_AUTO, data->fec_auto) ||
+	    (data->active_fec &&
+	     nla_put_u32(skb, ETHTOOL_A_FEC_ACTIVE, data->active_fec)))
+		return -EMSGSIZE;
+
+	return 0;
+}
+
+const struct ethnl_request_ops ethnl_fec_request_ops = {
+	.request_cmd		= ETHTOOL_MSG_FEC_GET,
+	.reply_cmd		= ETHTOOL_MSG_FEC_GET_REPLY,
+	.hdr_attr		= ETHTOOL_A_FEC_HEADER,
+	.req_info_size		= sizeof(struct fec_req_info),
+	.reply_data_size	= sizeof(struct fec_reply_data),
+
+	.prepare_data		= fec_prepare_data,
+	.reply_size		= fec_reply_size,
+	.fill_reply		= fec_fill_reply,
+};
+
+/* FEC_SET */
+
+const struct nla_policy ethnl_fec_set_policy[ETHTOOL_A_FEC_AUTO + 1] = {
+	[ETHTOOL_A_FEC_HEADER]	= NLA_POLICY_NESTED(ethnl_header_policy),
+	[ETHTOOL_A_FEC_MODES]	= { .type = NLA_NESTED },
+	[ETHTOOL_A_FEC_AUTO]	= NLA_POLICY_MAX(NLA_U8, 1),
+};
+
+int ethnl_set_fec(struct sk_buff *skb, struct genl_info *info)
+{
+	__ETHTOOL_DECLARE_LINK_MODE_MASK(fec_link_modes) = {};
+	struct ethnl_req_info req_info = {};
+	struct nlattr **tb = info->attrs;
+	struct ethtool_fecparam fec = {};
+	const struct ethtool_ops *ops;
+	struct net_device *dev;
+	bool mod = false;
+	u8 fec_auto;
+	int ret;
+
+	ret = ethnl_parse_header_dev_get(&req_info, tb[ETHTOOL_A_FEC_HEADER],
+					 genl_info_net(info), info->extack,
+					 true);
+	if (ret < 0)
+		return ret;
+	dev = req_info.dev;
+	ops = dev->ethtool_ops;
+	ret = -EOPNOTSUPP;
+	if (!ops->get_fecparam || !ops->set_fecparam)
+		goto out_dev;
+
+	rtnl_lock();
+	ret = ethnl_ops_begin(dev);
+	if (ret < 0)
+		goto out_rtnl;
+	ret = ops->get_fecparam(dev, &fec);
+	if (ret < 0)
+		goto out_ops;
+
+	ethtool_fec_to_link_modes(fec.fec, fec_link_modes, &fec_auto);
+
+	ret = ethnl_update_bitset(fec_link_modes,
+				  __ETHTOOL_LINK_MODE_MASK_NBITS,
+				  tb[ETHTOOL_A_FEC_MODES],
+				  link_mode_names, info->extack, &mod);
+	if (ret < 0)
+		goto out_ops;
+	ethnl_update_u8(&fec_auto, tb[ETHTOOL_A_FEC_AUTO], &mod);
+
+	ret = 0;
+	if (!mod)
+		goto out_ops;
+
+	ret = ethtool_link_modes_to_fecparam(&fec, fec_link_modes, fec_auto);
+	if (ret) {
+		NL_SET_ERR_MSG_ATTR(info->extack, tb[ETHTOOL_A_FEC_MODES],
+				    "invalid FEC modes requested");
+		goto out_ops;
+	}
+	if (!fec.fec) {
+		ret = -EINVAL;
+		NL_SET_ERR_MSG_ATTR(info->extack, tb[ETHTOOL_A_FEC_MODES],
+				    "no FEC modes set");
+		goto out_ops;
+	}
+
+	ret = dev->ethtool_ops->set_fecparam(dev, &fec);
+	if (ret < 0)
+		goto out_ops;
+	ethtool_notify(dev, ETHTOOL_MSG_FEC_NTF, NULL);
+
+out_ops:
+	ethnl_ops_complete(dev);
+out_rtnl:
+	rtnl_unlock();
+out_dev:
+	dev_put(dev);
+	return ret;
+}
diff --git a/net/ethtool/netlink.c b/net/ethtool/netlink.c
index 50d3c8896f91..705a4b201564 100644
--- a/net/ethtool/netlink.c
+++ b/net/ethtool/netlink.c
@@ -244,6 +244,7 @@ ethnl_default_requests[__ETHTOOL_MSG_USER_CNT] = {
 	[ETHTOOL_MSG_COALESCE_GET]	= &ethnl_coalesce_request_ops,
 	[ETHTOOL_MSG_PAUSE_GET]		= &ethnl_pause_request_ops,
 	[ETHTOOL_MSG_EEE_GET]		= &ethnl_eee_request_ops,
+	[ETHTOOL_MSG_FEC_GET]		= &ethnl_fec_request_ops,
 	[ETHTOOL_MSG_TSINFO_GET]	= &ethnl_tsinfo_request_ops,
 };
 
@@ -551,6 +552,7 @@ ethnl_default_notify_ops[ETHTOOL_MSG_KERNEL_MAX + 1] = {
 	[ETHTOOL_MSG_COALESCE_NTF]	= &ethnl_coalesce_request_ops,
 	[ETHTOOL_MSG_PAUSE_NTF]		= &ethnl_pause_request_ops,
 	[ETHTOOL_MSG_EEE_NTF]		= &ethnl_eee_request_ops,
+	[ETHTOOL_MSG_FEC_NTF]		= &ethnl_fec_request_ops,
 };
 
 /* default notification handler */
@@ -643,6 +645,7 @@ static const ethnl_notify_handler_t ethnl_notify_handlers[] = {
 	[ETHTOOL_MSG_COALESCE_NTF]	= ethnl_default_notify,
 	[ETHTOOL_MSG_PAUSE_NTF]		= ethnl_default_notify,
 	[ETHTOOL_MSG_EEE_NTF]		= ethnl_default_notify,
+	[ETHTOOL_MSG_FEC_NTF]		= ethnl_default_notify,
 };
 
 void ethtool_notify(struct net_device *dev, unsigned int cmd, const void *data)
@@ -912,6 +915,22 @@ static const struct genl_ops ethtool_genl_ops[] = {
 		.policy = ethnl_tunnel_info_get_policy,
 		.maxattr = ARRAY_SIZE(ethnl_tunnel_info_get_policy) - 1,
 	},
+	{
+		.cmd	= ETHTOOL_MSG_FEC_GET,
+		.doit	= ethnl_default_doit,
+		.start	= ethnl_default_start,
+		.dumpit	= ethnl_default_dumpit,
+		.done	= ethnl_default_done,
+		.policy = ethnl_fec_get_policy,
+		.maxattr = ARRAY_SIZE(ethnl_fec_get_policy) - 1,
+	},
+	{
+		.cmd	= ETHTOOL_MSG_FEC_SET,
+		.flags	= GENL_UNS_ADMIN_PERM,
+		.doit	= ethnl_set_fec,
+		.policy = ethnl_fec_set_policy,
+		.maxattr = ARRAY_SIZE(ethnl_fec_set_policy) - 1,
+	},
 };
 
 static const struct genl_multicast_group ethtool_nl_mcgrps[] = {
diff --git a/net/ethtool/netlink.h b/net/ethtool/netlink.h
index 6eabd58d81bf..785f7ee45930 100644
--- a/net/ethtool/netlink.h
+++ b/net/ethtool/netlink.h
@@ -344,6 +344,7 @@ extern const struct ethnl_request_ops ethnl_coalesce_request_ops;
 extern const struct ethnl_request_ops ethnl_pause_request_ops;
 extern const struct ethnl_request_ops ethnl_eee_request_ops;
 extern const struct ethnl_request_ops ethnl_tsinfo_request_ops;
+extern const struct ethnl_request_ops ethnl_fec_request_ops;
 
 extern const struct nla_policy ethnl_header_policy[ETHTOOL_A_HEADER_FLAGS + 1];
 extern const struct nla_policy ethnl_header_policy_stats[ETHTOOL_A_HEADER_FLAGS + 1];
@@ -375,6 +376,8 @@ extern const struct nla_policy ethnl_tsinfo_get_policy[ETHTOOL_A_TSINFO_HEADER +
 extern const struct nla_policy ethnl_cable_test_act_policy[ETHTOOL_A_CABLE_TEST_HEADER + 1];
 extern const struct nla_policy ethnl_cable_test_tdr_act_policy[ETHTOOL_A_CABLE_TEST_TDR_CFG + 1];
 extern const struct nla_policy ethnl_tunnel_info_get_policy[ETHTOOL_A_TUNNEL_INFO_HEADER + 1];
+extern const struct nla_policy ethnl_fec_get_policy[ETHTOOL_A_FEC_HEADER + 1];
+extern const struct nla_policy ethnl_fec_set_policy[ETHTOOL_A_FEC_AUTO + 1];
 
 int ethnl_set_linkinfo(struct sk_buff *skb, struct genl_info *info);
 int ethnl_set_linkmodes(struct sk_buff *skb, struct genl_info *info);
@@ -392,5 +395,6 @@ int ethnl_act_cable_test_tdr(struct sk_buff *skb, struct genl_info *info);
 int ethnl_tunnel_info_doit(struct sk_buff *skb, struct genl_info *info);
 int ethnl_tunnel_info_start(struct netlink_callback *cb);
 int ethnl_tunnel_info_dumpit(struct sk_buff *skb, struct netlink_callback *cb);
+int ethnl_set_fec(struct sk_buff *skb, struct genl_info *info);
 
 #endif /* _NET_ETHTOOL_NETLINK_H */
-- 
cgit v1.2.3-71-gd317


From a7ba4558e69a3c2ae4ca521f015832ef44799538 Mon Sep 17 00:00:00 2001
From: Cong Wang <cong.wang@bytedance.com>
Date: Tue, 30 Mar 2021 19:32:30 -0700
Subject: sock_map: Introduce BPF_SK_SKB_VERDICT

Reusing BPF_SK_SKB_STREAM_VERDICT is possible but its name is
confusing and more importantly we still want to distinguish them
from user-space. So we can just reuse the stream verdict code but
introduce a new type of eBPF program, skb_verdict. Users are not
allowed to attach stream_verdict and skb_verdict programs to the
same map.

Signed-off-by: Cong Wang <cong.wang@bytedance.com>
Signed-off-by: Alexei Starovoitov <ast@kernel.org>
Acked-by: John Fastabend <john.fastabend@gmail.com>
Link: https://lore.kernel.org/bpf/20210331023237.41094-10-xiyou.wangcong@gmail.com
---
 include/linux/skmsg.h          |  2 ++
 include/uapi/linux/bpf.h       |  1 +
 kernel/bpf/syscall.c           |  1 +
 net/core/skmsg.c               |  4 +++-
 net/core/sock_map.c            | 28 ++++++++++++++++++++++++++++
 tools/bpf/bpftool/common.c     |  1 +
 tools/bpf/bpftool/prog.c       |  1 +
 tools/include/uapi/linux/bpf.h |  1 +
 8 files changed, 38 insertions(+), 1 deletion(-)

(limited to 'include/uapi/linux')

diff --git a/include/linux/skmsg.h b/include/linux/skmsg.h
index e7aba150539d..c83dbc2d81d9 100644
--- a/include/linux/skmsg.h
+++ b/include/linux/skmsg.h
@@ -58,6 +58,7 @@ struct sk_psock_progs {
 	struct bpf_prog			*msg_parser;
 	struct bpf_prog			*stream_parser;
 	struct bpf_prog			*stream_verdict;
+	struct bpf_prog			*skb_verdict;
 };
 
 enum sk_psock_state_bits {
@@ -487,6 +488,7 @@ static inline void psock_progs_drop(struct sk_psock_progs *progs)
 	psock_set_prog(&progs->msg_parser, NULL);
 	psock_set_prog(&progs->stream_parser, NULL);
 	psock_set_prog(&progs->stream_verdict, NULL);
+	psock_set_prog(&progs->skb_verdict, NULL);
 }
 
 int sk_psock_tls_strp_read(struct sk_psock *psock, struct sk_buff *skb);
diff --git a/include/uapi/linux/bpf.h b/include/uapi/linux/bpf.h
index 598716742593..49371eba98ba 100644
--- a/include/uapi/linux/bpf.h
+++ b/include/uapi/linux/bpf.h
@@ -957,6 +957,7 @@ enum bpf_attach_type {
 	BPF_XDP_CPUMAP,
 	BPF_SK_LOOKUP,
 	BPF_XDP,
+	BPF_SK_SKB_VERDICT,
 	__MAX_BPF_ATTACH_TYPE
 };
 
diff --git a/kernel/bpf/syscall.c b/kernel/bpf/syscall.c
index 9603de81811a..6428634da57e 100644
--- a/kernel/bpf/syscall.c
+++ b/kernel/bpf/syscall.c
@@ -2948,6 +2948,7 @@ attach_type_to_prog_type(enum bpf_attach_type attach_type)
 		return BPF_PROG_TYPE_SK_MSG;
 	case BPF_SK_SKB_STREAM_PARSER:
 	case BPF_SK_SKB_STREAM_VERDICT:
+	case BPF_SK_SKB_VERDICT:
 		return BPF_PROG_TYPE_SK_SKB;
 	case BPF_LIRC_MODE2:
 		return BPF_PROG_TYPE_LIRC_MODE2;
diff --git a/net/core/skmsg.c b/net/core/skmsg.c
index 656eceab73bc..a045812d7c78 100644
--- a/net/core/skmsg.c
+++ b/net/core/skmsg.c
@@ -697,7 +697,7 @@ void sk_psock_drop(struct sock *sk, struct sk_psock *psock)
 	rcu_assign_sk_user_data(sk, NULL);
 	if (psock->progs.stream_parser)
 		sk_psock_stop_strp(sk, psock);
-	else if (psock->progs.stream_verdict)
+	else if (psock->progs.stream_verdict || psock->progs.skb_verdict)
 		sk_psock_stop_verdict(sk, psock);
 	write_unlock_bh(&sk->sk_callback_lock);
 
@@ -1024,6 +1024,8 @@ static int sk_psock_verdict_recv(read_descriptor_t *desc, struct sk_buff *skb,
 	}
 	skb_set_owner_r(skb, sk);
 	prog = READ_ONCE(psock->progs.stream_verdict);
+	if (!prog)
+		prog = READ_ONCE(psock->progs.skb_verdict);
 	if (likely(prog)) {
 		skb_dst_drop(skb);
 		skb_bpf_redirect_clear(skb);
diff --git a/net/core/sock_map.c b/net/core/sock_map.c
index 42d797291d34..c2a0411e08a8 100644
--- a/net/core/sock_map.c
+++ b/net/core/sock_map.c
@@ -156,6 +156,8 @@ static void sock_map_del_link(struct sock *sk,
 				strp_stop = true;
 			if (psock->saved_data_ready && stab->progs.stream_verdict)
 				verdict_stop = true;
+			if (psock->saved_data_ready && stab->progs.skb_verdict)
+				verdict_stop = true;
 			list_del(&link->list);
 			sk_psock_free_link(link);
 		}
@@ -232,6 +234,7 @@ static int sock_map_link(struct bpf_map *map, struct sock *sk)
 	struct sk_psock_progs *progs = sock_map_progs(map);
 	struct bpf_prog *stream_verdict = NULL;
 	struct bpf_prog *stream_parser = NULL;
+	struct bpf_prog *skb_verdict = NULL;
 	struct bpf_prog *msg_parser = NULL;
 	struct sk_psock *psock;
 	int ret;
@@ -268,6 +271,15 @@ static int sock_map_link(struct bpf_map *map, struct sock *sk)
 		}
 	}
 
+	skb_verdict = READ_ONCE(progs->skb_verdict);
+	if (skb_verdict) {
+		skb_verdict = bpf_prog_inc_not_zero(skb_verdict);
+		if (IS_ERR(skb_verdict)) {
+			ret = PTR_ERR(skb_verdict);
+			goto out_put_msg_parser;
+		}
+	}
+
 no_progs:
 	psock = sock_map_psock_get_checked(sk);
 	if (IS_ERR(psock)) {
@@ -278,6 +290,9 @@ no_progs:
 	if (psock) {
 		if ((msg_parser && READ_ONCE(psock->progs.msg_parser)) ||
 		    (stream_parser  && READ_ONCE(psock->progs.stream_parser)) ||
+		    (skb_verdict && READ_ONCE(psock->progs.skb_verdict)) ||
+		    (skb_verdict && READ_ONCE(psock->progs.stream_verdict)) ||
+		    (stream_verdict && READ_ONCE(psock->progs.skb_verdict)) ||
 		    (stream_verdict && READ_ONCE(psock->progs.stream_verdict))) {
 			sk_psock_put(sk, psock);
 			ret = -EBUSY;
@@ -309,6 +324,9 @@ no_progs:
 	} else if (!stream_parser && stream_verdict && !psock->saved_data_ready) {
 		psock_set_prog(&psock->progs.stream_verdict, stream_verdict);
 		sk_psock_start_verdict(sk,psock);
+	} else if (!stream_verdict && skb_verdict && !psock->saved_data_ready) {
+		psock_set_prog(&psock->progs.skb_verdict, skb_verdict);
+		sk_psock_start_verdict(sk, psock);
 	}
 	write_unlock_bh(&sk->sk_callback_lock);
 	return 0;
@@ -317,6 +335,9 @@ out_unlock_drop:
 out_drop:
 	sk_psock_put(sk, psock);
 out_progs:
+	if (skb_verdict)
+		bpf_prog_put(skb_verdict);
+out_put_msg_parser:
 	if (msg_parser)
 		bpf_prog_put(msg_parser);
 out_put_stream_parser:
@@ -1442,8 +1463,15 @@ static int sock_map_prog_update(struct bpf_map *map, struct bpf_prog *prog,
 		break;
 #endif
 	case BPF_SK_SKB_STREAM_VERDICT:
+		if (progs->skb_verdict)
+			return -EBUSY;
 		pprog = &progs->stream_verdict;
 		break;
+	case BPF_SK_SKB_VERDICT:
+		if (progs->stream_verdict)
+			return -EBUSY;
+		pprog = &progs->skb_verdict;
+		break;
 	default:
 		return -EOPNOTSUPP;
 	}
diff --git a/tools/bpf/bpftool/common.c b/tools/bpf/bpftool/common.c
index 65303664417e..1828bba19020 100644
--- a/tools/bpf/bpftool/common.c
+++ b/tools/bpf/bpftool/common.c
@@ -57,6 +57,7 @@ const char * const attach_type_name[__MAX_BPF_ATTACH_TYPE] = {
 
 	[BPF_SK_SKB_STREAM_PARSER]	= "sk_skb_stream_parser",
 	[BPF_SK_SKB_STREAM_VERDICT]	= "sk_skb_stream_verdict",
+	[BPF_SK_SKB_VERDICT]		= "sk_skb_verdict",
 	[BPF_SK_MSG_VERDICT]		= "sk_msg_verdict",
 	[BPF_LIRC_MODE2]		= "lirc_mode2",
 	[BPF_FLOW_DISSECTOR]		= "flow_dissector",
diff --git a/tools/bpf/bpftool/prog.c b/tools/bpf/bpftool/prog.c
index f2b915b20546..3f067d2d7584 100644
--- a/tools/bpf/bpftool/prog.c
+++ b/tools/bpf/bpftool/prog.c
@@ -76,6 +76,7 @@ enum dump_mode {
 static const char * const attach_type_strings[] = {
 	[BPF_SK_SKB_STREAM_PARSER] = "stream_parser",
 	[BPF_SK_SKB_STREAM_VERDICT] = "stream_verdict",
+	[BPF_SK_SKB_VERDICT] = "skb_verdict",
 	[BPF_SK_MSG_VERDICT] = "msg_verdict",
 	[BPF_FLOW_DISSECTOR] = "flow_dissector",
 	[__MAX_BPF_ATTACH_TYPE] = NULL,
diff --git a/tools/include/uapi/linux/bpf.h b/tools/include/uapi/linux/bpf.h
index ab9f2233607c..69902603012c 100644
--- a/tools/include/uapi/linux/bpf.h
+++ b/tools/include/uapi/linux/bpf.h
@@ -957,6 +957,7 @@ enum bpf_attach_type {
 	BPF_XDP_CPUMAP,
 	BPF_SK_LOOKUP,
 	BPF_XDP,
+	BPF_SK_SKB_VERDICT,
 	__MAX_BPF_ATTACH_TYPE
 };
 
-- 
cgit v1.2.3-71-gd317


From b9c6cdc37ee1fe5866d3b1c10efb9d03191a76af Mon Sep 17 00:00:00 2001
From: Christoph Hellwig <hch@lst.de>
Date: Fri, 2 Apr 2021 19:17:31 +0200
Subject: block: update a few comments in uapi/linux/blkpg.h

The big top of the file comment talk about grand plans that never
happened, so remove them to not confuse the readers.  Also mark the
devname and volname fields as ignored as they were never used by the
kernel.

Signed-off-by: Christoph Hellwig <hch@lst.de>
Signed-off-by: Jens Axboe <axboe@kernel.dk>
---
 include/uapi/linux/blkpg.h | 28 ++--------------------------
 1 file changed, 2 insertions(+), 26 deletions(-)

(limited to 'include/uapi/linux')

diff --git a/include/uapi/linux/blkpg.h b/include/uapi/linux/blkpg.h
index ac6474e4f29d..d0a64ee97c6d 100644
--- a/include/uapi/linux/blkpg.h
+++ b/include/uapi/linux/blkpg.h
@@ -2,29 +2,6 @@
 #ifndef _UAPI__LINUX_BLKPG_H
 #define _UAPI__LINUX_BLKPG_H
 
-/*
- * Partition table and disk geometry handling
- *
- * A single ioctl with lots of subfunctions:
- *
- * Device number stuff:
- *    get_whole_disk()		(given the device number of a partition,
- *                               find the device number of the encompassing disk)
- *    get_all_partitions()	(given the device number of a disk, return the
- *				 device numbers of all its known partitions)
- *
- * Partition stuff:
- *    add_partition()
- *    delete_partition()
- *    test_partition_in_use()	(also for test_disk_in_use)
- *
- * Geometry stuff:
- *    get_geometry()
- *    set_geometry()
- *    get_bios_drivedata()
- *
- * For today, only the partition stuff - aeb, 990515
- */
 #include <linux/compiler.h>
 #include <linux/ioctl.h>
 
@@ -52,9 +29,8 @@ struct blkpg_partition {
 	long long start;		/* starting offset in bytes */
 	long long length;		/* length in bytes */
 	int pno;			/* partition number */
-	char devname[BLKPG_DEVNAMELTH];	/* partition name, like sda5 or c0d1p2,
-					   to be used in kernel messages */
-	char volname[BLKPG_VOLNAMELTH];	/* volume label */
+	char devname[BLKPG_DEVNAMELTH];	/* unused / ignored */
+	char volname[BLKPG_VOLNAMELTH];	/* unused / ignore */
 };
 
 #endif /* _UAPI__LINUX_BLKPG_H */
-- 
cgit v1.2.3-71-gd317


From dc87efdb1a5cd46134a9d490480160e303bc6eef Mon Sep 17 00:00:00 2001
From: Florian Westphal <fw@strlen.de>
Date: Thu, 1 Apr 2021 16:19:44 -0700
Subject: mptcp: add mptcp reset option support

The MPTCP reset option allows to carry a mptcp-specific error code that
provides more information on the nature of a connection reset.

Reset option data received gets stored in the subflow context so it can
be sent to userspace via the 'subflow closed' netlink event.

When a subflow is closed, the desired error code that should be sent to
the peer is also placed in the subflow context structure.

If a reset is sent before subflow establishment could complete, e.g. on
HMAC failure during an MP_JOIN operation, the mptcp skb extension is
used to store the reset information.

Signed-off-by: Florian Westphal <fw@strlen.de>
Signed-off-by: Mat Martineau <mathew.j.martineau@linux.intel.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 include/net/mptcp.h        | 18 +++++++++++--
 include/uapi/linux/mptcp.h | 11 ++++++++
 net/ipv4/tcp_ipv4.c        | 21 ++++++++++++---
 net/ipv6/tcp_ipv6.c        | 14 +++++++++-
 net/mptcp/options.c        | 67 ++++++++++++++++++++++++++++++++++++++++++----
 net/mptcp/pm_netlink.c     | 12 +++++++++
 net/mptcp/protocol.c       | 12 ++++++---
 net/mptcp/protocol.h       | 14 +++++++++-
 net/mptcp/subflow.c        | 30 ++++++++++++++++++---
 9 files changed, 180 insertions(+), 19 deletions(-)

(limited to 'include/uapi/linux')

diff --git a/include/net/mptcp.h b/include/net/mptcp.h
index cea69c801595..16fe34d139c3 100644
--- a/include/net/mptcp.h
+++ b/include/net/mptcp.h
@@ -30,8 +30,8 @@ struct mptcp_ext {
 			ack64:1,
 			mpc_map:1,
 			frozen:1,
-			__unused:1;
-	/* one byte hole */
+			reset_transient:1;
+	u8		reset_reason:4;
 };
 
 #define MPTCP_RM_IDS_MAX	8
@@ -58,6 +58,8 @@ struct mptcp_out_options {
 	struct mptcp_rm_list rm_list;
 	u8 join_id;
 	u8 backup;
+	u8 reset_reason:4;
+	u8 reset_transient:1;
 	u32 nonce;
 	u64 thmac;
 	u32 token;
@@ -156,6 +158,16 @@ void mptcp_seq_show(struct seq_file *seq);
 int mptcp_subflow_init_cookie_req(struct request_sock *req,
 				  const struct sock *sk_listener,
 				  struct sk_buff *skb);
+
+__be32 mptcp_get_reset_option(const struct sk_buff *skb);
+
+static inline __be32 mptcp_reset_option(const struct sk_buff *skb)
+{
+	if (skb_ext_exist(skb, SKB_EXT_MPTCP))
+		return mptcp_get_reset_option(skb);
+
+	return htonl(0u);
+}
 #else
 
 static inline void mptcp_init(void)
@@ -236,6 +248,8 @@ static inline int mptcp_subflow_init_cookie_req(struct request_sock *req,
 {
 	return 0; /* TCP fallback */
 }
+
+static inline __be32 mptcp_reset_option(const struct sk_buff *skb)  { return htonl(0u); }
 #endif /* CONFIG_MPTCP */
 
 #if IS_ENABLED(CONFIG_MPTCP_IPV6)
diff --git a/include/uapi/linux/mptcp.h b/include/uapi/linux/mptcp.h
index e1172c1ffdfd..8eb3c0844bff 100644
--- a/include/uapi/linux/mptcp.h
+++ b/include/uapi/linux/mptcp.h
@@ -174,10 +174,21 @@ enum mptcp_event_attr {
 	MPTCP_ATTR_FLAGS,	/* u16 */
 	MPTCP_ATTR_TIMEOUT,	/* u32 */
 	MPTCP_ATTR_IF_IDX,	/* s32 */
+	MPTCP_ATTR_RESET_REASON,/* u32 */
+	MPTCP_ATTR_RESET_FLAGS, /* u32 */
 
 	__MPTCP_ATTR_AFTER_LAST
 };
 
 #define MPTCP_ATTR_MAX (__MPTCP_ATTR_AFTER_LAST - 1)
 
+/* MPTCP Reset reason codes, rfc8684 */
+#define MPTCP_RST_EUNSPEC	0
+#define MPTCP_RST_EMPTCP	1
+#define MPTCP_RST_ERESOURCE	2
+#define MPTCP_RST_EPROHIBIT	3
+#define MPTCP_RST_EWQ2BIG	4
+#define MPTCP_RST_EBADPERF	5
+#define MPTCP_RST_EMIDDLEBOX	6
+
 #endif /* _UAPI_MPTCP_H */
diff --git a/net/ipv4/tcp_ipv4.c b/net/ipv4/tcp_ipv4.c
index dfc6d1c0e710..312184cead57 100644
--- a/net/ipv4/tcp_ipv4.c
+++ b/net/ipv4/tcp_ipv4.c
@@ -655,14 +655,18 @@ EXPORT_SYMBOL(tcp_v4_send_check);
  *	Exception: precedence violation. We do not implement it in any case.
  */
 
+#ifdef CONFIG_TCP_MD5SIG
+#define OPTION_BYTES TCPOLEN_MD5SIG_ALIGNED
+#else
+#define OPTION_BYTES sizeof(__be32)
+#endif
+
 static void tcp_v4_send_reset(const struct sock *sk, struct sk_buff *skb)
 {
 	const struct tcphdr *th = tcp_hdr(skb);
 	struct {
 		struct tcphdr th;
-#ifdef CONFIG_TCP_MD5SIG
-		__be32 opt[(TCPOLEN_MD5SIG_ALIGNED >> 2)];
-#endif
+		__be32 opt[OPTION_BYTES / sizeof(__be32)];
 	} rep;
 	struct ip_reply_arg arg;
 #ifdef CONFIG_TCP_MD5SIG
@@ -770,6 +774,17 @@ static void tcp_v4_send_reset(const struct sock *sk, struct sk_buff *skb)
 				     ip_hdr(skb)->daddr, &rep.th);
 	}
 #endif
+	/* Can't co-exist with TCPMD5, hence check rep.opt[0] */
+	if (rep.opt[0] == 0) {
+		__be32 mrst = mptcp_reset_option(skb);
+
+		if (mrst) {
+			rep.opt[0] = mrst;
+			arg.iov[0].iov_len += sizeof(mrst);
+			rep.th.doff = arg.iov[0].iov_len / 4;
+		}
+	}
+
 	arg.csum = csum_tcpudp_nofold(ip_hdr(skb)->daddr,
 				      ip_hdr(skb)->saddr, /* XXX */
 				      arg.iov[0].iov_len, IPPROTO_TCP, 0);
diff --git a/net/ipv6/tcp_ipv6.c b/net/ipv6/tcp_ipv6.c
index bff22d6ef516..5f47c0b6e3de 100644
--- a/net/ipv6/tcp_ipv6.c
+++ b/net/ipv6/tcp_ipv6.c
@@ -879,8 +879,8 @@ static void tcp_v6_send_response(const struct sock *sk, struct sk_buff *skb, u32
 	struct net *net = sk ? sock_net(sk) : dev_net(skb_dst(skb)->dev);
 	struct sock *ctl_sk = net->ipv6.tcp_sk;
 	unsigned int tot_len = sizeof(struct tcphdr);
+	__be32 mrst = 0, *topt;
 	struct dst_entry *dst;
-	__be32 *topt;
 	__u32 mark = 0;
 
 	if (tsecr)
@@ -890,6 +890,15 @@ static void tcp_v6_send_response(const struct sock *sk, struct sk_buff *skb, u32
 		tot_len += TCPOLEN_MD5SIG_ALIGNED;
 #endif
 
+#ifdef CONFIG_MPTCP
+	if (rst && !key) {
+		mrst = mptcp_reset_option(skb);
+
+		if (mrst)
+			tot_len += sizeof(__be32);
+	}
+#endif
+
 	buff = alloc_skb(MAX_HEADER + sizeof(struct ipv6hdr) + tot_len,
 			 GFP_ATOMIC);
 	if (!buff)
@@ -920,6 +929,9 @@ static void tcp_v6_send_response(const struct sock *sk, struct sk_buff *skb, u32
 		*topt++ = htonl(tsecr);
 	}
 
+	if (mrst)
+		*topt++ = mrst;
+
 #ifdef CONFIG_TCP_MD5SIG
 	if (key) {
 		*topt++ = htonl((TCPOPT_NOP << 24) | (TCPOPT_NOP << 16) |
diff --git a/net/mptcp/options.c b/net/mptcp/options.c
index 68361d28dc67..4b7119eb2c31 100644
--- a/net/mptcp/options.c
+++ b/net/mptcp/options.c
@@ -305,6 +305,18 @@ static void mptcp_parse_option(const struct sk_buff *skb,
 		mp_opt->fastclose = 1;
 		break;
 
+	case MPTCPOPT_RST:
+		if (opsize != TCPOLEN_MPTCP_RST)
+			break;
+
+		if (!(TCP_SKB_CB(skb)->tcp_flags & TCPHDR_RST))
+			break;
+		mp_opt->reset = 1;
+		flags = *ptr++;
+		mp_opt->reset_transient = flags & MPTCP_RST_TRANSIENT;
+		mp_opt->reset_reason = *ptr;
+		break;
+
 	default:
 		break;
 	}
@@ -327,6 +339,7 @@ void mptcp_get_options(const struct sk_buff *skb,
 	mp_opt->rm_addr = 0;
 	mp_opt->dss = 0;
 	mp_opt->mp_prio = 0;
+	mp_opt->reset = 0;
 
 	length = (th->doff * 4) - sizeof(struct tcphdr);
 	ptr = (const unsigned char *)(th + 1);
@@ -726,6 +739,22 @@ static bool mptcp_established_options_mp_prio(struct sock *sk,
 	return true;
 }
 
+static noinline void mptcp_established_options_rst(struct sock *sk, struct sk_buff *skb,
+						   unsigned int *size,
+						   unsigned int remaining,
+						   struct mptcp_out_options *opts)
+{
+	const struct mptcp_subflow_context *subflow = mptcp_subflow_ctx(sk);
+
+	if (remaining < TCPOLEN_MPTCP_RST)
+		return;
+
+	*size = TCPOLEN_MPTCP_RST;
+	opts->suboptions |= OPTION_MPTCP_RST;
+	opts->reset_transient = subflow->reset_transient;
+	opts->reset_reason = subflow->reset_reason;
+}
+
 bool mptcp_established_options(struct sock *sk, struct sk_buff *skb,
 			       unsigned int *size, unsigned int remaining,
 			       struct mptcp_out_options *opts)
@@ -741,11 +770,10 @@ bool mptcp_established_options(struct sock *sk, struct sk_buff *skb,
 	if (unlikely(__mptcp_check_fallback(msk)))
 		return false;
 
-	/* prevent adding of any MPTCP related options on reset packet
-	 * until we support MP_TCPRST/MP_FASTCLOSE
-	 */
-	if (unlikely(skb && TCP_SKB_CB(skb)->tcp_flags & TCPHDR_RST))
-		return false;
+	if (unlikely(skb && TCP_SKB_CB(skb)->tcp_flags & TCPHDR_RST)) {
+		mptcp_established_options_rst(sk, skb, size, remaining, opts);
+		return true;
+	}
 
 	snd_data_fin = mptcp_data_fin_enabled(msk);
 	if (mptcp_established_options_mp(sk, skb, snd_data_fin, &opt_size, remaining, opts))
@@ -1062,6 +1090,12 @@ void mptcp_incoming_options(struct sock *sk, struct sk_buff *skb)
 		mp_opt.mp_prio = 0;
 	}
 
+	if (mp_opt.reset) {
+		subflow->reset_seen = 1;
+		subflow->reset_reason = mp_opt.reset_reason;
+		subflow->reset_transient = mp_opt.reset_transient;
+	}
+
 	if (!mp_opt.dss)
 		return;
 
@@ -1289,6 +1323,12 @@ mp_capable_done:
 		ptr += 5;
 	}
 
+	if (OPTION_MPTCP_RST & opts->suboptions)
+		*ptr++ = mptcp_option(MPTCPOPT_RST,
+				      TCPOLEN_MPTCP_RST,
+				      opts->reset_transient,
+				      opts->reset_reason);
+
 	if (opts->ext_copy.use_ack || opts->ext_copy.use_map) {
 		struct mptcp_ext *mpext = &opts->ext_copy;
 		u8 len = TCPOLEN_MPTCP_DSS_BASE;
@@ -1340,3 +1380,20 @@ mp_capable_done:
 	if (tp)
 		mptcp_set_rwin(tp);
 }
+
+__be32 mptcp_get_reset_option(const struct sk_buff *skb)
+{
+	const struct mptcp_ext *ext = mptcp_get_ext(skb);
+	u8 flags, reason;
+
+	if (ext) {
+		flags = ext->reset_transient;
+		reason = ext->reset_reason;
+
+		return mptcp_option(MPTCPOPT_RST, TCPOLEN_MPTCP_RST,
+				    flags, reason);
+	}
+
+	return htonl(0u);
+}
+EXPORT_SYMBOL_GPL(mptcp_get_reset_option);
diff --git a/net/mptcp/pm_netlink.c b/net/mptcp/pm_netlink.c
index cadafafa1049..51be6c34b339 100644
--- a/net/mptcp/pm_netlink.c
+++ b/net/mptcp/pm_netlink.c
@@ -1687,9 +1687,21 @@ static int mptcp_event_sub_closed(struct sk_buff *skb,
 				  const struct mptcp_sock *msk,
 				  const struct sock *ssk)
 {
+	const struct mptcp_subflow_context *sf;
+
 	if (mptcp_event_put_token_and_ssk(skb, msk, ssk))
 		return -EMSGSIZE;
 
+	sf = mptcp_subflow_ctx(ssk);
+	if (!sf->reset_seen)
+		return 0;
+
+	if (nla_put_u32(skb, MPTCP_ATTR_RESET_REASON, sf->reset_reason))
+		return -EMSGSIZE;
+
+	if (nla_put_u32(skb, MPTCP_ATTR_RESET_FLAGS, sf->reset_transient))
+		return -EMSGSIZE;
+
 	return 0;
 }
 
diff --git a/net/mptcp/protocol.c b/net/mptcp/protocol.c
index 531ee24aa827..e894345d10c1 100644
--- a/net/mptcp/protocol.c
+++ b/net/mptcp/protocol.c
@@ -3090,14 +3090,18 @@ bool mptcp_finish_join(struct sock *ssk)
 	pr_debug("msk=%p, subflow=%p", msk, subflow);
 
 	/* mptcp socket already closing? */
-	if (!mptcp_is_fully_established(parent))
+	if (!mptcp_is_fully_established(parent)) {
+		subflow->reset_reason = MPTCP_RST_EMPTCP;
 		return false;
+	}
 
 	if (!msk->pm.server_side)
 		goto out;
 
-	if (!mptcp_pm_allow_new_subflow(msk))
+	if (!mptcp_pm_allow_new_subflow(msk)) {
+		subflow->reset_reason = MPTCP_RST_EPROHIBIT;
 		return false;
+	}
 
 	/* active connections are already on conn_list, and we can't acquire
 	 * msk lock here.
@@ -3111,8 +3115,10 @@ bool mptcp_finish_join(struct sock *ssk)
 		sock_hold(ssk);
 	}
 	spin_unlock_bh(&msk->join_list_lock);
-	if (!ret)
+	if (!ret) {
+		subflow->reset_reason = MPTCP_RST_EPROHIBIT;
 		return false;
+	}
 
 	/* attach to msk socket only after we are sure he will deal with us
 	 * at close time
diff --git a/net/mptcp/protocol.h b/net/mptcp/protocol.h
index e8c5ff2b8ace..40e9b05856cd 100644
--- a/net/mptcp/protocol.h
+++ b/net/mptcp/protocol.h
@@ -26,6 +26,7 @@
 #define OPTION_MPTCP_RM_ADDR	BIT(8)
 #define OPTION_MPTCP_FASTCLOSE	BIT(9)
 #define OPTION_MPTCP_PRIO	BIT(10)
+#define OPTION_MPTCP_RST	BIT(11)
 
 /* MPTCP option subtypes */
 #define MPTCPOPT_MP_CAPABLE	0
@@ -36,6 +37,7 @@
 #define MPTCPOPT_MP_PRIO	5
 #define MPTCPOPT_MP_FAIL	6
 #define MPTCPOPT_MP_FASTCLOSE	7
+#define MPTCPOPT_RST		8
 
 /* MPTCP suboption lengths */
 #define TCPOLEN_MPTCP_MPC_SYN		4
@@ -65,6 +67,7 @@
 #define TCPOLEN_MPTCP_PRIO		3
 #define TCPOLEN_MPTCP_PRIO_ALIGN	4
 #define TCPOLEN_MPTCP_FASTCLOSE		12
+#define TCPOLEN_MPTCP_RST		4
 
 /* MPTCP MP_JOIN flags */
 #define MPTCPOPT_BACKUP		BIT(0)
@@ -94,6 +97,9 @@
 /* MPTCP MP_PRIO flags */
 #define MPTCP_PRIO_BKUP		BIT(0)
 
+/* MPTCP TCPRST flags */
+#define MPTCP_RST_TRANSIENT	BIT(0)
+
 /* MPTCP socket flags */
 #define MPTCP_DATA_READY	0
 #define MPTCP_NOSPACE		1
@@ -123,6 +129,7 @@ struct mptcp_options_received {
 	u16	mp_capable : 1,
 		mp_join : 1,
 		fastclose : 1,
+		reset : 1,
 		dss : 1,
 		add_addr : 1,
 		rm_addr : 1,
@@ -152,6 +159,8 @@ struct mptcp_options_received {
 	};
 	u64	ahmac;
 	u16	port;
+	u8	reset_reason:4;
+	u8	reset_transient:1;
 };
 
 static inline __be32 mptcp_option(u8 subopt, u8 len, u8 nib, u8 field)
@@ -422,6 +431,9 @@ struct mptcp_subflow_context {
 	u8	hmac[MPTCPOPT_HMAC_LEN];
 	u8	local_id;
 	u8	remote_id;
+	u8	reset_seen:1;
+	u8	reset_transient:1;
+	u8	reset_reason:4;
 
 	long	delegated_status;
 	struct	list_head delegated_node;   /* link into delegated_action, protected by local BH */
@@ -742,7 +754,7 @@ unsigned int mptcp_pm_get_add_addr_accept_max(struct mptcp_sock *msk);
 unsigned int mptcp_pm_get_subflows_max(struct mptcp_sock *msk);
 unsigned int mptcp_pm_get_local_addr_max(struct mptcp_sock *msk);
 
-static inline struct mptcp_ext *mptcp_get_ext(struct sk_buff *skb)
+static inline struct mptcp_ext *mptcp_get_ext(const struct sk_buff *skb)
 {
 	return (struct mptcp_ext *)skb_ext_find(skb, SKB_EXT_MPTCP);
 }
diff --git a/net/mptcp/subflow.c b/net/mptcp/subflow.c
index 7a5f50d00d4b..223d6be5fc3b 100644
--- a/net/mptcp/subflow.c
+++ b/net/mptcp/subflow.c
@@ -115,6 +115,16 @@ static bool subflow_use_different_sport(struct mptcp_sock *msk, const struct soc
 	return inet_sk(sk)->inet_sport != inet_sk((struct sock *)msk)->inet_sport;
 }
 
+static void subflow_add_reset_reason(struct sk_buff *skb, u8 reason)
+{
+	struct mptcp_ext *mpext = skb_ext_add(skb, SKB_EXT_MPTCP);
+
+	if (mpext) {
+		memset(mpext, 0, sizeof(*mpext));
+		mpext->reset_reason = reason;
+	}
+}
+
 /* Init mptcp request socket.
  *
  * Returns an error code if a JOIN has failed and a TCP reset
@@ -190,8 +200,10 @@ again:
 		subflow_req->msk = subflow_token_join_request(req);
 
 		/* Can't fall back to TCP in this case. */
-		if (!subflow_req->msk)
+		if (!subflow_req->msk) {
+			subflow_add_reset_reason(skb, MPTCP_RST_EMPTCP);
 			return -EPERM;
+		}
 
 		if (subflow_use_different_sport(subflow_req->msk, sk_listener)) {
 			pr_debug("syn inet_sport=%d %d",
@@ -400,8 +412,10 @@ static void subflow_finish_connect(struct sock *sk, const struct sk_buff *skb)
 	} else if (subflow->request_join) {
 		u8 hmac[SHA256_DIGEST_SIZE];
 
-		if (!mp_opt.mp_join)
+		if (!mp_opt.mp_join) {
+			subflow->reset_reason = MPTCP_RST_EMPTCP;
 			goto do_reset;
+		}
 
 		subflow->thmac = mp_opt.thmac;
 		subflow->remote_nonce = mp_opt.nonce;
@@ -410,6 +424,7 @@ static void subflow_finish_connect(struct sock *sk, const struct sk_buff *skb)
 
 		if (!subflow_thmac_valid(subflow)) {
 			MPTCP_INC_STATS(sock_net(sk), MPTCP_MIB_JOINACKMAC);
+			subflow->reset_reason = MPTCP_RST_EMPTCP;
 			goto do_reset;
 		}
 
@@ -438,6 +453,7 @@ fallback:
 	return;
 
 do_reset:
+	subflow->reset_transient = 0;
 	mptcp_subflow_reset(sk);
 }
 
@@ -654,8 +670,10 @@ create_child:
 		 * to reset the context to non MPTCP status.
 		 */
 		if (!ctx || fallback) {
-			if (fallback_is_fatal)
+			if (fallback_is_fatal) {
+				subflow_add_reset_reason(skb, MPTCP_RST_EMPTCP);
 				goto dispose_child;
+			}
 
 			subflow_drop_ctx(child);
 			goto out;
@@ -690,8 +708,10 @@ create_child:
 			struct mptcp_sock *owner;
 
 			owner = subflow_req->msk;
-			if (!owner)
+			if (!owner) {
+				subflow_add_reset_reason(skb, MPTCP_RST_EPROHIBIT);
 				goto dispose_child;
+			}
 
 			/* move the msk reference ownership to the subflow */
 			subflow_req->msk = NULL;
@@ -1056,6 +1076,8 @@ fatal:
 	smp_wmb();
 	ssk->sk_error_report(ssk);
 	tcp_set_state(ssk, TCP_CLOSE);
+	subflow->reset_transient = 0;
+	subflow->reset_reason = MPTCP_RST_EMPTCP;
 	tcp_send_active_reset(ssk, GFP_ATOMIC);
 	subflow->data_avail = 0;
 	return false;
-- 
cgit v1.2.3-71-gd317


From 547b60988e631f74ed025cf1ec50cfc17f49fd13 Mon Sep 17 00:00:00 2001
From: Suzuki K Poulose <suzuki.poulose@arm.com>
Date: Mon, 5 Apr 2021 17:42:48 +0100
Subject: perf: aux: Add flags for the buffer format

Allocate a byte for advertising the PMU specific format type
of the given AUX record. A PMU could end up providing hardware
trace data in multiple format in a single session.

e.g, The format of hardware buffer produced by CoreSight ETM
PMU depends on the type of the "sink" device used for collection
for an event (Traditional TMC-ETR/Bs with formatting or
TRBEs without any formatting).

 # Boring story of why this is needed. Goto The_End_of_Story for skipping.

CoreSight ETM trace allows instruction level tracing of Arm CPUs.
The ETM generates the CPU excecution trace and pumps it into CoreSight
AMBA Trace Bus and is collected by a different CoreSight component
(traditionally CoreSight TMC-ETR /ETB/ETF), called "sink".
Important to note that there is no guarantee that every CPU has
a dedicated sink.  Thus multiple ETMs could pump the trace data
into the same "sink" and thus they apply additional formatting
of the trace data for the user to decode it properly and attribute
the trace data to the corresponding ETM.

However, with the introduction of Arm Trace buffer Extensions (TRBE),
we now have a dedicated per-CPU architected sink for collecting the
trace. Since the TRBE is always per-CPU, it doesn't apply any formatting
of the trace. The support for this driver is under review [1].

Now a system could have a per-cpu TRBE and one or more shared
TMC-ETRs on the system. A user could choose a "specific" sink
for a perf session (e.g, a TMC-ETR) or the driver could automatically
select the nearest sink for a given ETM. It is possible that
some ETMs could end up using TMC-ETR (e.g, if the TRBE is not
usable on the CPU) while the others using TRBE in a single
perf session. Thus we now have "formatted" trace collected
from TMC-ETR and "unformatted" trace collected from TRBE.
However, we don't get into a situation where a single event
could end up using TMC-ETR & TRBE. i.e, any AUX buffer is
guaranteed to be either RAW or FORMATTED, but not a mix
of both.

As for perf decoding, we need to know the type of the data
in the individual AUX buffers, so that it can set up the
"OpenCSD" (library for decoding CoreSight trace) decoder
instance appropriately. Thus the perf.data file must conatin
the hints for the tool to decode the data correctly.

Since this is a runtime variable, and perf tool doesn't have
a control on what sink gets used (in case of automatic sink
selection), we need this information made available from
the PMU driver for each AUX record.

 # The_End_of_Story

Cc: Peter Ziljstra <peterz@infradead.org>
Cc: alexander.shishkin@linux.intel.com
Cc: mingo@redhat.com
Cc: will@kernel.org
Cc: mark.rutland@arm.com
Cc: mike.leach@linaro.org
Cc: acme@kernel.org
Cc: jolsa@redhat.com
Cc: Mathieu Poirier <mathieu.poirer@linaro.org>
Reviewed by: Mike Leach <mike.leach@linaro.org>
Acked-by: Peter Ziljstra <peterz@infradead.org>
Signed-off-by: Suzuki K Poulose <suzuki.poulose@arm.com>
Link: https://lore.kernel.org/r/20210405164307.1720226-2-suzuki.poulose@arm.com
Signed-off-by: Mathieu Poirier <mathieu.poirier@linaro.org>
---
 include/uapi/linux/perf_event.h | 9 +++++----
 1 file changed, 5 insertions(+), 4 deletions(-)

(limited to 'include/uapi/linux')

diff --git a/include/uapi/linux/perf_event.h b/include/uapi/linux/perf_event.h
index ad15e40d7f5d..f006eeab6f0e 100644
--- a/include/uapi/linux/perf_event.h
+++ b/include/uapi/linux/perf_event.h
@@ -1156,10 +1156,11 @@ enum perf_callchain_context {
 /**
  * PERF_RECORD_AUX::flags bits
  */
-#define PERF_AUX_FLAG_TRUNCATED		0x01	/* record was truncated to fit */
-#define PERF_AUX_FLAG_OVERWRITE		0x02	/* snapshot from overwrite mode */
-#define PERF_AUX_FLAG_PARTIAL		0x04	/* record contains gaps */
-#define PERF_AUX_FLAG_COLLISION		0x08	/* sample collided with another */
+#define PERF_AUX_FLAG_TRUNCATED			0x01	/* record was truncated to fit */
+#define PERF_AUX_FLAG_OVERWRITE			0x02	/* snapshot from overwrite mode */
+#define PERF_AUX_FLAG_PARTIAL			0x04	/* record contains gaps */
+#define PERF_AUX_FLAG_COLLISION			0x08	/* sample collided with another */
+#define PERF_AUX_FLAG_PMU_FORMAT_TYPE_MASK	0xff00	/* PMU specific trace format type */
 
 #define PERF_FLAG_FD_NO_GROUP		(1UL << 0)
 #define PERF_FLAG_FD_OUTPUT		(1UL << 1)
-- 
cgit v1.2.3-71-gd317


From 7dde51767ca5339ed33109056d92fdca05d56d8d Mon Sep 17 00:00:00 2001
From: Suzuki K Poulose <suzuki.poulose@arm.com>
Date: Mon, 5 Apr 2021 17:42:49 +0100
Subject: perf: aux: Add CoreSight PMU buffer formats

CoreSight PMU supports aux-buffer for the ETM tracing. The trace
generated by the ETM (associated with individual CPUs, like Intel PT)
is captured by a separate IP (CoreSight TMC-ETR/ETF until now).

The TMC-ETR applies formatting of the raw ETM trace data, as it
can collect traces from multiple ETMs, with the TraceID to indicate
the source of a given trace packet.

Arm Trace Buffer Extension is new "sink" IP, attached to individual
CPUs and thus do not provide additional formatting, like TMC-ETR.

Additionally, a system could have both TRBE *and* TMC-ETR for
the trace collection. e.g, TMC-ETR could be used as a single
trace buffer to collect data from multiple ETMs to correlate
the traces from different CPUs. It is possible to have a
perf session where some events end up collecting the trace
in TMC-ETR while the others in TRBE. Thus we need a way
to identify the type of the trace for each AUX record.

Define the trace formats exported by the CoreSight PMU.
We don't define the flags following the "ETM" as this
information is available to the user when issuing
the session. What is missing is the additional
formatting applied by the "sink" which is decided
at the runtime and the user may not have a control on.

So we define :
 - CORESIGHT format (indicates the Frame format)
 - RAW format (indicates the format of the source)

The default value is CORESIGHT format for all the records
(i,e == 0). Add the RAW format for others that use
raw format.

Cc: Peter Zijlstra <peterz@infradead.org>
Cc: Mike Leach <mike.leach@linaro.org>
Cc: Mathieu Poirier <mathieu.poirier@linaro.org>
Cc: Leo Yan <leo.yan@linaro.org>
Cc: Anshuman Khandual <anshuman.khandual@arm.com>
Reviewed-by: Mike Leach <mike.leach@linaro.org>
Signed-off-by: Suzuki K Poulose <suzuki.poulose@arm.com>
Link: https://lore.kernel.org/r/20210405164307.1720226-3-suzuki.poulose@arm.com
Signed-off-by: Mathieu Poirier <mathieu.poirier@linaro.org>
---
 include/uapi/linux/perf_event.h | 4 ++++
 1 file changed, 4 insertions(+)

(limited to 'include/uapi/linux')

diff --git a/include/uapi/linux/perf_event.h b/include/uapi/linux/perf_event.h
index f006eeab6f0e..63971eaef127 100644
--- a/include/uapi/linux/perf_event.h
+++ b/include/uapi/linux/perf_event.h
@@ -1162,6 +1162,10 @@ enum perf_callchain_context {
 #define PERF_AUX_FLAG_COLLISION			0x08	/* sample collided with another */
 #define PERF_AUX_FLAG_PMU_FORMAT_TYPE_MASK	0xff00	/* PMU specific trace format type */
 
+/* CoreSight PMU AUX buffer formats */
+#define PERF_AUX_FLAG_CORESIGHT_FORMAT_CORESIGHT	0x0000 /* Default for backward compatibility */
+#define PERF_AUX_FLAG_CORESIGHT_FORMAT_RAW		0x0100 /* Raw format of the source */
+
 #define PERF_FLAG_FD_NO_GROUP		(1UL << 0)
 #define PERF_FLAG_FD_OUTPUT		(1UL << 1)
 #define PERF_FLAG_PID_CGROUP		(1UL << 2) /* pid=cgroup id, per-cpu mode only */
-- 
cgit v1.2.3-71-gd317


From b98fc6b6c0cc72ee8eda406a13967f91a135e834 Mon Sep 17 00:00:00 2001
From: Dafna Hirschfeld <dafna.hirschfeld@collabora.com>
Date: Tue, 23 Mar 2021 10:22:11 +0100
Subject: media: rkisp1: uapi: document which flags/structs relate to
 statistics config

Modify the documentation to point out which flags and structs are
used to configure the statistics.

Signed-off-by: Dafna Hirschfeld <dafna.hirschfeld@collabora.com>
Reviewed-by: Sebastian Fricke <sebastian.fricke@posteo.net>
Signed-off-by: Hans Verkuil <hverkuil-cisco@xs4all.nl>
Signed-off-by: Mauro Carvalho Chehab <mchehab+huawei@kernel.org>
---
 include/uapi/linux/rkisp1-config.h | 16 ++++++++--------
 1 file changed, 8 insertions(+), 8 deletions(-)

(limited to 'include/uapi/linux')

diff --git a/include/uapi/linux/rkisp1-config.h b/include/uapi/linux/rkisp1-config.h
index 36e3efb81b01..583ca0d9a79d 100644
--- a/include/uapi/linux/rkisp1-config.h
+++ b/include/uapi/linux/rkisp1-config.h
@@ -15,7 +15,7 @@
 #define RKISP1_CIF_ISP_MODULE_BLS		(1U << 1)
 /* Sensor De-gamma */
 #define RKISP1_CIF_ISP_MODULE_SDG		(1U << 2)
-/* Histogram */
+/* Histogram statistics configuration */
 #define RKISP1_CIF_ISP_MODULE_HST		(1U << 3)
 /* Lens Shade Control */
 #define RKISP1_CIF_ISP_MODULE_LSC		(1U << 4)
@@ -31,13 +31,13 @@
 #define RKISP1_CIF_ISP_MODULE_GOC		(1U << 9)
 /* Color Processing */
 #define RKISP1_CIF_ISP_MODULE_CPROC		(1U << 10)
-/* Auto Focus Control */
+/* Auto Focus Control statistics configuration */
 #define RKISP1_CIF_ISP_MODULE_AFC		(1U << 11)
-/* Auto White Balancing */
+/* Auto White Balancing statistics configuration */
 #define RKISP1_CIF_ISP_MODULE_AWB		(1U << 12)
 /* Image Effect */
 #define RKISP1_CIF_ISP_MODULE_IE		(1U << 13)
-/* Auto Exposure Control */
+/* Auto Exposure Control statistics configuration */
 #define RKISP1_CIF_ISP_MODULE_AEC		(1U << 14)
 /* Wide Dynamic Range */
 #define RKISP1_CIF_ISP_MODULE_WDR		(1U << 15)
@@ -411,7 +411,7 @@ struct rkisp1_cif_isp_cproc_config {
 };
 
 /**
- * struct rkisp1_cif_isp_awb_meas_config - Configuration used by auto white balance
+ * struct rkisp1_cif_isp_awb_meas_config - Configuration for the AWB statistics
  *
  * @awb_mode: the awb meas mode. From enum rkisp1_cif_isp_awb_mode_type.
  * @awb_wnd: white balance measurement window (in pixels)
@@ -550,7 +550,7 @@ struct rkisp1_cif_isp_goc_config {
 };
 
 /**
- * struct rkisp1_cif_isp_hst_config - Configuration used by Histogram
+ * struct rkisp1_cif_isp_hst_config - Configuration for Histogram statistics
  *
  * @mode: histogram mode (from enum rkisp1_cif_isp_histogram_mode)
  * @histogram_predivider: process every stepsize pixel, all other pixels are
@@ -575,7 +575,7 @@ struct rkisp1_cif_isp_hst_config {
 };
 
 /**
- * struct rkisp1_cif_isp_aec_config - Configuration used by Auto Exposure Control
+ * struct rkisp1_cif_isp_aec_config - Configuration for Auto Exposure statistics
  *
  * @mode: Exposure measure mode (from enum rkisp1_cif_isp_exp_meas_mode)
  * @autostop: stop mode (from enum rkisp1_cif_isp_exp_ctrl_autostop)
@@ -588,7 +588,7 @@ struct rkisp1_cif_isp_aec_config {
 };
 
 /**
- * struct rkisp1_cif_isp_afc_config - Configuration used by Auto Focus Control
+ * struct rkisp1_cif_isp_afc_config - Configuration for the Auto Focus statistics
  *
  * @num_afm_win: max RKISP1_CIF_ISP_AFM_MAX_WINDOWS
  * @afm_win: coordinates of the meas window
-- 
cgit v1.2.3-71-gd317


From f15c54cf3f684cd1a65f6ebc55ee9ada533ec6ef Mon Sep 17 00:00:00 2001
From: Dikshita Agarwal <dikshita@codeaurora.org>
Date: Wed, 24 Mar 2021 10:54:31 +0100
Subject: media: v4l2-ctrl: add controls for long term reference.

Long Term Reference (LTR) frames are the frames that are encoded
sometime in the past and stored in the DPB buffer list to be used
as reference to encode future frames.
This change adds controls to enable this feature.

Signed-off-by: Dikshita Agarwal <dikshita@codeaurora.org>
Reviewed-by: Hans Verkuil <hverkuil-cisco@xs4all.nl>
Signed-off-by: Stanimir Varbanov <stanimir.varbanov@linaro.org>
Signed-off-by: Mauro Carvalho Chehab <mchehab+huawei@kernel.org>
---
 .../userspace-api/media/v4l/ext-ctrls-codec.rst        | 18 ++++++++++++++++++
 drivers/media/v4l2-core/v4l2-ctrls.c                   | 14 ++++++++++++++
 include/uapi/linux/v4l2-controls.h                     |  3 +++
 3 files changed, 35 insertions(+)

(limited to 'include/uapi/linux')

diff --git a/Documentation/userspace-api/media/v4l/ext-ctrls-codec.rst b/Documentation/userspace-api/media/v4l/ext-ctrls-codec.rst
index 188aef8e40d0..6af3774fbb75 100644
--- a/Documentation/userspace-api/media/v4l/ext-ctrls-codec.rst
+++ b/Documentation/userspace-api/media/v4l/ext-ctrls-codec.rst
@@ -3427,3 +3427,21 @@ enum v4l2_mpeg_video_hevc_size_of_length_field -
     so this has to come from client.
     This is applicable to H264 and valid Range is from 0 to 63.
     Source Rec. ITU-T H.264 (06/2019); G.7.4.1.1, G.8.8.1.
+
+``V4L2_CID_MPEG_VIDEO_LTR_COUNT (integer)``
+    Specifies the maximum number of Long Term Reference (LTR) frames at any
+    given time that the encoder can keep.
+    This is applicable to the H264 and HEVC encoders.
+
+``V4L2_CID_MPEG_VIDEO_FRAME_LTR_INDEX (integer)``
+    After setting this control the frame that will be queued next
+    will be marked as a Long Term Reference (LTR) frame
+    and given this LTR index which ranges from 0 to LTR_COUNT-1.
+    This is applicable to the H264 and HEVC encoders.
+    Source Rec. ITU-T H.264 (06/2019); Table 7.9
+
+``V4L2_CID_MPEG_VIDEO_USE_LTR_FRAMES (bitmask)``
+    Specifies the Long Term Reference (LTR) frame(s) to be used for
+    encoding the next frame queued after setting this control.
+    This provides a bitmask which consists of bits [0, LTR_COUNT-1].
+    This is applicable to the H264 and HEVC encoders.
diff --git a/drivers/media/v4l2-core/v4l2-ctrls.c b/drivers/media/v4l2-core/v4l2-ctrls.c
index 39038c6ad8fb..ab5d33c8b239 100644
--- a/drivers/media/v4l2-core/v4l2-ctrls.c
+++ b/drivers/media/v4l2-core/v4l2-ctrls.c
@@ -961,6 +961,9 @@ const char *v4l2_ctrl_get_name(u32 id)
 	case V4L2_CID_MPEG_VIDEO_REPEAT_SEQ_HEADER:		return "Repeat Sequence Header";
 	case V4L2_CID_MPEG_VIDEO_FORCE_KEY_FRAME:		return "Force Key Frame";
 	case V4L2_CID_MPEG_VIDEO_BASELAYER_PRIORITY_ID:		return "Base Layer Priority ID";
+	case V4L2_CID_MPEG_VIDEO_LTR_COUNT:			return "LTR Count";
+	case V4L2_CID_MPEG_VIDEO_FRAME_LTR_INDEX:		return "Frame LTR Index";
+	case V4L2_CID_MPEG_VIDEO_USE_LTR_FRAMES:		return "Use LTR Frames";
 	case V4L2_CID_MPEG_VIDEO_MPEG2_SLICE_PARAMS:		return "MPEG-2 Slice Parameters";
 	case V4L2_CID_MPEG_VIDEO_MPEG2_QUANTIZATION:		return "MPEG-2 Quantization Matrices";
 	case V4L2_CID_FWHT_I_FRAME_QP:				return "FWHT I-Frame QP Value";
@@ -1291,6 +1294,17 @@ void v4l2_ctrl_fill(u32 id, const char **name, enum v4l2_ctrl_type *type,
 	case V4L2_CID_MPEG_VIDEO_DEC_DISPLAY_DELAY:
 		*type = V4L2_CTRL_TYPE_INTEGER;
 		break;
+	case V4L2_CID_MPEG_VIDEO_LTR_COUNT:
+		*type = V4L2_CTRL_TYPE_INTEGER;
+		break;
+	case V4L2_CID_MPEG_VIDEO_FRAME_LTR_INDEX:
+		*type = V4L2_CTRL_TYPE_INTEGER;
+		*flags |= V4L2_CTRL_FLAG_EXECUTE_ON_WRITE;
+		break;
+	case V4L2_CID_MPEG_VIDEO_USE_LTR_FRAMES:
+		*type = V4L2_CTRL_TYPE_BITMASK;
+		*flags |= V4L2_CTRL_FLAG_EXECUTE_ON_WRITE;
+		break;
 	case V4L2_CID_MPEG_VIDEO_FORCE_KEY_FRAME:
 	case V4L2_CID_PAN_RESET:
 	case V4L2_CID_TILT_RESET:
diff --git a/include/uapi/linux/v4l2-controls.h b/include/uapi/linux/v4l2-controls.h
index f3376aafea65..47678e509839 100644
--- a/include/uapi/linux/v4l2-controls.h
+++ b/include/uapi/linux/v4l2-controls.h
@@ -429,6 +429,9 @@ enum v4l2_mpeg_video_multi_slice_mode {
 #define V4L2_CID_MPEG_VIDEO_FORCE_KEY_FRAME		(V4L2_CID_CODEC_BASE+229)
 #define V4L2_CID_MPEG_VIDEO_BASELAYER_PRIORITY_ID	(V4L2_CID_CODEC_BASE+230)
 #define V4L2_CID_MPEG_VIDEO_AU_DELIMITER		(V4L2_CID_CODEC_BASE+231)
+#define V4L2_CID_MPEG_VIDEO_LTR_COUNT			(V4L2_CID_CODEC_BASE+232)
+#define V4L2_CID_MPEG_VIDEO_FRAME_LTR_INDEX		(V4L2_CID_CODEC_BASE+233)
+#define V4L2_CID_MPEG_VIDEO_USE_LTR_FRAMES		(V4L2_CID_CODEC_BASE+234)
 
 /* CIDs for the MPEG-2 Part 2 (H.262) codec */
 #define V4L2_CID_MPEG_VIDEO_MPEG2_LEVEL			(V4L2_CID_CODEC_BASE+270)
-- 
cgit v1.2.3-71-gd317


From b52051a40908b3867fcab077d4afda47e1bd4c1b Mon Sep 17 00:00:00 2001
From: Stanimir Varbanov <stanimir.varbanov@linaro.org>
Date: Wed, 20 Jan 2021 12:09:48 +0100
Subject: media: v4l2-ctrl: Add decoder conceal color control

Add decoder v4l2 control to set conceal color.

Signed-off-by: Stanimir Varbanov <stanimir.varbanov@linaro.org>
Reviewed-by: Hans Verkuil <hverkuil-cisco@xs4all.nl>
Signed-off-by: Mauro Carvalho Chehab <mchehab+huawei@kernel.org>
---
 .../userspace-api/media/v4l/ext-ctrls-codec.rst    | 33 ++++++++++++++++++++++
 drivers/media/v4l2-core/v4l2-ctrls.c               |  9 ++++++
 include/uapi/linux/v4l2-controls.h                 |  1 +
 3 files changed, 43 insertions(+)

(limited to 'include/uapi/linux')

diff --git a/Documentation/userspace-api/media/v4l/ext-ctrls-codec.rst b/Documentation/userspace-api/media/v4l/ext-ctrls-codec.rst
index 6af3774fbb75..b0de4e6e7ebd 100644
--- a/Documentation/userspace-api/media/v4l/ext-ctrls-codec.rst
+++ b/Documentation/userspace-api/media/v4l/ext-ctrls-codec.rst
@@ -674,6 +674,39 @@ enum v4l2_mpeg_video_frame_skip_mode -
     is currently displayed (decoded). This value is reset to 0 whenever
     the decoder is started.
 
+``V4L2_CID_MPEG_VIDEO_DEC_CONCEAL_COLOR (integer64)``
+    This control sets the conceal color in YUV color space. It describes
+    the client preference of the error conceal color in case of an error
+    where the reference frame is missing. The decoder should fill the
+    reference buffer with the preferred color and use it for future
+    decoding. The control is using 16 bits per channel.
+    Applicable to decoders.
+
+.. flat-table::
+    :header-rows:  0
+    :stub-columns: 0
+
+    * -
+      - 8bit  format
+      - 10bit format
+      - 12bit format
+    * - Y luminance
+      - Bit 0:7
+      - Bit 0:9
+      - Bit 0:11
+    * - Cb chrominance
+      - Bit 16:23
+      - Bit 16:25
+      - Bit 16:27
+    * - Cr chrominance
+      - Bit 32:39
+      - Bit 32:41
+      - Bit 32:43
+    * - Must be zero
+      - Bit 48:63
+      - Bit 48:63
+      - Bit 48:63
+
 ``V4L2_CID_MPEG_VIDEO_DECODER_SLICE_INTERFACE (boolean)``
     If enabled the decoder expects to receive a single slice per buffer,
     otherwise the decoder expects a single frame in per buffer.
diff --git a/drivers/media/v4l2-core/v4l2-ctrls.c b/drivers/media/v4l2-core/v4l2-ctrls.c
index ab5d33c8b239..628fbeba8fa2 100644
--- a/drivers/media/v4l2-core/v4l2-ctrls.c
+++ b/drivers/media/v4l2-core/v4l2-ctrls.c
@@ -955,6 +955,7 @@ const char *v4l2_ctrl_get_name(u32 id)
 	case V4L2_CID_MPEG_VIDEO_VBV_SIZE:			return "VBV Buffer Size";
 	case V4L2_CID_MPEG_VIDEO_DEC_PTS:			return "Video Decoder PTS";
 	case V4L2_CID_MPEG_VIDEO_DEC_FRAME:			return "Video Decoder Frame Count";
+	case V4L2_CID_MPEG_VIDEO_DEC_CONCEAL_COLOR:		return "Video Decoder Conceal Color";
 	case V4L2_CID_MPEG_VIDEO_VBV_DELAY:			return "Initial Delay for VBV Control";
 	case V4L2_CID_MPEG_VIDEO_MV_H_SEARCH_RANGE:		return "Horizontal MV Search Range";
 	case V4L2_CID_MPEG_VIDEO_MV_V_SEARCH_RANGE:		return "Vertical MV Search Range";
@@ -1458,6 +1459,14 @@ void v4l2_ctrl_fill(u32 id, const char **name, enum v4l2_ctrl_type *type,
 		*max = 0x7fffffffffffffffLL;
 		*step = 1;
 		break;
+	case V4L2_CID_MPEG_VIDEO_DEC_CONCEAL_COLOR:
+		*type = V4L2_CTRL_TYPE_INTEGER64;
+		*min = 0;
+		/* default for 8 bit black, luma is 16, chroma is 128 */
+		*def = 0x8000800010LL;
+		*max = 0xffffffffffffLL;
+		*step = 1;
+		break;
 	case V4L2_CID_PIXEL_RATE:
 		*type = V4L2_CTRL_TYPE_INTEGER64;
 		*flags |= V4L2_CTRL_FLAG_READ_ONLY;
diff --git a/include/uapi/linux/v4l2-controls.h b/include/uapi/linux/v4l2-controls.h
index 47678e509839..6bae78183547 100644
--- a/include/uapi/linux/v4l2-controls.h
+++ b/include/uapi/linux/v4l2-controls.h
@@ -432,6 +432,7 @@ enum v4l2_mpeg_video_multi_slice_mode {
 #define V4L2_CID_MPEG_VIDEO_LTR_COUNT			(V4L2_CID_CODEC_BASE+232)
 #define V4L2_CID_MPEG_VIDEO_FRAME_LTR_INDEX		(V4L2_CID_CODEC_BASE+233)
 #define V4L2_CID_MPEG_VIDEO_USE_LTR_FRAMES		(V4L2_CID_CODEC_BASE+234)
+#define V4L2_CID_MPEG_VIDEO_DEC_CONCEAL_COLOR		(V4L2_CID_CODEC_BASE+235)
 
 /* CIDs for the MPEG-2 Part 2 (H.262) codec */
 #define V4L2_CID_MPEG_VIDEO_MPEG2_LEVEL			(V4L2_CID_CODEC_BASE+270)
-- 
cgit v1.2.3-71-gd317


From f31b9ffd968bc07e78826814da01e164e0bf6485 Mon Sep 17 00:00:00 2001
From: Stanimir Varbanov <stanimir.varbanov@linaro.org>
Date: Tue, 19 Jan 2021 15:04:22 +0100
Subject: media: v4l: Add new Colorimetry Class

Add Colorimetry control class for colorimetry controls

Signed-off-by: Stanimir Varbanov <stanimir.varbanov@linaro.org>
Reviewed-by: Hans Verkuil <hverkuil-cisco@xs4all.nl>
Signed-off-by: Mauro Carvalho Chehab <mchehab+huawei@kernel.org>
---
 drivers/media/v4l2-core/v4l2-ctrls.c | 7 ++++++-
 include/uapi/linux/v4l2-controls.h   | 4 ++++
 2 files changed, 10 insertions(+), 1 deletion(-)

(limited to 'include/uapi/linux')

diff --git a/drivers/media/v4l2-core/v4l2-ctrls.c b/drivers/media/v4l2-core/v4l2-ctrls.c
index 628fbeba8fa2..89780c5a78e1 100644
--- a/drivers/media/v4l2-core/v4l2-ctrls.c
+++ b/drivers/media/v4l2-core/v4l2-ctrls.c
@@ -1215,6 +1215,10 @@ const char *v4l2_ctrl_get_name(u32 id)
 	case V4L2_CID_STATELESS_H264_DECODE_PARAMS:		return "H264 Decode Parameters";
 	case V4L2_CID_STATELESS_FWHT_PARAMS:			return "FWHT Stateless Parameters";
 	case V4L2_CID_STATELESS_VP8_FRAME:			return "VP8 Frame Parameters";
+
+	/* Colorimetry controls */
+	/* Keep the order of the 'case's the same as in v4l2-controls.h! */
+	case V4L2_CID_COLORIMETRY_CLASS:	return "Colorimetry Controls";
 	default:
 		return NULL;
 	}
@@ -1418,8 +1422,9 @@ void v4l2_ctrl_fill(u32 id, const char **name, enum v4l2_ctrl_type *type,
 	case V4L2_CID_RF_TUNER_CLASS:
 	case V4L2_CID_DETECT_CLASS:
 	case V4L2_CID_CODEC_STATELESS_CLASS:
+	case V4L2_CID_COLORIMETRY_CLASS:
 		*type = V4L2_CTRL_TYPE_CTRL_CLASS;
-		/* You can neither read not write these */
+		/* You can neither read nor write these */
 		*flags |= V4L2_CTRL_FLAG_READ_ONLY | V4L2_CTRL_FLAG_WRITE_ONLY;
 		*min = *max = *step = *def = 0;
 		break;
diff --git a/include/uapi/linux/v4l2-controls.h b/include/uapi/linux/v4l2-controls.h
index 6bae78183547..e376d816c56e 100644
--- a/include/uapi/linux/v4l2-controls.h
+++ b/include/uapi/linux/v4l2-controls.h
@@ -66,6 +66,7 @@
 #define V4L2_CTRL_CLASS_RF_TUNER	0x00a20000	/* RF tuner controls */
 #define V4L2_CTRL_CLASS_DETECT		0x00a30000	/* Detection controls */
 #define V4L2_CTRL_CLASS_CODEC_STATELESS 0x00a40000	/* Stateless codecs controls */
+#define V4L2_CTRL_CLASS_COLORIMETRY	0x00a50000	/* Colorimetry controls */
 
 /* User-class control IDs */
 
@@ -1861,6 +1862,9 @@ struct v4l2_ctrl_vp8_frame {
 	__u64 flags;
 };
 
+#define V4L2_CID_COLORIMETRY_CLASS_BASE	(V4L2_CTRL_CLASS_COLORIMETRY | 0x900)
+#define V4L2_CID_COLORIMETRY_CLASS	(V4L2_CTRL_CLASS_COLORIMETRY | 1)
+
 /* MPEG-compression definitions kept for backwards compatibility */
 #ifndef __KERNEL__
 #define V4L2_CTRL_CLASS_MPEG            V4L2_CTRL_CLASS_CODEC
-- 
cgit v1.2.3-71-gd317


From 1ad0de78e7944eef171340d9fa00f0a59458991c Mon Sep 17 00:00:00 2001
From: Stanimir Varbanov <stanimir.varbanov@linaro.org>
Date: Tue, 19 Jan 2021 16:06:42 +0100
Subject: media: v4l: Add HDR10 static metadata controls

Introduce Content light level and Mastering display colour
volume Colorimetry compound controls with relevant payload
structures and validation.

Signed-off-by: Stanimir Varbanov <stanimir.varbanov@linaro.org>
Reviewed-by: Hans Verkuil <hverkuil-cisco@xs4all.nl>
Signed-off-by: Mauro Carvalho Chehab <mchehab+huawei@kernel.org>
---
 drivers/media/v4l2-core/v4l2-ctrls.c | 68 ++++++++++++++++++++++++++++++++++++
 include/media/v4l2-ctrls.h           |  4 +++
 include/uapi/linux/v4l2-controls.h   | 31 ++++++++++++++++
 include/uapi/linux/videodev2.h       |  3 ++
 4 files changed, 106 insertions(+)

(limited to 'include/uapi/linux')

diff --git a/drivers/media/v4l2-core/v4l2-ctrls.c b/drivers/media/v4l2-core/v4l2-ctrls.c
index 89780c5a78e1..e7fcbaeec0ae 100644
--- a/drivers/media/v4l2-core/v4l2-ctrls.c
+++ b/drivers/media/v4l2-core/v4l2-ctrls.c
@@ -1219,6 +1219,8 @@ const char *v4l2_ctrl_get_name(u32 id)
 	/* Colorimetry controls */
 	/* Keep the order of the 'case's the same as in v4l2-controls.h! */
 	case V4L2_CID_COLORIMETRY_CLASS:	return "Colorimetry Controls";
+	case V4L2_CID_COLORIMETRY_HDR10_CLL_INFO:		return "HDR10 Content Light Info";
+	case V4L2_CID_COLORIMETRY_HDR10_MASTERING_DISPLAY:	return "HDR10 Mastering Display";
 	default:
 		return NULL;
 	}
@@ -1528,6 +1530,12 @@ void v4l2_ctrl_fill(u32 id, const char **name, enum v4l2_ctrl_type *type,
 		*type = V4L2_CTRL_TYPE_AREA;
 		*flags |= V4L2_CTRL_FLAG_READ_ONLY;
 		break;
+	case V4L2_CID_COLORIMETRY_HDR10_CLL_INFO:
+		*type = V4L2_CTRL_TYPE_HDR10_CLL_INFO;
+		break;
+	case V4L2_CID_COLORIMETRY_HDR10_MASTERING_DISPLAY:
+		*type = V4L2_CTRL_TYPE_HDR10_MASTERING_DISPLAY;
+		break;
 	default:
 		*type = V4L2_CTRL_TYPE_INTEGER;
 		break;
@@ -1828,6 +1836,12 @@ static void std_log(const struct v4l2_ctrl *ctrl)
 	case V4L2_CTRL_TYPE_VP8_FRAME:
 		pr_cont("VP8_FRAME");
 		break;
+	case V4L2_CTRL_TYPE_HDR10_CLL_INFO:
+		pr_cont("HDR10_CLL_INFO");
+		break;
+	case V4L2_CTRL_TYPE_HDR10_MASTERING_DISPLAY:
+		pr_cont("HDR10_MASTERING_DISPLAY");
+		break;
 	default:
 		pr_cont("unknown type %d", ctrl->type);
 		break;
@@ -1880,6 +1894,7 @@ static int std_validate_compound(const struct v4l2_ctrl *ctrl, u32 idx,
 	struct v4l2_ctrl_hevc_sps *p_hevc_sps;
 	struct v4l2_ctrl_hevc_pps *p_hevc_pps;
 	struct v4l2_ctrl_hevc_slice_params *p_hevc_slice_params;
+	struct v4l2_ctrl_hdr10_mastering_display *p_hdr10_mastering;
 	struct v4l2_area *area;
 	void *p = ptr.p + idx * ctrl->elem_size;
 	unsigned int i;
@@ -2175,6 +2190,53 @@ static int std_validate_compound(const struct v4l2_ctrl *ctrl, u32 idx,
 		zero_padding(*p_hevc_slice_params);
 		break;
 
+	case V4L2_CTRL_TYPE_HDR10_CLL_INFO:
+		break;
+
+	case V4L2_CTRL_TYPE_HDR10_MASTERING_DISPLAY:
+		p_hdr10_mastering = p;
+
+		for (i = 0; i < 3; ++i) {
+			if (p_hdr10_mastering->display_primaries_x[i] <
+				V4L2_HDR10_MASTERING_PRIMARIES_X_LOW ||
+			    p_hdr10_mastering->display_primaries_x[i] >
+				V4L2_HDR10_MASTERING_PRIMARIES_X_HIGH ||
+			    p_hdr10_mastering->display_primaries_y[i] <
+				V4L2_HDR10_MASTERING_PRIMARIES_Y_LOW ||
+			    p_hdr10_mastering->display_primaries_y[i] >
+				V4L2_HDR10_MASTERING_PRIMARIES_Y_HIGH)
+				return -EINVAL;
+		}
+
+		if (p_hdr10_mastering->white_point_x <
+			V4L2_HDR10_MASTERING_WHITE_POINT_X_LOW ||
+		    p_hdr10_mastering->white_point_x >
+			V4L2_HDR10_MASTERING_WHITE_POINT_X_HIGH ||
+		    p_hdr10_mastering->white_point_y <
+			V4L2_HDR10_MASTERING_WHITE_POINT_Y_LOW ||
+		    p_hdr10_mastering->white_point_y >
+			V4L2_HDR10_MASTERING_WHITE_POINT_Y_HIGH)
+			return -EINVAL;
+
+		if (p_hdr10_mastering->max_display_mastering_luminance <
+			V4L2_HDR10_MASTERING_MAX_LUMA_LOW ||
+		    p_hdr10_mastering->max_display_mastering_luminance >
+			V4L2_HDR10_MASTERING_MAX_LUMA_HIGH ||
+		    p_hdr10_mastering->min_display_mastering_luminance <
+			V4L2_HDR10_MASTERING_MIN_LUMA_LOW ||
+		    p_hdr10_mastering->min_display_mastering_luminance >
+			V4L2_HDR10_MASTERING_MIN_LUMA_HIGH)
+			return -EINVAL;
+
+		/* The following restriction comes from ITU-T Rec. H.265 spec */
+		if (p_hdr10_mastering->max_display_mastering_luminance ==
+			V4L2_HDR10_MASTERING_MAX_LUMA_LOW &&
+		    p_hdr10_mastering->min_display_mastering_luminance ==
+			V4L2_HDR10_MASTERING_MIN_LUMA_HIGH)
+			return -EINVAL;
+
+		break;
+
 	case V4L2_CTRL_TYPE_AREA:
 		area = p;
 		if (!area->width || !area->height)
@@ -2868,6 +2930,12 @@ static struct v4l2_ctrl *v4l2_ctrl_new(struct v4l2_ctrl_handler *hdl,
 	case V4L2_CTRL_TYPE_HEVC_SLICE_PARAMS:
 		elem_size = sizeof(struct v4l2_ctrl_hevc_slice_params);
 		break;
+	case V4L2_CTRL_TYPE_HDR10_CLL_INFO:
+		elem_size = sizeof(struct v4l2_ctrl_hdr10_cll_info);
+		break;
+	case V4L2_CTRL_TYPE_HDR10_MASTERING_DISPLAY:
+		elem_size = sizeof(struct v4l2_ctrl_hdr10_mastering_display);
+		break;
 	case V4L2_CTRL_TYPE_AREA:
 		elem_size = sizeof(struct v4l2_area);
 		break;
diff --git a/include/media/v4l2-ctrls.h b/include/media/v4l2-ctrls.h
index dea65c92e86b..c1d20bd8f25f 100644
--- a/include/media/v4l2-ctrls.h
+++ b/include/media/v4l2-ctrls.h
@@ -53,6 +53,8 @@ struct video_device;
  * @p_hevc_sps:			Pointer to an HEVC sequence parameter set structure.
  * @p_hevc_pps:			Pointer to an HEVC picture parameter set structure.
  * @p_hevc_slice_params:	Pointer to an HEVC slice parameters structure.
+ * @p_hdr10_cll:		Pointer to an HDR10 Content Light Level structure.
+ * @p_hdr10_mastering:		Pointer to an HDR10 Mastering Display structure.
  * @p_area:			Pointer to an area.
  * @p:				Pointer to a compound value.
  * @p_const:			Pointer to a constant compound value.
@@ -77,6 +79,8 @@ union v4l2_ctrl_ptr {
 	struct v4l2_ctrl_hevc_sps *p_hevc_sps;
 	struct v4l2_ctrl_hevc_pps *p_hevc_pps;
 	struct v4l2_ctrl_hevc_slice_params *p_hevc_slice_params;
+	struct v4l2_ctrl_hdr10_cll_info *p_hdr10_cll;
+	struct v4l2_ctrl_hdr10_mastering_display *p_hdr10_mastering;
 	struct v4l2_area *p_area;
 	void *p;
 	const void *p_const;
diff --git a/include/uapi/linux/v4l2-controls.h b/include/uapi/linux/v4l2-controls.h
index e376d816c56e..d43bec5f1afd 100644
--- a/include/uapi/linux/v4l2-controls.h
+++ b/include/uapi/linux/v4l2-controls.h
@@ -1865,6 +1865,37 @@ struct v4l2_ctrl_vp8_frame {
 #define V4L2_CID_COLORIMETRY_CLASS_BASE	(V4L2_CTRL_CLASS_COLORIMETRY | 0x900)
 #define V4L2_CID_COLORIMETRY_CLASS	(V4L2_CTRL_CLASS_COLORIMETRY | 1)
 
+#define V4L2_CID_COLORIMETRY_HDR10_CLL_INFO	(V4L2_CID_COLORIMETRY_CLASS_BASE + 0)
+
+struct v4l2_ctrl_hdr10_cll_info {
+	__u16 max_content_light_level;
+	__u16 max_pic_average_light_level;
+};
+
+#define V4L2_CID_COLORIMETRY_HDR10_MASTERING_DISPLAY	(V4L2_CID_COLORIMETRY_CLASS_BASE + 1)
+
+#define V4L2_HDR10_MASTERING_PRIMARIES_X_LOW	5
+#define V4L2_HDR10_MASTERING_PRIMARIES_X_HIGH	37000
+#define V4L2_HDR10_MASTERING_PRIMARIES_Y_LOW	5
+#define V4L2_HDR10_MASTERING_PRIMARIES_Y_HIGH	42000
+#define V4L2_HDR10_MASTERING_WHITE_POINT_X_LOW	5
+#define V4L2_HDR10_MASTERING_WHITE_POINT_X_HIGH	37000
+#define V4L2_HDR10_MASTERING_WHITE_POINT_Y_LOW	5
+#define V4L2_HDR10_MASTERING_WHITE_POINT_Y_HIGH	42000
+#define V4L2_HDR10_MASTERING_MAX_LUMA_LOW	50000
+#define V4L2_HDR10_MASTERING_MAX_LUMA_HIGH	100000000
+#define V4L2_HDR10_MASTERING_MIN_LUMA_LOW	1
+#define V4L2_HDR10_MASTERING_MIN_LUMA_HIGH	50000
+
+struct v4l2_ctrl_hdr10_mastering_display {
+	__u16 display_primaries_x[3];
+	__u16 display_primaries_y[3];
+	__u16 white_point_x;
+	__u16 white_point_y;
+	__u32 max_display_mastering_luminance;
+	__u32 min_display_mastering_luminance;
+};
+
 /* MPEG-compression definitions kept for backwards compatibility */
 #ifndef __KERNEL__
 #define V4L2_CTRL_CLASS_MPEG            V4L2_CTRL_CLASS_CODEC
diff --git a/include/uapi/linux/videodev2.h b/include/uapi/linux/videodev2.h
index 8d15f6ccc4b4..311a01cc5775 100644
--- a/include/uapi/linux/videodev2.h
+++ b/include/uapi/linux/videodev2.h
@@ -1794,6 +1794,9 @@ enum v4l2_ctrl_type {
 	V4L2_CTRL_TYPE_U32	     = 0x0102,
 	V4L2_CTRL_TYPE_AREA          = 0x0106,
 
+	V4L2_CTRL_TYPE_HDR10_CLL_INFO		= 0x0110,
+	V4L2_CTRL_TYPE_HDR10_MASTERING_DISPLAY	= 0x0111,
+
 	V4L2_CTRL_TYPE_H264_SPS             = 0x0200,
 	V4L2_CTRL_TYPE_H264_PPS		    = 0x0201,
 	V4L2_CTRL_TYPE_H264_SCALING_MATRIX  = 0x0202,
-- 
cgit v1.2.3-71-gd317


From b392a198917020cac996fd207355211ecfcfad84 Mon Sep 17 00:00:00 2001
From: Christoph Hellwig <hch@lst.de>
Date: Tue, 30 Mar 2021 10:01:18 -0600
Subject: vfio/pci: remove vfio_pci_nvlink2

This driver never had any open userspace (which for VFIO would include
VM kernel drivers) that use it, and thus should never have been added
by our normal userspace ABI rules.

Signed-off-by: Christoph Hellwig <hch@lst.de>
Acked-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
Message-Id: <20210326061311.1497642-2-hch@lst.de>
Signed-off-by: Alex Williamson <alex.williamson@redhat.com>
---
 drivers/vfio/pci/Kconfig            |   6 -
 drivers/vfio/pci/Makefile           |   1 -
 drivers/vfio/pci/vfio_pci.c         |  18 --
 drivers/vfio/pci/vfio_pci_nvlink2.c | 490 ------------------------------------
 drivers/vfio/pci/vfio_pci_private.h |  14 --
 include/uapi/linux/vfio.h           |  38 +--
 6 files changed, 4 insertions(+), 563 deletions(-)
 delete mode 100644 drivers/vfio/pci/vfio_pci_nvlink2.c

(limited to 'include/uapi/linux')

diff --git a/drivers/vfio/pci/Kconfig b/drivers/vfio/pci/Kconfig
index 4abddbebd4b2..53ce78d7d07b 100644
--- a/drivers/vfio/pci/Kconfig
+++ b/drivers/vfio/pci/Kconfig
@@ -39,9 +39,3 @@ config VFIO_PCI_IGD
 	  and LPC bridge config space.
 
 	  To enable Intel IGD assignment through vfio-pci, say Y.
-
-config VFIO_PCI_NVLINK2
-	def_bool y
-	depends on VFIO_PCI && PPC_POWERNV && SPAPR_TCE_IOMMU
-	help
-	  VFIO PCI support for P9 Witherspoon machine with NVIDIA V100 GPUs
diff --git a/drivers/vfio/pci/Makefile b/drivers/vfio/pci/Makefile
index eff97a7cd9f1..3ff42093962f 100644
--- a/drivers/vfio/pci/Makefile
+++ b/drivers/vfio/pci/Makefile
@@ -2,7 +2,6 @@
 
 vfio-pci-y := vfio_pci.o vfio_pci_intrs.o vfio_pci_rdwr.o vfio_pci_config.o
 vfio-pci-$(CONFIG_VFIO_PCI_IGD) += vfio_pci_igd.o
-vfio-pci-$(CONFIG_VFIO_PCI_NVLINK2) += vfio_pci_nvlink2.o
 vfio-pci-$(CONFIG_S390) += vfio_pci_zdev.o
 
 obj-$(CONFIG_VFIO_PCI) += vfio-pci.o
diff --git a/drivers/vfio/pci/vfio_pci.c b/drivers/vfio/pci/vfio_pci.c
index 65e7e6b44578..d691006b6428 100644
--- a/drivers/vfio/pci/vfio_pci.c
+++ b/drivers/vfio/pci/vfio_pci.c
@@ -389,24 +389,6 @@ static int vfio_pci_enable(struct vfio_pci_device *vdev)
 		}
 	}
 
-	if (pdev->vendor == PCI_VENDOR_ID_NVIDIA &&
-	    IS_ENABLED(CONFIG_VFIO_PCI_NVLINK2)) {
-		ret = vfio_pci_nvdia_v100_nvlink2_init(vdev);
-		if (ret && ret != -ENODEV) {
-			pci_warn(pdev, "Failed to setup NVIDIA NV2 RAM region\n");
-			goto disable_exit;
-		}
-	}
-
-	if (pdev->vendor == PCI_VENDOR_ID_IBM &&
-	    IS_ENABLED(CONFIG_VFIO_PCI_NVLINK2)) {
-		ret = vfio_pci_ibm_npu2_init(vdev);
-		if (ret && ret != -ENODEV) {
-			pci_warn(pdev, "Failed to setup NVIDIA NV2 ATSD region\n");
-			goto disable_exit;
-		}
-	}
-
 	vfio_pci_probe_mmaps(vdev);
 
 	return 0;
diff --git a/drivers/vfio/pci/vfio_pci_nvlink2.c b/drivers/vfio/pci/vfio_pci_nvlink2.c
deleted file mode 100644
index 9adcf6a8f888..000000000000
--- a/drivers/vfio/pci/vfio_pci_nvlink2.c
+++ /dev/null
@@ -1,490 +0,0 @@
-// SPDX-License-Identifier: GPL-2.0-only
-/*
- * VFIO PCI NVIDIA Whitherspoon GPU support a.k.a. NVLink2.
- *
- * Copyright (C) 2018 IBM Corp.  All rights reserved.
- *     Author: Alexey Kardashevskiy <aik@ozlabs.ru>
- *
- * Register an on-GPU RAM region for cacheable access.
- *
- * Derived from original vfio_pci_igd.c:
- * Copyright (C) 2016 Red Hat, Inc.  All rights reserved.
- *	Author: Alex Williamson <alex.williamson@redhat.com>
- */
-
-#include <linux/io.h>
-#include <linux/pci.h>
-#include <linux/uaccess.h>
-#include <linux/vfio.h>
-#include <linux/sched/mm.h>
-#include <linux/mmu_context.h>
-#include <asm/kvm_ppc.h>
-#include "vfio_pci_private.h"
-
-#define CREATE_TRACE_POINTS
-#include "trace.h"
-
-EXPORT_TRACEPOINT_SYMBOL_GPL(vfio_pci_nvgpu_mmap_fault);
-EXPORT_TRACEPOINT_SYMBOL_GPL(vfio_pci_nvgpu_mmap);
-EXPORT_TRACEPOINT_SYMBOL_GPL(vfio_pci_npu2_mmap);
-
-struct vfio_pci_nvgpu_data {
-	unsigned long gpu_hpa; /* GPU RAM physical address */
-	unsigned long gpu_tgt; /* TGT address of corresponding GPU RAM */
-	unsigned long useraddr; /* GPU RAM userspace address */
-	unsigned long size; /* Size of the GPU RAM window (usually 128GB) */
-	struct mm_struct *mm;
-	struct mm_iommu_table_group_mem_t *mem; /* Pre-registered RAM descr. */
-	struct pci_dev *gpdev;
-	struct notifier_block group_notifier;
-};
-
-static size_t vfio_pci_nvgpu_rw(struct vfio_pci_device *vdev,
-		char __user *buf, size_t count, loff_t *ppos, bool iswrite)
-{
-	unsigned int i = VFIO_PCI_OFFSET_TO_INDEX(*ppos) - VFIO_PCI_NUM_REGIONS;
-	struct vfio_pci_nvgpu_data *data = vdev->region[i].data;
-	loff_t pos = *ppos & VFIO_PCI_OFFSET_MASK;
-	loff_t posaligned = pos & PAGE_MASK, posoff = pos & ~PAGE_MASK;
-	size_t sizealigned;
-	void __iomem *ptr;
-
-	if (pos >= vdev->region[i].size)
-		return -EINVAL;
-
-	count = min(count, (size_t)(vdev->region[i].size - pos));
-
-	/*
-	 * We map only a bit of GPU RAM for a short time instead of mapping it
-	 * for the guest lifetime as:
-	 *
-	 * 1) we do not know GPU RAM size, only aperture which is 4-8 times
-	 *    bigger than actual RAM size (16/32GB RAM vs. 128GB aperture);
-	 * 2) mapping GPU RAM allows CPU to prefetch and if this happens
-	 *    before NVLink bridge is reset (which fences GPU RAM),
-	 *    hardware management interrupts (HMI) might happen, this
-	 *    will freeze NVLink bridge.
-	 *
-	 * This is not fast path anyway.
-	 */
-	sizealigned = ALIGN(posoff + count, PAGE_SIZE);
-	ptr = ioremap_cache(data->gpu_hpa + posaligned, sizealigned);
-	if (!ptr)
-		return -EFAULT;
-
-	if (iswrite) {
-		if (copy_from_user(ptr + posoff, buf, count))
-			count = -EFAULT;
-		else
-			*ppos += count;
-	} else {
-		if (copy_to_user(buf, ptr + posoff, count))
-			count = -EFAULT;
-		else
-			*ppos += count;
-	}
-
-	iounmap(ptr);
-
-	return count;
-}
-
-static void vfio_pci_nvgpu_release(struct vfio_pci_device *vdev,
-		struct vfio_pci_region *region)
-{
-	struct vfio_pci_nvgpu_data *data = region->data;
-	long ret;
-
-	/* If there were any mappings at all... */
-	if (data->mm) {
-		if (data->mem) {
-			ret = mm_iommu_put(data->mm, data->mem);
-			WARN_ON(ret);
-		}
-
-		mmdrop(data->mm);
-	}
-
-	vfio_unregister_notifier(&data->gpdev->dev, VFIO_GROUP_NOTIFY,
-			&data->group_notifier);
-
-	pnv_npu2_unmap_lpar_dev(data->gpdev);
-
-	kfree(data);
-}
-
-static vm_fault_t vfio_pci_nvgpu_mmap_fault(struct vm_fault *vmf)
-{
-	vm_fault_t ret;
-	struct vm_area_struct *vma = vmf->vma;
-	struct vfio_pci_region *region = vma->vm_private_data;
-	struct vfio_pci_nvgpu_data *data = region->data;
-	unsigned long vmf_off = (vmf->address - vma->vm_start) >> PAGE_SHIFT;
-	unsigned long nv2pg = data->gpu_hpa >> PAGE_SHIFT;
-	unsigned long vm_pgoff = vma->vm_pgoff &
-		((1U << (VFIO_PCI_OFFSET_SHIFT - PAGE_SHIFT)) - 1);
-	unsigned long pfn = nv2pg + vm_pgoff + vmf_off;
-
-	ret = vmf_insert_pfn(vma, vmf->address, pfn);
-	trace_vfio_pci_nvgpu_mmap_fault(data->gpdev, pfn << PAGE_SHIFT,
-			vmf->address, ret);
-
-	return ret;
-}
-
-static const struct vm_operations_struct vfio_pci_nvgpu_mmap_vmops = {
-	.fault = vfio_pci_nvgpu_mmap_fault,
-};
-
-static int vfio_pci_nvgpu_mmap(struct vfio_pci_device *vdev,
-		struct vfio_pci_region *region, struct vm_area_struct *vma)
-{
-	int ret;
-	struct vfio_pci_nvgpu_data *data = region->data;
-
-	if (data->useraddr)
-		return -EPERM;
-
-	if (vma->vm_end - vma->vm_start > data->size)
-		return -EINVAL;
-
-	vma->vm_private_data = region;
-	vma->vm_flags |= VM_PFNMAP;
-	vma->vm_ops = &vfio_pci_nvgpu_mmap_vmops;
-
-	/*
-	 * Calling mm_iommu_newdev() here once as the region is not
-	 * registered yet and therefore right initialization will happen now.
-	 * Other places will use mm_iommu_find() which returns
-	 * registered @mem and does not go gup().
-	 */
-	data->useraddr = vma->vm_start;
-	data->mm = current->mm;
-
-	mmgrab(data->mm);
-	ret = (int) mm_iommu_newdev(data->mm, data->useraddr,
-			vma_pages(vma), data->gpu_hpa, &data->mem);
-
-	trace_vfio_pci_nvgpu_mmap(vdev->pdev, data->gpu_hpa, data->useraddr,
-			vma->vm_end - vma->vm_start, ret);
-
-	return ret;
-}
-
-static int vfio_pci_nvgpu_add_capability(struct vfio_pci_device *vdev,
-		struct vfio_pci_region *region, struct vfio_info_cap *caps)
-{
-	struct vfio_pci_nvgpu_data *data = region->data;
-	struct vfio_region_info_cap_nvlink2_ssatgt cap = {
-		.header.id = VFIO_REGION_INFO_CAP_NVLINK2_SSATGT,
-		.header.version = 1,
-		.tgt = data->gpu_tgt
-	};
-
-	return vfio_info_add_capability(caps, &cap.header, sizeof(cap));
-}
-
-static const struct vfio_pci_regops vfio_pci_nvgpu_regops = {
-	.rw = vfio_pci_nvgpu_rw,
-	.release = vfio_pci_nvgpu_release,
-	.mmap = vfio_pci_nvgpu_mmap,
-	.add_capability = vfio_pci_nvgpu_add_capability,
-};
-
-static int vfio_pci_nvgpu_group_notifier(struct notifier_block *nb,
-		unsigned long action, void *opaque)
-{
-	struct kvm *kvm = opaque;
-	struct vfio_pci_nvgpu_data *data = container_of(nb,
-			struct vfio_pci_nvgpu_data,
-			group_notifier);
-
-	if (action == VFIO_GROUP_NOTIFY_SET_KVM && kvm &&
-			pnv_npu2_map_lpar_dev(data->gpdev,
-				kvm->arch.lpid, MSR_DR | MSR_PR))
-		return NOTIFY_BAD;
-
-	return NOTIFY_OK;
-}
-
-int vfio_pci_nvdia_v100_nvlink2_init(struct vfio_pci_device *vdev)
-{
-	int ret;
-	u64 reg[2];
-	u64 tgt = 0;
-	struct device_node *npu_node, *mem_node;
-	struct pci_dev *npu_dev;
-	struct vfio_pci_nvgpu_data *data;
-	uint32_t mem_phandle = 0;
-	unsigned long events = VFIO_GROUP_NOTIFY_SET_KVM;
-
-	/*
-	 * PCI config space does not tell us about NVLink presense but
-	 * platform does, use this.
-	 */
-	npu_dev = pnv_pci_get_npu_dev(vdev->pdev, 0);
-	if (!npu_dev)
-		return -ENODEV;
-
-	npu_node = pci_device_to_OF_node(npu_dev);
-	if (!npu_node)
-		return -EINVAL;
-
-	if (of_property_read_u32(npu_node, "memory-region", &mem_phandle))
-		return -ENODEV;
-
-	mem_node = of_find_node_by_phandle(mem_phandle);
-	if (!mem_node)
-		return -EINVAL;
-
-	if (of_property_read_variable_u64_array(mem_node, "reg", reg,
-				ARRAY_SIZE(reg), ARRAY_SIZE(reg)) !=
-			ARRAY_SIZE(reg))
-		return -EINVAL;
-
-	if (of_property_read_u64(npu_node, "ibm,device-tgt-addr", &tgt)) {
-		dev_warn(&vdev->pdev->dev, "No ibm,device-tgt-addr found\n");
-		return -EFAULT;
-	}
-
-	data = kzalloc(sizeof(*data), GFP_KERNEL);
-	if (!data)
-		return -ENOMEM;
-
-	data->gpu_hpa = reg[0];
-	data->gpu_tgt = tgt;
-	data->size = reg[1];
-
-	dev_dbg(&vdev->pdev->dev, "%lx..%lx\n", data->gpu_hpa,
-			data->gpu_hpa + data->size - 1);
-
-	data->gpdev = vdev->pdev;
-	data->group_notifier.notifier_call = vfio_pci_nvgpu_group_notifier;
-
-	ret = vfio_register_notifier(&data->gpdev->dev, VFIO_GROUP_NOTIFY,
-			&events, &data->group_notifier);
-	if (ret)
-		goto free_exit;
-
-	/*
-	 * We have just set KVM, we do not need the listener anymore.
-	 * Also, keeping it registered means that if more than one GPU is
-	 * assigned, we will get several similar notifiers notifying about
-	 * the same device again which does not help with anything.
-	 */
-	vfio_unregister_notifier(&data->gpdev->dev, VFIO_GROUP_NOTIFY,
-			&data->group_notifier);
-
-	ret = vfio_pci_register_dev_region(vdev,
-			PCI_VENDOR_ID_NVIDIA | VFIO_REGION_TYPE_PCI_VENDOR_TYPE,
-			VFIO_REGION_SUBTYPE_NVIDIA_NVLINK2_RAM,
-			&vfio_pci_nvgpu_regops,
-			data->size,
-			VFIO_REGION_INFO_FLAG_READ |
-			VFIO_REGION_INFO_FLAG_WRITE |
-			VFIO_REGION_INFO_FLAG_MMAP,
-			data);
-	if (ret)
-		goto free_exit;
-
-	return 0;
-free_exit:
-	kfree(data);
-
-	return ret;
-}
-
-/*
- * IBM NPU2 bridge
- */
-struct vfio_pci_npu2_data {
-	void *base; /* ATSD register virtual address, for emulated access */
-	unsigned long mmio_atsd; /* ATSD physical address */
-	unsigned long gpu_tgt; /* TGT address of corresponding GPU RAM */
-	unsigned int link_speed; /* The link speed from DT's ibm,nvlink-speed */
-};
-
-static size_t vfio_pci_npu2_rw(struct vfio_pci_device *vdev,
-		char __user *buf, size_t count, loff_t *ppos, bool iswrite)
-{
-	unsigned int i = VFIO_PCI_OFFSET_TO_INDEX(*ppos) - VFIO_PCI_NUM_REGIONS;
-	struct vfio_pci_npu2_data *data = vdev->region[i].data;
-	loff_t pos = *ppos & VFIO_PCI_OFFSET_MASK;
-
-	if (pos >= vdev->region[i].size)
-		return -EINVAL;
-
-	count = min(count, (size_t)(vdev->region[i].size - pos));
-
-	if (iswrite) {
-		if (copy_from_user(data->base + pos, buf, count))
-			return -EFAULT;
-	} else {
-		if (copy_to_user(buf, data->base + pos, count))
-			return -EFAULT;
-	}
-	*ppos += count;
-
-	return count;
-}
-
-static int vfio_pci_npu2_mmap(struct vfio_pci_device *vdev,
-		struct vfio_pci_region *region, struct vm_area_struct *vma)
-{
-	int ret;
-	struct vfio_pci_npu2_data *data = region->data;
-	unsigned long req_len = vma->vm_end - vma->vm_start;
-
-	if (req_len != PAGE_SIZE)
-		return -EINVAL;
-
-	vma->vm_flags |= VM_PFNMAP;
-	vma->vm_page_prot = pgprot_noncached(vma->vm_page_prot);
-
-	ret = remap_pfn_range(vma, vma->vm_start, data->mmio_atsd >> PAGE_SHIFT,
-			req_len, vma->vm_page_prot);
-	trace_vfio_pci_npu2_mmap(vdev->pdev, data->mmio_atsd, vma->vm_start,
-			vma->vm_end - vma->vm_start, ret);
-
-	return ret;
-}
-
-static void vfio_pci_npu2_release(struct vfio_pci_device *vdev,
-		struct vfio_pci_region *region)
-{
-	struct vfio_pci_npu2_data *data = region->data;
-
-	memunmap(data->base);
-	kfree(data);
-}
-
-static int vfio_pci_npu2_add_capability(struct vfio_pci_device *vdev,
-		struct vfio_pci_region *region, struct vfio_info_cap *caps)
-{
-	struct vfio_pci_npu2_data *data = region->data;
-	struct vfio_region_info_cap_nvlink2_ssatgt captgt = {
-		.header.id = VFIO_REGION_INFO_CAP_NVLINK2_SSATGT,
-		.header.version = 1,
-		.tgt = data->gpu_tgt
-	};
-	struct vfio_region_info_cap_nvlink2_lnkspd capspd = {
-		.header.id = VFIO_REGION_INFO_CAP_NVLINK2_LNKSPD,
-		.header.version = 1,
-		.link_speed = data->link_speed
-	};
-	int ret;
-
-	ret = vfio_info_add_capability(caps, &captgt.header, sizeof(captgt));
-	if (ret)
-		return ret;
-
-	return vfio_info_add_capability(caps, &capspd.header, sizeof(capspd));
-}
-
-static const struct vfio_pci_regops vfio_pci_npu2_regops = {
-	.rw = vfio_pci_npu2_rw,
-	.mmap = vfio_pci_npu2_mmap,
-	.release = vfio_pci_npu2_release,
-	.add_capability = vfio_pci_npu2_add_capability,
-};
-
-int vfio_pci_ibm_npu2_init(struct vfio_pci_device *vdev)
-{
-	int ret;
-	struct vfio_pci_npu2_data *data;
-	struct device_node *nvlink_dn;
-	u32 nvlink_index = 0, mem_phandle = 0;
-	struct pci_dev *npdev = vdev->pdev;
-	struct device_node *npu_node = pci_device_to_OF_node(npdev);
-	struct pci_controller *hose = pci_bus_to_host(npdev->bus);
-	u64 mmio_atsd = 0;
-	u64 tgt = 0;
-	u32 link_speed = 0xff;
-
-	/*
-	 * PCI config space does not tell us about NVLink presense but
-	 * platform does, use this.
-	 */
-	if (!pnv_pci_get_gpu_dev(vdev->pdev))
-		return -ENODEV;
-
-	if (of_property_read_u32(npu_node, "memory-region", &mem_phandle))
-		return -ENODEV;
-
-	/*
-	 * NPU2 normally has 8 ATSD registers (for concurrency) and 6 links
-	 * so we can allocate one register per link, using nvlink index as
-	 * a key.
-	 * There is always at least one ATSD register so as long as at least
-	 * NVLink bridge #0 is passed to the guest, ATSD will be available.
-	 */
-	nvlink_dn = of_parse_phandle(npdev->dev.of_node, "ibm,nvlink", 0);
-	if (WARN_ON(of_property_read_u32(nvlink_dn, "ibm,npu-link-index",
-			&nvlink_index)))
-		return -ENODEV;
-
-	if (of_property_read_u64_index(hose->dn, "ibm,mmio-atsd", nvlink_index,
-			&mmio_atsd)) {
-		if (of_property_read_u64_index(hose->dn, "ibm,mmio-atsd", 0,
-				&mmio_atsd)) {
-			dev_warn(&vdev->pdev->dev, "No available ATSD found\n");
-			mmio_atsd = 0;
-		} else {
-			dev_warn(&vdev->pdev->dev,
-				 "Using fallback ibm,mmio-atsd[0] for ATSD.\n");
-		}
-	}
-
-	if (of_property_read_u64(npu_node, "ibm,device-tgt-addr", &tgt)) {
-		dev_warn(&vdev->pdev->dev, "No ibm,device-tgt-addr found\n");
-		return -EFAULT;
-	}
-
-	if (of_property_read_u32(npu_node, "ibm,nvlink-speed", &link_speed)) {
-		dev_warn(&vdev->pdev->dev, "No ibm,nvlink-speed found\n");
-		return -EFAULT;
-	}
-
-	data = kzalloc(sizeof(*data), GFP_KERNEL);
-	if (!data)
-		return -ENOMEM;
-
-	data->mmio_atsd = mmio_atsd;
-	data->gpu_tgt = tgt;
-	data->link_speed = link_speed;
-	if (data->mmio_atsd) {
-		data->base = memremap(data->mmio_atsd, SZ_64K, MEMREMAP_WT);
-		if (!data->base) {
-			ret = -ENOMEM;
-			goto free_exit;
-		}
-	}
-
-	/*
-	 * We want to expose the capability even if this specific NVLink
-	 * did not get its own ATSD register because capabilities
-	 * belong to VFIO regions and normally there will be ATSD register
-	 * assigned to the NVLink bridge.
-	 */
-	ret = vfio_pci_register_dev_region(vdev,
-			PCI_VENDOR_ID_IBM |
-			VFIO_REGION_TYPE_PCI_VENDOR_TYPE,
-			VFIO_REGION_SUBTYPE_IBM_NVLINK2_ATSD,
-			&vfio_pci_npu2_regops,
-			data->mmio_atsd ? PAGE_SIZE : 0,
-			VFIO_REGION_INFO_FLAG_READ |
-			VFIO_REGION_INFO_FLAG_WRITE |
-			VFIO_REGION_INFO_FLAG_MMAP,
-			data);
-	if (ret)
-		goto free_exit;
-
-	return 0;
-
-free_exit:
-	if (data->base)
-		memunmap(data->base);
-	kfree(data);
-
-	return ret;
-}
diff --git a/drivers/vfio/pci/vfio_pci_private.h b/drivers/vfio/pci/vfio_pci_private.h
index 9cd1882a05af..cdae2e4cf11c 100644
--- a/drivers/vfio/pci/vfio_pci_private.h
+++ b/drivers/vfio/pci/vfio_pci_private.h
@@ -199,20 +199,6 @@ static inline int vfio_pci_igd_init(struct vfio_pci_device *vdev)
 	return -ENODEV;
 }
 #endif
-#ifdef CONFIG_VFIO_PCI_NVLINK2
-extern int vfio_pci_nvdia_v100_nvlink2_init(struct vfio_pci_device *vdev);
-extern int vfio_pci_ibm_npu2_init(struct vfio_pci_device *vdev);
-#else
-static inline int vfio_pci_nvdia_v100_nvlink2_init(struct vfio_pci_device *vdev)
-{
-	return -ENODEV;
-}
-
-static inline int vfio_pci_ibm_npu2_init(struct vfio_pci_device *vdev)
-{
-	return -ENODEV;
-}
-#endif
 
 #ifdef CONFIG_S390
 extern int vfio_pci_info_zdev_add_caps(struct vfio_pci_device *vdev,
diff --git a/include/uapi/linux/vfio.h b/include/uapi/linux/vfio.h
index 8ce36c1d53ca..34b1f53a3901 100644
--- a/include/uapi/linux/vfio.h
+++ b/include/uapi/linux/vfio.h
@@ -333,17 +333,10 @@ struct vfio_region_info_cap_type {
 #define VFIO_REGION_SUBTYPE_INTEL_IGD_LPC_CFG	(3)
 
 /* 10de vendor PCI sub-types */
-/*
- * NVIDIA GPU NVlink2 RAM is coherent RAM mapped onto the host address space.
- */
-#define VFIO_REGION_SUBTYPE_NVIDIA_NVLINK2_RAM	(1)
+/* subtype 1 was VFIO_REGION_SUBTYPE_NVIDIA_NVLINK2_RAM, don't use */
 
 /* 1014 vendor PCI sub-types */
-/*
- * IBM NPU NVlink2 ATSD (Address Translation Shootdown) register of NPU
- * to do TLB invalidation on a GPU.
- */
-#define VFIO_REGION_SUBTYPE_IBM_NVLINK2_ATSD	(1)
+/* subtype 1 was VFIO_REGION_SUBTYPE_IBM_NVLINK2_ATSD, don't use */
 
 /* sub-types for VFIO_REGION_TYPE_GFX */
 #define VFIO_REGION_SUBTYPE_GFX_EDID            (1)
@@ -637,32 +630,9 @@ struct vfio_device_migration_info {
  */
 #define VFIO_REGION_INFO_CAP_MSIX_MAPPABLE	3
 
-/*
- * Capability with compressed real address (aka SSA - small system address)
- * where GPU RAM is mapped on a system bus. Used by a GPU for DMA routing
- * and by the userspace to associate a NVLink bridge with a GPU.
- */
-#define VFIO_REGION_INFO_CAP_NVLINK2_SSATGT	4
-
-struct vfio_region_info_cap_nvlink2_ssatgt {
-	struct vfio_info_cap_header header;
-	__u64 tgt;
-};
+/* subtype 4 was VFIO_REGION_INFO_CAP_NVLINK2_SSATGT, don't use */
 
-/*
- * Capability with an NVLink link speed. The value is read by
- * the NVlink2 bridge driver from the bridge's "ibm,nvlink-speed"
- * property in the device tree. The value is fixed in the hardware
- * and failing to provide the correct value results in the link
- * not working with no indication from the driver why.
- */
-#define VFIO_REGION_INFO_CAP_NVLINK2_LNKSPD	5
-
-struct vfio_region_info_cap_nvlink2_lnkspd {
-	struct vfio_info_cap_header header;
-	__u32 link_speed;
-	__u32 __pad;
-};
+/* subtype 5 was VFIO_REGION_INFO_CAP_NVLINK2_LNKSPD, don't use */
 
 /**
  * VFIO_DEVICE_GET_IRQ_INFO - _IOWR(VFIO_TYPE, VFIO_BASE + 9,
-- 
cgit v1.2.3-71-gd317


From 3bf725699bf62494b3e179f1795f08c7d749f061 Mon Sep 17 00:00:00 2001
From: Jianyong Wu <jianyong.wu@arm.com>
Date: Wed, 9 Dec 2020 14:09:29 +0800
Subject: KVM: arm64: Add support for the KVM PTP service

Implement the hypervisor side of the KVM PTP interface.

The service offers wall time and cycle count from host to guest.
The caller must specify whether they want the host's view of
either the virtual or physical counter.

Signed-off-by: Jianyong Wu <jianyong.wu@arm.com>
Signed-off-by: Marc Zyngier <maz@kernel.org>
Link: https://lore.kernel.org/r/20201209060932.212364-7-jianyong.wu@arm.com
---
 Documentation/virt/kvm/api.rst         | 10 +++++++
 Documentation/virt/kvm/arm/index.rst   |  1 +
 Documentation/virt/kvm/arm/ptp_kvm.rst | 25 ++++++++++++++++
 arch/arm64/kvm/arm.c                   |  1 +
 arch/arm64/kvm/hypercalls.c            | 53 ++++++++++++++++++++++++++++++++++
 include/linux/arm-smccc.h              | 16 ++++++++++
 include/uapi/linux/kvm.h               |  1 +
 7 files changed, 107 insertions(+)
 create mode 100644 Documentation/virt/kvm/arm/ptp_kvm.rst

(limited to 'include/uapi/linux')

diff --git a/Documentation/virt/kvm/api.rst b/Documentation/virt/kvm/api.rst
index 38e327d4b479..987d99e39887 100644
--- a/Documentation/virt/kvm/api.rst
+++ b/Documentation/virt/kvm/api.rst
@@ -6724,3 +6724,13 @@ vcpu_info is set.
 The KVM_XEN_HVM_CONFIG_RUNSTATE flag indicates that the runstate-related
 features KVM_XEN_VCPU_ATTR_TYPE_RUNSTATE_ADDR/_CURRENT/_DATA/_ADJUST are
 supported by the KVM_XEN_VCPU_SET_ATTR/KVM_XEN_VCPU_GET_ATTR ioctls.
+
+8.31 KVM_CAP_PTP_KVM
+--------------------
+
+:Architectures: arm64
+
+This capability indicates that the KVM virtual PTP service is
+supported in the host. A VMM can check whether the service is
+available to the guest on migration.
+
diff --git a/Documentation/virt/kvm/arm/index.rst b/Documentation/virt/kvm/arm/index.rst
index 3e2b2aba90fc..78a9b670aafe 100644
--- a/Documentation/virt/kvm/arm/index.rst
+++ b/Documentation/virt/kvm/arm/index.rst
@@ -10,3 +10,4 @@ ARM
    hyp-abi
    psci
    pvtime
+   ptp_kvm
diff --git a/Documentation/virt/kvm/arm/ptp_kvm.rst b/Documentation/virt/kvm/arm/ptp_kvm.rst
new file mode 100644
index 000000000000..68cffb50d8bf
--- /dev/null
+++ b/Documentation/virt/kvm/arm/ptp_kvm.rst
@@ -0,0 +1,25 @@
+.. SPDX-License-Identifier: GPL-2.0
+
+PTP_KVM support for arm/arm64
+=============================
+
+PTP_KVM is used for high precision time sync between host and guests.
+It relies on transferring the wall clock and counter value from the
+host to the guest using a KVM-specific hypercall.
+
+* ARM_SMCCC_HYP_KVM_PTP_FUNC_ID: 0x86000001
+
+This hypercall uses the SMC32/HVC32 calling convention:
+
+ARM_SMCCC_HYP_KVM_PTP_FUNC_ID
+    =============    ==========    ==========
+    Function ID:     (uint32)      0x86000001
+    Arguments:       (uint32)      KVM_PTP_VIRT_COUNTER(0)
+                                   KVM_PTP_PHYS_COUNTER(1)
+    Return Values:   (int32)       NOT_SUPPORTED(-1) on error, or
+                     (uint32)      Upper 32 bits of wall clock time (r0)
+                     (uint32)      Lower 32 bits of wall clock time (r1)
+                     (uint32)      Upper 32 bits of counter (r2)
+                     (uint32)      Lower 32 bits of counter (r3)
+    Endianness:                    No Restrictions.
+    =============    ==========    ==========
diff --git a/arch/arm64/kvm/arm.c b/arch/arm64/kvm/arm.c
index 7f06ba76698d..46401798c644 100644
--- a/arch/arm64/kvm/arm.c
+++ b/arch/arm64/kvm/arm.c
@@ -206,6 +206,7 @@ int kvm_vm_ioctl_check_extension(struct kvm *kvm, long ext)
 	case KVM_CAP_ARM_INJECT_EXT_DABT:
 	case KVM_CAP_SET_GUEST_DEBUG:
 	case KVM_CAP_VCPU_ATTRIBUTES:
+	case KVM_CAP_PTP_KVM:
 		r = 1;
 		break;
 	case KVM_CAP_ARM_SET_DEVICE_ADDR:
diff --git a/arch/arm64/kvm/hypercalls.c b/arch/arm64/kvm/hypercalls.c
index 78d32c34d49c..30da78f72b3b 100644
--- a/arch/arm64/kvm/hypercalls.c
+++ b/arch/arm64/kvm/hypercalls.c
@@ -9,6 +9,55 @@
 #include <kvm/arm_hypercalls.h>
 #include <kvm/arm_psci.h>
 
+static void kvm_ptp_get_time(struct kvm_vcpu *vcpu, u64 *val)
+{
+	struct system_time_snapshot systime_snapshot;
+	u64 cycles = ~0UL;
+	u32 feature;
+
+	/*
+	 * system time and counter value must captured at the same
+	 * time to keep consistency and precision.
+	 */
+	ktime_get_snapshot(&systime_snapshot);
+
+	/*
+	 * This is only valid if the current clocksource is the
+	 * architected counter, as this is the only one the guest
+	 * can see.
+	 */
+	if (systime_snapshot.cs_id != CSID_ARM_ARCH_COUNTER)
+		return;
+
+	/*
+	 * The guest selects one of the two reference counters
+	 * (virtual or physical) with the first argument of the SMCCC
+	 * call. In case the identifier is not supported, error out.
+	 */
+	feature = smccc_get_arg1(vcpu);
+	switch (feature) {
+	case KVM_PTP_VIRT_COUNTER:
+		cycles = systime_snapshot.cycles - vcpu_read_sys_reg(vcpu, CNTVOFF_EL2);
+		break;
+	case KVM_PTP_PHYS_COUNTER:
+		cycles = systime_snapshot.cycles;
+		break;
+	default:
+		return;
+	}
+
+	/*
+	 * This relies on the top bit of val[0] never being set for
+	 * valid values of system time, because that is *really* far
+	 * in the future (about 292 years from 1970, and at that stage
+	 * nobody will give a damn about it).
+	 */
+	val[0] = upper_32_bits(systime_snapshot.real);
+	val[1] = lower_32_bits(systime_snapshot.real);
+	val[2] = upper_32_bits(cycles);
+	val[3] = lower_32_bits(cycles);
+}
+
 int kvm_hvc_call_handler(struct kvm_vcpu *vcpu)
 {
 	u32 func_id = smccc_get_function(vcpu);
@@ -79,6 +128,10 @@ int kvm_hvc_call_handler(struct kvm_vcpu *vcpu)
 		break;
 	case ARM_SMCCC_VENDOR_HYP_KVM_FEATURES_FUNC_ID:
 		val[0] = BIT(ARM_SMCCC_KVM_FUNC_FEATURES);
+		val[0] |= BIT(ARM_SMCCC_KVM_FUNC_PTP);
+		break;
+	case ARM_SMCCC_VENDOR_HYP_KVM_PTP_FUNC_ID:
+		kvm_ptp_get_time(vcpu, val);
 		break;
 	case ARM_SMCCC_TRNG_VERSION:
 	case ARM_SMCCC_TRNG_FEATURES:
diff --git a/include/linux/arm-smccc.h b/include/linux/arm-smccc.h
index 1a27bd9493fe..6861489a1890 100644
--- a/include/linux/arm-smccc.h
+++ b/include/linux/arm-smccc.h
@@ -103,6 +103,7 @@
 
 /* KVM "vendor specific" services */
 #define ARM_SMCCC_KVM_FUNC_FEATURES		0
+#define ARM_SMCCC_KVM_FUNC_PTP			1
 #define ARM_SMCCC_KVM_FUNC_FEATURES_2		127
 #define ARM_SMCCC_KVM_NUM_FUNCS			128
 
@@ -114,6 +115,21 @@
 
 #define SMCCC_ARCH_WORKAROUND_RET_UNAFFECTED	1
 
+/*
+ * ptp_kvm is a feature used for time sync between vm and host.
+ * ptp_kvm module in guest kernel will get service from host using
+ * this hypercall ID.
+ */
+#define ARM_SMCCC_VENDOR_HYP_KVM_PTP_FUNC_ID				\
+	ARM_SMCCC_CALL_VAL(ARM_SMCCC_FAST_CALL,				\
+			   ARM_SMCCC_SMC_32,				\
+			   ARM_SMCCC_OWNER_VENDOR_HYP,			\
+			   ARM_SMCCC_KVM_FUNC_PTP)
+
+/* ptp_kvm counter type ID */
+#define KVM_PTP_VIRT_COUNTER			0
+#define KVM_PTP_PHYS_COUNTER			1
+
 /* Paravirtualised time calls (defined by ARM DEN0057A) */
 #define ARM_SMCCC_HV_PV_TIME_FEATURES				\
 	ARM_SMCCC_CALL_VAL(ARM_SMCCC_FAST_CALL,			\
diff --git a/include/uapi/linux/kvm.h b/include/uapi/linux/kvm.h
index f6afee209620..0e0f70c0d0dc 100644
--- a/include/uapi/linux/kvm.h
+++ b/include/uapi/linux/kvm.h
@@ -1078,6 +1078,7 @@ struct kvm_ppc_resize_hpt {
 #define KVM_CAP_DIRTY_LOG_RING 192
 #define KVM_CAP_X86_BUS_LOCK_EXIT 193
 #define KVM_CAP_PPC_DAWR1 194
+#define KVM_CAP_PTP_KVM 195
 
 #ifdef KVM_CAP_IRQ_ROUTING
 
-- 
cgit v1.2.3-71-gd317


From f0ebc2b6b7df7716749445cda26734b3826a48cf Mon Sep 17 00:00:00 2001
From: Jakub Kicinski <kuba@kernel.org>
Date: Tue, 6 Apr 2021 17:28:25 -0700
Subject: ethtool: un-kdocify extended link state

Extended link state structures and enums use kdoc headers
but then do not describe any of the members.

Convert to normal comments.

Signed-off-by: Jakub Kicinski <kuba@kernel.org>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 include/linux/ethtool.h      |  4 +---
 include/uapi/linux/ethtool.h | 26 ++++++--------------------
 2 files changed, 7 insertions(+), 23 deletions(-)

(limited to 'include/uapi/linux')

diff --git a/include/linux/ethtool.h b/include/linux/ethtool.h
index ec4cd3921c67..a2b1a21ee7fd 100644
--- a/include/linux/ethtool.h
+++ b/include/linux/ethtool.h
@@ -87,9 +87,7 @@ u32 ethtool_op_get_link(struct net_device *dev);
 int ethtool_op_get_ts_info(struct net_device *dev, struct ethtool_ts_info *eti);
 
 
-/**
- * struct ethtool_link_ext_state_info - link extended state and substate.
- */
+/* Link extended state and substate. */
 struct ethtool_link_ext_state_info {
 	enum ethtool_link_ext_state link_ext_state;
 	union {
diff --git a/include/uapi/linux/ethtool.h b/include/uapi/linux/ethtool.h
index cde753bb2093..dc87ba092891 100644
--- a/include/uapi/linux/ethtool.h
+++ b/include/uapi/linux/ethtool.h
@@ -579,9 +579,7 @@ struct ethtool_pauseparam {
 	__u32	tx_pause;
 };
 
-/**
- * enum ethtool_link_ext_state - link extended state
- */
+/* Link extended state */
 enum ethtool_link_ext_state {
 	ETHTOOL_LINK_EXT_STATE_AUTONEG,
 	ETHTOOL_LINK_EXT_STATE_LINK_TRAINING_FAILURE,
@@ -595,10 +593,7 @@ enum ethtool_link_ext_state {
 	ETHTOOL_LINK_EXT_STATE_OVERHEAT,
 };
 
-/**
- * enum ethtool_link_ext_substate_autoneg - more information in addition to
- * ETHTOOL_LINK_EXT_STATE_AUTONEG.
- */
+/* More information in addition to ETHTOOL_LINK_EXT_STATE_AUTONEG. */
 enum ethtool_link_ext_substate_autoneg {
 	ETHTOOL_LINK_EXT_SUBSTATE_AN_NO_PARTNER_DETECTED = 1,
 	ETHTOOL_LINK_EXT_SUBSTATE_AN_ACK_NOT_RECEIVED,
@@ -608,9 +603,7 @@ enum ethtool_link_ext_substate_autoneg {
 	ETHTOOL_LINK_EXT_SUBSTATE_AN_NO_HCD,
 };
 
-/**
- * enum ethtool_link_ext_substate_link_training - more information in addition to
- * ETHTOOL_LINK_EXT_STATE_LINK_TRAINING_FAILURE.
+/* More information in addition to ETHTOOL_LINK_EXT_STATE_LINK_TRAINING_FAILURE.
  */
 enum ethtool_link_ext_substate_link_training {
 	ETHTOOL_LINK_EXT_SUBSTATE_LT_KR_FRAME_LOCK_NOT_ACQUIRED = 1,
@@ -619,9 +612,7 @@ enum ethtool_link_ext_substate_link_training {
 	ETHTOOL_LINK_EXT_SUBSTATE_LT_REMOTE_FAULT,
 };
 
-/**
- * enum ethtool_link_ext_substate_logical_mismatch - more information in addition
- * to ETHTOOL_LINK_EXT_STATE_LINK_LOGICAL_MISMATCH.
+/* More information in addition to ETHTOOL_LINK_EXT_STATE_LINK_LOGICAL_MISMATCH.
  */
 enum ethtool_link_ext_substate_link_logical_mismatch {
 	ETHTOOL_LINK_EXT_SUBSTATE_LLM_PCS_DID_NOT_ACQUIRE_BLOCK_LOCK = 1,
@@ -631,19 +622,14 @@ enum ethtool_link_ext_substate_link_logical_mismatch {
 	ETHTOOL_LINK_EXT_SUBSTATE_LLM_RS_FEC_IS_NOT_LOCKED,
 };
 
-/**
- * enum ethtool_link_ext_substate_bad_signal_integrity - more information in
- * addition to ETHTOOL_LINK_EXT_STATE_BAD_SIGNAL_INTEGRITY.
+/* More information in addition to ETHTOOL_LINK_EXT_STATE_BAD_SIGNAL_INTEGRITY.
  */
 enum ethtool_link_ext_substate_bad_signal_integrity {
 	ETHTOOL_LINK_EXT_SUBSTATE_BSI_LARGE_NUMBER_OF_PHYSICAL_ERRORS = 1,
 	ETHTOOL_LINK_EXT_SUBSTATE_BSI_UNSUPPORTED_RATE,
 };
 
-/**
- * enum ethtool_link_ext_substate_cable_issue - more information in
- * addition to ETHTOOL_LINK_EXT_STATE_CABLE_ISSUE.
- */
+/* More information in addition to ETHTOOL_LINK_EXT_STATE_CABLE_ISSUE. */
 enum ethtool_link_ext_substate_cable_issue {
 	ETHTOOL_LINK_EXT_SUBSTATE_CI_UNSUPPORTED_CABLE = 1,
 	ETHTOOL_LINK_EXT_SUBSTATE_CI_CABLE_TEST_FAILURE,
-- 
cgit v1.2.3-71-gd317


From 83e5feeb385e6d69ae95dd30e0c424afaa17cc6b Mon Sep 17 00:00:00 2001
From: Jakub Kicinski <kuba@kernel.org>
Date: Tue, 6 Apr 2021 17:28:26 -0700
Subject: ethtool: document reserved fields in the uAPI

Add a note on expected handling of reserved fields,
and references to all kdocs. This fixes a bunch
of kdoc warnings.

Signed-off-by: Jakub Kicinski <kuba@kernel.org>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 include/uapi/linux/ethtool.h | 22 +++++++++++++++++++++-
 1 file changed, 21 insertions(+), 1 deletion(-)

(limited to 'include/uapi/linux')

diff --git a/include/uapi/linux/ethtool.h b/include/uapi/linux/ethtool.h
index dc87ba092891..c9c18e88c215 100644
--- a/include/uapi/linux/ethtool.h
+++ b/include/uapi/linux/ethtool.h
@@ -26,6 +26,14 @@
  * have the same layout for 32-bit and 64-bit userland.
  */
 
+/* Note on reserved space.
+ * Reserved fields must not be accessed directly by user space because
+ * they may be replaced by a different field in the future. They must
+ * be initialized to zero before making the request, e.g. via memset
+ * of the entire structure or implicitly by not being set in a structure
+ * initializer.
+ */
+
 /**
  * struct ethtool_cmd - DEPRECATED, link control and status
  * This structure is DEPRECATED, please use struct ethtool_link_settings.
@@ -67,6 +75,7 @@
  *	and other link features that the link partner advertised
  *	through autonegotiation; 0 if unknown or not applicable.
  *	Read-only.
+ * @reserved: Reserved for future use; see the note on reserved space.
  *
  * The link speed in Mbps is split between @speed and @speed_hi.  Use
  * the ethtool_cmd_speed() and ethtool_cmd_speed_set() functions to
@@ -155,6 +164,7 @@ static inline __u32 ethtool_cmd_speed(const struct ethtool_cmd *ep)
  * @bus_info: Device bus address.  This should match the dev_name()
  *	string for the underlying bus device, if there is one.  May be
  *	an empty string.
+ * @reserved2: Reserved for future use; see the note on reserved space.
  * @n_priv_flags: Number of flags valid for %ETHTOOL_GPFLAGS and
  *	%ETHTOOL_SPFLAGS commands; also the number of strings in the
  *	%ETH_SS_PRIV_FLAGS set
@@ -356,6 +366,7 @@ struct ethtool_eeprom {
  * @tx_lpi_timer: Time in microseconds the interface delays prior to asserting
  *	its tx lpi (after reaching 'idle' state). Effective only when eee
  *	was negotiated and tx_lpi_enabled was set.
+ * @reserved: Reserved for future use; see the note on reserved space.
  */
 struct ethtool_eee {
 	__u32	cmd;
@@ -374,6 +385,7 @@ struct ethtool_eee {
  * @cmd: %ETHTOOL_GMODULEINFO
  * @type: Standard the module information conforms to %ETH_MODULE_SFF_xxxx
  * @eeprom_len: Length of the eeprom
+ * @reserved: Reserved for future use; see the note on reserved space.
  *
  * This structure is used to return the information to
  * properly size memory for a subsequent call to %ETHTOOL_GMODULEEEPROM.
@@ -701,6 +713,7 @@ struct ethtool_gstrings {
 /**
  * struct ethtool_sset_info - string set information
  * @cmd: Command number = %ETHTOOL_GSSET_INFO
+ * @reserved: Reserved for future use; see the note on reserved space.
  * @sset_mask: On entry, a bitmask of string sets to query, with bits
  *	numbered according to &enum ethtool_stringset.  On return, a
  *	bitmask of those string sets queried that are supported.
@@ -745,6 +758,7 @@ enum ethtool_test_flags {
  * @flags: A bitmask of flags from &enum ethtool_test_flags.  Some
  *	flags may be set by the user on entry; others may be set by
  *	the driver on return.
+ * @reserved: Reserved for future use; see the note on reserved space.
  * @len: On return, the number of test results
  * @data: Array of test results
  *
@@ -945,6 +959,7 @@ union ethtool_flow_union {
  * @vlan_etype: VLAN EtherType
  * @vlan_tci: VLAN tag control information
  * @data: user defined data
+ * @padding: Reserved for future use; see the note on reserved space.
  *
  * Note, @vlan_etype, @vlan_tci, and @data are only valid if %FLOW_EXT
  * is set in &struct ethtool_rx_flow_spec @flow_type.
@@ -1120,7 +1135,8 @@ struct ethtool_rxfh_indir {
  *	hardware hash key.
  * @hfunc: Defines the current RSS hash function used by HW (or to be set to).
  *	Valid values are one of the %ETH_RSS_HASH_*.
- * @rsvd:	Reserved for future extensions.
+ * @rsvd8: Reserved for future use; see the note on reserved space.
+ * @rsvd32: Reserved for future use; see the note on reserved space.
  * @rss_config: RX ring/queue index for each hash value i.e., indirection table
  *	of @indir_size __u32 elements, followed by hash key of @key_size
  *	bytes.
@@ -1288,7 +1304,9 @@ struct ethtool_sfeatures {
  * @so_timestamping: bit mask of the sum of the supported SO_TIMESTAMPING flags
  * @phc_index: device index of the associated PHC, or -1 if there is none
  * @tx_types: bit mask of the supported hwtstamp_tx_types enumeration values
+ * @tx_reserved: Reserved for future use; see the note on reserved space.
  * @rx_filters: bit mask of the supported hwtstamp_rx_filters enumeration values
+ * @rx_reserved: Reserved for future use; see the note on reserved space.
  *
  * The bits in the 'tx_types' and 'rx_filters' fields correspond to
  * the 'hwtstamp_tx_types' and 'hwtstamp_rx_filters' enumeration values,
@@ -1944,6 +1962,8 @@ enum ethtool_reset_flags {
  *	autonegotiation; 0 if unknown or not applicable.  Read-only.
  * @transceiver: Used to distinguish different possible PHY types,
  *	reported consistently by PHYLIB.  Read-only.
+ * @reserved: Reserved for future use; see the note on reserved space.
+ * @reserved1: Reserved for future use; see the note on reserved space.
  *
  * If autonegotiation is disabled, the speed and @duplex represent the
  * fixed link mode and are writable if the driver supports multiple
-- 
cgit v1.2.3-71-gd317


From d9c65de0c1e1574d2cc8007dbe02291fe47db1d9 Mon Sep 17 00:00:00 2001
From: Jakub Kicinski <kuba@kernel.org>
Date: Tue, 6 Apr 2021 17:28:27 -0700
Subject: ethtool: fix kdoc in headers

Fix remaining issues with kdoc in the ethtool headers.

Signed-off-by: Jakub Kicinski <kuba@kernel.org>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 include/linux/ethtool.h      | 9 +++++++--
 include/uapi/linux/ethtool.h | 6 ++++++
 2 files changed, 13 insertions(+), 2 deletions(-)

(limited to 'include/uapi/linux')

diff --git a/include/linux/ethtool.h b/include/linux/ethtool.h
index a2b1a21ee7fd..7c88dfff7420 100644
--- a/include/linux/ethtool.h
+++ b/include/linux/ethtool.h
@@ -290,6 +290,9 @@ struct ethtool_pause_stats {
  *	do not attach ext_substate attribute to netlink message). If link_ext_state
  *	and link_ext_substate are unknown, return -ENODATA. If not implemented,
  *	link_ext_state and link_ext_substate will not be sent to userspace.
+ * @get_eeprom_len: Read range of EEPROM addresses for validation of
+ *	@get_eeprom and @set_eeprom requests.
+ *	Returns 0 if device does not support EEPROM access.
  * @get_eeprom: Read data from the device EEPROM.
  *	Should fill in the magic field.  Don't need to check len for zero
  *	or wraparound.  Fill in the data argument with the eeprom values
@@ -382,6 +385,8 @@ struct ethtool_pause_stats {
  * @get_module_eeprom: Get the eeprom information from the plug-in module
  * @get_eee: Get Energy-Efficient (EEE) supported and status.
  * @set_eee: Set EEE status (enable/disable) as well as LPI timers.
+ * @get_tunable: Read the value of a driver / device tunable.
+ * @set_tunable: Set the value of a driver / device tunable.
  * @get_per_queue_coalesce: Get interrupt coalescing parameters per queue.
  *	It must check that the given queue number is valid. If neither a RX nor
  *	a TX queue has this number, return -EINVAL. If only a RX queue or a TX
@@ -545,8 +550,8 @@ struct phy_tdr_config;
  * @get_sset_count: Get number of strings that @get_strings will write.
  * @get_strings: Return a set of strings that describe the requested objects
  * @get_stats: Return extended statistics about the PHY device.
- * @start_cable_test - Start a cable test
- * @start_cable_test_tdr - Start a Time Domain Reflectometry cable test
+ * @start_cable_test: Start a cable test
+ * @start_cable_test_tdr: Start a Time Domain Reflectometry cable test
  *
  * All operations are optional (i.e. the function pointer may be set to %NULL)
  * and callers must take this into account. Callers must hold the RTNL lock.
diff --git a/include/uapi/linux/ethtool.h b/include/uapi/linux/ethtool.h
index c9c18e88c215..5afea692a3f7 100644
--- a/include/uapi/linux/ethtool.h
+++ b/include/uapi/linux/ethtool.h
@@ -659,6 +659,7 @@ enum ethtool_link_ext_substate_cable_issue {
  *	now deprecated
  * @ETH_SS_FEATURES: Device feature names
  * @ETH_SS_RSS_HASH_FUNCS: RSS hush function names
+ * @ETH_SS_TUNABLES: tunable names
  * @ETH_SS_PHY_STATS: Statistic names, for use with %ETHTOOL_GPHYSTATS
  * @ETH_SS_PHY_TUNABLES: PHY tunable names
  * @ETH_SS_LINK_MODES: link mode names
@@ -668,6 +669,8 @@ enum ethtool_link_ext_substate_cable_issue {
  * @ETH_SS_TS_TX_TYPES: timestamping Tx types
  * @ETH_SS_TS_RX_FILTERS: timestamping Rx filters
  * @ETH_SS_UDP_TUNNEL_TYPES: UDP tunnel types
+ *
+ * @ETH_SS_COUNT: number of defined string sets
  */
 enum ethtool_stringset {
 	ETH_SS_TEST		= 0,
@@ -1962,8 +1965,11 @@ enum ethtool_reset_flags {
  *	autonegotiation; 0 if unknown or not applicable.  Read-only.
  * @transceiver: Used to distinguish different possible PHY types,
  *	reported consistently by PHYLIB.  Read-only.
+ * @master_slave_cfg: Master/slave port mode.
+ * @master_slave_state: Master/slave port state.
  * @reserved: Reserved for future use; see the note on reserved space.
  * @reserved1: Reserved for future use; see the note on reserved space.
+ * @link_mode_masks: Variable length bitmaps.
  *
  * If autonegotiation is disabled, the speed and @duplex represent the
  * fixed link mode and are writable if the driver supports multiple
-- 
cgit v1.2.3-71-gd317


From 71826654ce40112f0651b6f4e94c422354f4adb6 Mon Sep 17 00:00:00 2001
From: Johannes Berg <johannes.berg@intel.com>
Date: Fri, 19 Mar 2021 23:25:11 +0100
Subject: rfkill: revert back to old userspace API by default

Recompiling with the new extended version of struct rfkill_event
broke systemd in *two* ways:
 - It used "sizeof(struct rfkill_event)" to read the event, but
   then complained if it actually got something != 8, this broke
   it on new kernels (that include the updated API);
 - It used sizeof(struct rfkill_event) to write a command, but
   didn't implement the intended expansion protocol where the
   kernel returns only how many bytes it accepted, and errored
   out due to the unexpected smaller size on kernels that didn't
   include the updated API.

Even though systemd has now been fixed, that fix may not be always
deployed, and other applications could potentially have similar
issues.

As such, in the interest of avoiding regressions, revert the
default API "struct rfkill_event" back to the original size.

Instead, add a new "struct rfkill_event_ext" that extends it by
the new field, and even more clearly document that applications
should be prepared for extensions in two ways:
 * write might only accept fewer bytes on older kernels, and
   will return how many to let userspace know which data may
   have been ignored;
 * read might return anything between 8 (the original size) and
   whatever size the application sized its buffer at, indicating
   how much event data was supported by the kernel.

Perhaps that will help avoid such issues in the future and we
won't have to come up with another version of the struct if we
ever need to extend it again.

Applications that want to take advantage of the new field will
have to be modified to use struct rfkill_event_ext instead now,
which comes with the danger of them having already been updated
to use it from 'struct rfkill_event', but I found no evidence
of that, and it's still relatively new.

Cc: stable@vger.kernel.org # 5.11
Reported-by: Takashi Iwai <tiwai@suse.de>
Tested-by: Sedat Dilek <sedat.dilek@gmail.com> # LLVM/Clang v12.0.0-r4 (x86-64)
Link: https://lore.kernel.org/r/20210319232510.f1a139cfdd9c.Ic5c7c9d1d28972059e132ea653a21a427c326678@changeid
Signed-off-by: Johannes Berg <johannes.berg@intel.com>
---
 include/uapi/linux/rfkill.h | 80 ++++++++++++++++++++++++++++++++++++++-------
 net/rfkill/core.c           |  7 ++--
 2 files changed, 72 insertions(+), 15 deletions(-)

(limited to 'include/uapi/linux')

diff --git a/include/uapi/linux/rfkill.h b/include/uapi/linux/rfkill.h
index 03e8af87b364..9b77cfc42efa 100644
--- a/include/uapi/linux/rfkill.h
+++ b/include/uapi/linux/rfkill.h
@@ -86,34 +86,90 @@ enum rfkill_hard_block_reasons {
  * @op: operation code
  * @hard: hard state (0/1)
  * @soft: soft state (0/1)
+ *
+ * Structure used for userspace communication on /dev/rfkill,
+ * used for events from the kernel and control to the kernel.
+ */
+struct rfkill_event {
+	__u32 idx;
+	__u8  type;
+	__u8  op;
+	__u8  soft;
+	__u8  hard;
+} __attribute__((packed));
+
+/**
+ * struct rfkill_event_ext - events for userspace on /dev/rfkill
+ * @idx: index of dev rfkill
+ * @type: type of the rfkill struct
+ * @op: operation code
+ * @hard: hard state (0/1)
+ * @soft: soft state (0/1)
  * @hard_block_reasons: valid if hard is set. One or several reasons from
  *	&enum rfkill_hard_block_reasons.
  *
  * Structure used for userspace communication on /dev/rfkill,
  * used for events from the kernel and control to the kernel.
+ *
+ * See the extensibility docs below.
  */
-struct rfkill_event {
+struct rfkill_event_ext {
 	__u32 idx;
 	__u8  type;
 	__u8  op;
 	__u8  soft;
 	__u8  hard;
+
+	/*
+	 * older kernels will accept/send only up to this point,
+	 * and if extended further up to any chunk marked below
+	 */
+
 	__u8  hard_block_reasons;
 } __attribute__((packed));
 
-/*
- * We are planning to be backward and forward compatible with changes
- * to the event struct, by adding new, optional, members at the end.
- * When reading an event (whether the kernel from userspace or vice
- * versa) we need to accept anything that's at least as large as the
- * version 1 event size, but might be able to accept other sizes in
- * the future.
+/**
+ * DOC: Extensibility
+ *
+ * Originally, we had planned to allow backward and forward compatible
+ * changes by just adding fields at the end of the structure that are
+ * then not reported on older kernels on read(), and not written to by
+ * older kernels on write(), with the kernel reporting the size it did
+ * accept as the result.
+ *
+ * This would have allowed userspace to detect on read() and write()
+ * which kernel structure version it was dealing with, and if was just
+ * recompiled it would have gotten the new fields, but obviously not
+ * accessed them, but things should've continued to work.
+ *
+ * Unfortunately, while actually exercising this mechanism to add the
+ * hard block reasons field, we found that userspace (notably systemd)
+ * did all kinds of fun things not in line with this scheme:
+ *
+ * 1. treat the (expected) short writes as an error;
+ * 2. ask to read sizeof(struct rfkill_event) but then compare the
+ *    actual return value to RFKILL_EVENT_SIZE_V1 and treat any
+ *    mismatch as an error.
+ *
+ * As a consequence, just recompiling with a new struct version caused
+ * things to no longer work correctly on old and new kernels.
+ *
+ * Hence, we've rolled back &struct rfkill_event to the original version
+ * and added &struct rfkill_event_ext. This effectively reverts to the
+ * old behaviour for all userspace, unless it explicitly opts in to the
+ * rules outlined here by using the new &struct rfkill_event_ext.
+ *
+ * Userspace using &struct rfkill_event_ext must adhere to the following
+ * rules
  *
- * One exception is the kernel -- we already have two event sizes in
- * that we've made the 'hard' member optional since our only option
- * is to ignore it anyway.
+ * 1. accept short writes, optionally using them to detect that it's
+ *    running on an older kernel;
+ * 2. accept short reads, knowing that this means it's running on an
+ *    older kernel;
+ * 3. treat reads that are as long as requested as acceptable, not
+ *    checking against RFKILL_EVENT_SIZE_V1 or such.
  */
-#define RFKILL_EVENT_SIZE_V1	8
+#define RFKILL_EVENT_SIZE_V1	sizeof(struct rfkill_event)
 
 /* ioctl for turning off rfkill-input (if present) */
 #define RFKILL_IOC_MAGIC	'R'
diff --git a/net/rfkill/core.c b/net/rfkill/core.c
index 68d6ef9e59fc..ac15a944573f 100644
--- a/net/rfkill/core.c
+++ b/net/rfkill/core.c
@@ -69,7 +69,7 @@ struct rfkill {
 
 struct rfkill_int_event {
 	struct list_head	list;
-	struct rfkill_event	ev;
+	struct rfkill_event_ext	ev;
 };
 
 struct rfkill_data {
@@ -253,7 +253,8 @@ static void rfkill_global_led_trigger_unregister(void)
 }
 #endif /* CONFIG_RFKILL_LEDS */
 
-static void rfkill_fill_event(struct rfkill_event *ev, struct rfkill *rfkill,
+static void rfkill_fill_event(struct rfkill_event_ext *ev,
+			      struct rfkill *rfkill,
 			      enum rfkill_operation op)
 {
 	unsigned long flags;
@@ -1237,7 +1238,7 @@ static ssize_t rfkill_fop_write(struct file *file, const char __user *buf,
 				size_t count, loff_t *pos)
 {
 	struct rfkill *rfkill;
-	struct rfkill_event ev;
+	struct rfkill_event_ext ev;
 	int ret;
 
 	/* we don't need the 'hard' variable but accept it */
-- 
cgit v1.2.3-71-gd317


From 0750cfd8b7fd491c9d7c8bd19d9ac380cd3c84ee Mon Sep 17 00:00:00 2001
From: James Prestwood <prestwoj@gmail.com>
Date: Thu, 11 Mar 2021 15:03:33 -0800
Subject: nl80211: better document CMD_ROAM behavior

The docs were very sparse with how exactly CMD_ROAM should be
used. Specifically related to BSS information normally obtained
through a user space scan.

Signed-off-by: James Prestwood <prestwoj@gmail.com>
Link: https://lore.kernel.org/r/20210311230333.103934-1-prestwoj@gmail.com
Signed-off-by: Johannes Berg <johannes.berg@intel.com>
---
 include/uapi/linux/nl80211.h | 3 +++
 1 file changed, 3 insertions(+)

(limited to 'include/uapi/linux')

diff --git a/include/uapi/linux/nl80211.h b/include/uapi/linux/nl80211.h
index ac78da99fccd..5e30c7f6c484 100644
--- a/include/uapi/linux/nl80211.h
+++ b/include/uapi/linux/nl80211.h
@@ -655,6 +655,9 @@
  *	When a security association was established on an 802.1X network using
  *	fast transition, this event should be followed by an
  *	%NL80211_CMD_PORT_AUTHORIZED event.
+ *	Following a %NL80211_CMD_ROAM event userspace can issue
+ *	%NL80211_CMD_GET_SCAN in order to obtain the scan information for the
+ *	new BSS the card/driver roamed to.
  * @NL80211_CMD_DISCONNECT: drop a given connection; also used to notify
  *	userspace that a connection was dropped by the AP or due to other
  *	reasons, for this the %NL80211_ATTR_DISCONNECTED_BY_AP and
-- 
cgit v1.2.3-71-gd317


From afd2daa26c7abd734d78bd274fc6c59a15e61063 Mon Sep 17 00:00:00 2001
From: Marcel Holtmann <marcel@holtmann.org>
Date: Tue, 6 Apr 2021 21:55:53 +0200
Subject: Bluetooth: Add support for virtio transport driver

This adds support for Bluetooth HCI transport over virtio.

Signed-off-by: Marcel Holtmann <marcel@holtmann.org>
Signed-off-by: Luiz Augusto von Dentz <luiz.von.dentz@intel.com>
---
 drivers/bluetooth/Kconfig       |  10 +
 drivers/bluetooth/Makefile      |   2 +
 drivers/bluetooth/virtio_bt.c   | 401 ++++++++++++++++++++++++++++++++++++++++
 include/uapi/linux/virtio_bt.h  |  31 ++++
 include/uapi/linux/virtio_ids.h |   1 +
 5 files changed, 445 insertions(+)
 create mode 100644 drivers/bluetooth/virtio_bt.c
 create mode 100644 include/uapi/linux/virtio_bt.h

(limited to 'include/uapi/linux')

diff --git a/drivers/bluetooth/Kconfig b/drivers/bluetooth/Kconfig
index 4e73a531b377..851842372c9b 100644
--- a/drivers/bluetooth/Kconfig
+++ b/drivers/bluetooth/Kconfig
@@ -425,4 +425,14 @@ config BT_HCIRSI
 	  Say Y here to compile support for HCI over Redpine into the
 	  kernel or say M to compile as a module.
 
+config BT_VIRTIO
+	tristate "Virtio Bluetooth driver"
+	depends on VIRTIO
+	help
+	  Virtio Bluetooth support driver.
+	  This driver supports Virtio Bluetooth devices.
+
+	  Say Y here to compile support for HCI over Virtio into the
+	  kernel or say M to compile as a module.
+
 endmenu
diff --git a/drivers/bluetooth/Makefile b/drivers/bluetooth/Makefile
index 1a58a3ae142c..16286ea2655d 100644
--- a/drivers/bluetooth/Makefile
+++ b/drivers/bluetooth/Makefile
@@ -26,6 +26,8 @@ obj-$(CONFIG_BT_BCM)		+= btbcm.o
 obj-$(CONFIG_BT_RTL)		+= btrtl.o
 obj-$(CONFIG_BT_QCA)		+= btqca.o
 
+obj-$(CONFIG_BT_VIRTIO)		+= virtio_bt.o
+
 obj-$(CONFIG_BT_HCIUART_NOKIA)	+= hci_nokia.o
 
 obj-$(CONFIG_BT_HCIRSI)		+= btrsi.o
diff --git a/drivers/bluetooth/virtio_bt.c b/drivers/bluetooth/virtio_bt.c
new file mode 100644
index 000000000000..c804db7e90f8
--- /dev/null
+++ b/drivers/bluetooth/virtio_bt.c
@@ -0,0 +1,401 @@
+// SPDX-License-Identifier: GPL-2.0-only
+
+#include <linux/module.h>
+#include <linux/virtio.h>
+#include <linux/virtio_config.h>
+#include <linux/skbuff.h>
+
+#include <uapi/linux/virtio_ids.h>
+#include <uapi/linux/virtio_bt.h>
+
+#include <net/bluetooth/bluetooth.h>
+#include <net/bluetooth/hci_core.h>
+
+#define VERSION "0.1"
+
+enum {
+	VIRTBT_VQ_TX,
+	VIRTBT_VQ_RX,
+	VIRTBT_NUM_VQS,
+};
+
+struct virtio_bluetooth {
+	struct virtio_device *vdev;
+	struct virtqueue *vqs[VIRTBT_NUM_VQS];
+	struct work_struct rx;
+	struct hci_dev *hdev;
+};
+
+static int virtbt_add_inbuf(struct virtio_bluetooth *vbt)
+{
+	struct virtqueue *vq = vbt->vqs[VIRTBT_VQ_RX];
+	struct scatterlist sg[1];
+	struct sk_buff *skb;
+	int err;
+
+	skb = alloc_skb(1000, GFP_KERNEL);
+	sg_init_one(sg, skb->data, 1000);
+
+	err = virtqueue_add_inbuf(vq, sg, 1, skb, GFP_KERNEL);
+	if (err < 0) {
+		kfree_skb(skb);
+		return err;
+	}
+
+	return 0;
+}
+
+static int virtbt_open(struct hci_dev *hdev)
+{
+	struct virtio_bluetooth *vbt = hci_get_drvdata(hdev);
+
+	if (virtbt_add_inbuf(vbt) < 0)
+		return -EIO;
+
+	virtqueue_kick(vbt->vqs[VIRTBT_VQ_RX]);
+	return 0;
+}
+
+static int virtbt_close(struct hci_dev *hdev)
+{
+	struct virtio_bluetooth *vbt = hci_get_drvdata(hdev);
+	int i;
+
+	cancel_work_sync(&vbt->rx);
+
+	for (i = 0; i < ARRAY_SIZE(vbt->vqs); i++) {
+		struct virtqueue *vq = vbt->vqs[i];
+		struct sk_buff *skb;
+
+		while ((skb = virtqueue_detach_unused_buf(vq)))
+			kfree_skb(skb);
+	}
+
+	return 0;
+}
+
+static int virtbt_flush(struct hci_dev *hdev)
+{
+	return 0;
+}
+
+static int virtbt_send_frame(struct hci_dev *hdev, struct sk_buff *skb)
+{
+	struct virtio_bluetooth *vbt = hci_get_drvdata(hdev);
+	struct scatterlist sg[1];
+	int err;
+
+	memcpy(skb_push(skb, 1), &hci_skb_pkt_type(skb), 1);
+
+	sg_init_one(sg, skb->data, skb->len);
+	err = virtqueue_add_outbuf(vbt->vqs[VIRTBT_VQ_TX], sg, 1, skb,
+				   GFP_KERNEL);
+	if (err) {
+		kfree_skb(skb);
+		return err;
+	}
+
+	virtqueue_kick(vbt->vqs[VIRTBT_VQ_TX]);
+	return 0;
+}
+
+static int virtbt_setup_zephyr(struct hci_dev *hdev)
+{
+	struct sk_buff *skb;
+
+	/* Read Build Information */
+	skb = __hci_cmd_sync(hdev, 0xfc08, 0, NULL, HCI_INIT_TIMEOUT);
+	if (IS_ERR(skb))
+		return PTR_ERR(skb);
+
+	bt_dev_info(hdev, "%s", (char *)(skb->data + 1));
+
+	hci_set_fw_info(hdev, "%s", skb->data + 1);
+
+	kfree_skb(skb);
+	return 0;
+}
+
+static int virtbt_set_bdaddr_zephyr(struct hci_dev *hdev,
+				    const bdaddr_t *bdaddr)
+{
+	struct sk_buff *skb;
+
+	/* Write BD_ADDR */
+	skb = __hci_cmd_sync(hdev, 0xfc06, 6, bdaddr, HCI_INIT_TIMEOUT);
+	if (IS_ERR(skb))
+		return PTR_ERR(skb);
+
+	kfree_skb(skb);
+	return 0;
+}
+
+static int virtbt_setup_intel(struct hci_dev *hdev)
+{
+	struct sk_buff *skb;
+
+	/* Intel Read Version */
+	skb = __hci_cmd_sync(hdev, 0xfc05, 0, NULL, HCI_CMD_TIMEOUT);
+	if (IS_ERR(skb))
+		return PTR_ERR(skb);
+
+	kfree_skb(skb);
+	return 0;
+}
+
+static int virtbt_set_bdaddr_intel(struct hci_dev *hdev, const bdaddr_t *bdaddr)
+{
+	struct sk_buff *skb;
+
+	/* Intel Write BD Address */
+	skb = __hci_cmd_sync(hdev, 0xfc31, 6, bdaddr, HCI_INIT_TIMEOUT);
+	if (IS_ERR(skb))
+		return PTR_ERR(skb);
+
+	kfree_skb(skb);
+	return 0;
+}
+
+static int virtbt_setup_realtek(struct hci_dev *hdev)
+{
+	struct sk_buff *skb;
+
+	/* Read ROM Version */
+	skb = __hci_cmd_sync(hdev, 0xfc6d, 0, NULL, HCI_INIT_TIMEOUT);
+	if (IS_ERR(skb))
+		return PTR_ERR(skb);
+
+	bt_dev_info(hdev, "ROM version %u", *((__u8 *) (skb->data + 1)));
+
+	kfree_skb(skb);
+	return 0;
+}
+
+static int virtbt_shutdown_generic(struct hci_dev *hdev)
+{
+	struct sk_buff *skb;
+
+	/* Reset */
+	skb = __hci_cmd_sync(hdev, HCI_OP_RESET, 0, NULL, HCI_INIT_TIMEOUT);
+	if (IS_ERR(skb))
+		return PTR_ERR(skb);
+
+	kfree_skb(skb);
+	return 0;
+}
+
+static void virtbt_rx_handle(struct virtio_bluetooth *vbt, struct sk_buff *skb)
+{
+	__u8 pkt_type;
+
+	pkt_type = *((__u8 *) skb->data);
+	skb_pull(skb, 1);
+
+	switch (pkt_type) {
+	case HCI_EVENT_PKT:
+	case HCI_ACLDATA_PKT:
+	case HCI_SCODATA_PKT:
+	case HCI_ISODATA_PKT:
+		hci_skb_pkt_type(skb) = pkt_type;
+		hci_recv_frame(vbt->hdev, skb);
+		break;
+	}
+}
+
+static void virtbt_rx_work(struct work_struct *work)
+{
+	struct virtio_bluetooth *vbt = container_of(work,
+						    struct virtio_bluetooth, rx);
+	struct sk_buff *skb;
+	unsigned int len;
+
+	skb = virtqueue_get_buf(vbt->vqs[VIRTBT_VQ_RX], &len);
+	if (!skb)
+		return;
+
+	skb->len = len;
+	virtbt_rx_handle(vbt, skb);
+
+	if (virtbt_add_inbuf(vbt) < 0)
+		return;
+
+	virtqueue_kick(vbt->vqs[VIRTBT_VQ_RX]);
+}
+
+static void virtbt_tx_done(struct virtqueue *vq)
+{
+	struct sk_buff *skb;
+	unsigned int len;
+
+	while ((skb = virtqueue_get_buf(vq, &len)))
+		kfree_skb(skb);
+}
+
+static void virtbt_rx_done(struct virtqueue *vq)
+{
+	struct virtio_bluetooth *vbt = vq->vdev->priv;
+
+	schedule_work(&vbt->rx);
+}
+
+static int virtbt_probe(struct virtio_device *vdev)
+{
+	vq_callback_t *callbacks[VIRTBT_NUM_VQS] = {
+		[VIRTBT_VQ_TX] = virtbt_tx_done,
+		[VIRTBT_VQ_RX] = virtbt_rx_done,
+	};
+	const char *names[VIRTBT_NUM_VQS] = {
+		[VIRTBT_VQ_TX] = "tx",
+		[VIRTBT_VQ_RX] = "rx",
+	};
+	struct virtio_bluetooth *vbt;
+	struct hci_dev *hdev;
+	int err;
+	__u8 type;
+
+	if (!virtio_has_feature(vdev, VIRTIO_F_VERSION_1))
+		return -ENODEV;
+
+	type = virtio_cread8(vdev, offsetof(struct virtio_bt_config, type));
+
+	switch (type) {
+	case VIRTIO_BT_CONFIG_TYPE_PRIMARY:
+	case VIRTIO_BT_CONFIG_TYPE_AMP:
+		break;
+	default:
+		return -EINVAL;
+	}
+
+	vbt = kzalloc(sizeof(*vbt), GFP_KERNEL);
+	if (!vbt)
+		return -ENOMEM;
+
+	vdev->priv = vbt;
+	vbt->vdev = vdev;
+
+	INIT_WORK(&vbt->rx, virtbt_rx_work);
+
+	err = virtio_find_vqs(vdev, VIRTBT_NUM_VQS, vbt->vqs, callbacks,
+			      names, NULL);
+	if (err)
+		return err;
+
+	hdev = hci_alloc_dev();
+	if (!hdev) {
+		err = -ENOMEM;
+		goto failed;
+	}
+
+	vbt->hdev = hdev;
+
+	hdev->bus = HCI_VIRTIO;
+	hdev->dev_type = type;
+	hci_set_drvdata(hdev, vbt);
+
+	hdev->open  = virtbt_open;
+	hdev->close = virtbt_close;
+	hdev->flush = virtbt_flush;
+	hdev->send  = virtbt_send_frame;
+
+	if (virtio_has_feature(vdev, VIRTIO_BT_F_VND_HCI)) {
+		__u16 vendor;
+
+		virtio_cread(vdev, struct virtio_bt_config, vendor, &vendor);
+
+		switch (vendor) {
+		case VIRTIO_BT_CONFIG_VENDOR_ZEPHYR:
+			hdev->manufacturer = 1521;
+			hdev->setup = virtbt_setup_zephyr;
+			hdev->shutdown = virtbt_shutdown_generic;
+			hdev->set_bdaddr = virtbt_set_bdaddr_zephyr;
+			break;
+
+		case VIRTIO_BT_CONFIG_VENDOR_INTEL:
+			hdev->manufacturer = 2;
+			hdev->setup = virtbt_setup_intel;
+			hdev->shutdown = virtbt_shutdown_generic;
+			hdev->set_bdaddr = virtbt_set_bdaddr_intel;
+			set_bit(HCI_QUIRK_STRICT_DUPLICATE_FILTER, &hdev->quirks);
+			set_bit(HCI_QUIRK_SIMULTANEOUS_DISCOVERY, &hdev->quirks);
+			set_bit(HCI_QUIRK_WIDEBAND_SPEECH_SUPPORTED, &hdev->quirks);
+			break;
+
+		case VIRTIO_BT_CONFIG_VENDOR_REALTEK:
+			hdev->manufacturer = 93;
+			hdev->setup = virtbt_setup_realtek;
+			hdev->shutdown = virtbt_shutdown_generic;
+			set_bit(HCI_QUIRK_SIMULTANEOUS_DISCOVERY, &hdev->quirks);
+			set_bit(HCI_QUIRK_WIDEBAND_SPEECH_SUPPORTED, &hdev->quirks);
+			break;
+		}
+	}
+
+	if (virtio_has_feature(vdev, VIRTIO_BT_F_MSFT_EXT)) {
+		__u16 msft_opcode;
+
+		virtio_cread(vdev, struct virtio_bt_config,
+			     msft_opcode, &msft_opcode);
+
+		hci_set_msft_opcode(hdev, msft_opcode);
+	}
+
+	if (virtio_has_feature(vdev, VIRTIO_BT_F_AOSP_EXT))
+		hci_set_aosp_capable(hdev);
+
+	if (hci_register_dev(hdev) < 0) {
+		hci_free_dev(hdev);
+		err = -EBUSY;
+		goto failed;
+	}
+
+	return 0;
+
+failed:
+	vdev->config->del_vqs(vdev);
+	return err;
+}
+
+static void virtbt_remove(struct virtio_device *vdev)
+{
+	struct virtio_bluetooth *vbt = vdev->priv;
+	struct hci_dev *hdev = vbt->hdev;
+
+	hci_unregister_dev(hdev);
+	vdev->config->reset(vdev);
+
+	hci_free_dev(hdev);
+	vbt->hdev = NULL;
+
+	vdev->config->del_vqs(vdev);
+	kfree(vbt);
+}
+
+static struct virtio_device_id virtbt_table[] = {
+	{ VIRTIO_ID_BT, VIRTIO_DEV_ANY_ID },
+	{ 0 },
+};
+
+MODULE_DEVICE_TABLE(virtio, virtbt_table);
+
+static const unsigned int virtbt_features[] = {
+	VIRTIO_BT_F_VND_HCI,
+	VIRTIO_BT_F_MSFT_EXT,
+	VIRTIO_BT_F_AOSP_EXT,
+};
+
+static struct virtio_driver virtbt_driver = {
+	.driver.name         = KBUILD_MODNAME,
+	.driver.owner        = THIS_MODULE,
+	.feature_table       = virtbt_features,
+	.feature_table_size  = ARRAY_SIZE(virtbt_features),
+	.id_table            = virtbt_table,
+	.probe               = virtbt_probe,
+	.remove              = virtbt_remove,
+};
+
+module_virtio_driver(virtbt_driver);
+
+MODULE_AUTHOR("Marcel Holtmann <marcel@holtmann.org>");
+MODULE_DESCRIPTION("Generic Bluetooth VIRTIO driver ver " VERSION);
+MODULE_VERSION(VERSION);
+MODULE_LICENSE("GPL");
diff --git a/include/uapi/linux/virtio_bt.h b/include/uapi/linux/virtio_bt.h
new file mode 100644
index 000000000000..a7bd48daa9a9
--- /dev/null
+++ b/include/uapi/linux/virtio_bt.h
@@ -0,0 +1,31 @@
+/* SPDX-License-Identifier: BSD-3-Clause */
+
+#ifndef _UAPI_LINUX_VIRTIO_BT_H
+#define _UAPI_LINUX_VIRTIO_BT_H
+
+#include <linux/virtio_types.h>
+
+/* Feature bits */
+#define VIRTIO_BT_F_VND_HCI	0	/* Indicates vendor command support */
+#define VIRTIO_BT_F_MSFT_EXT	1	/* Indicates MSFT vendor support */
+#define VIRTIO_BT_F_AOSP_EXT	2	/* Indicates AOSP vendor support */
+
+enum virtio_bt_config_type {
+	VIRTIO_BT_CONFIG_TYPE_PRIMARY	= 0,
+	VIRTIO_BT_CONFIG_TYPE_AMP	= 1,
+};
+
+enum virtio_bt_config_vendor {
+	VIRTIO_BT_CONFIG_VENDOR_NONE	= 0,
+	VIRTIO_BT_CONFIG_VENDOR_ZEPHYR	= 1,
+	VIRTIO_BT_CONFIG_VENDOR_INTEL	= 2,
+	VIRTIO_BT_CONFIG_VENDOR_REALTEK	= 3,
+};
+
+struct virtio_bt_config {
+	__u8  type;
+	__u16 vendor;
+	__u16 msft_opcode;
+} __attribute__((packed));
+
+#endif /* _UAPI_LINUX_VIRTIO_BT_H */
diff --git a/include/uapi/linux/virtio_ids.h b/include/uapi/linux/virtio_ids.h
index bc1c0621f5ed..b4f468e9441d 100644
--- a/include/uapi/linux/virtio_ids.h
+++ b/include/uapi/linux/virtio_ids.h
@@ -53,6 +53,7 @@
 #define VIRTIO_ID_MEM			24 /* virtio mem */
 #define VIRTIO_ID_FS			26 /* virtio filesystem */
 #define VIRTIO_ID_PMEM			27 /* virtio pmem */
+#define VIRTIO_ID_BT			28 /* virtio bluetooth */
 #define VIRTIO_ID_MAC80211_HWSIM	29 /* virtio mac80211-hwsim */
 
 #endif /* _LINUX_VIRTIO_IDS_H */
-- 
cgit v1.2.3-71-gd317


From d84d13d6f6e03a7aaac9e7d064a11cf6f5087da6 Mon Sep 17 00:00:00 2001
From: Vamsi Krishna <vamsin@codeaurora.org>
Date: Tue, 2 Mar 2021 20:20:36 +0530
Subject: nl80211: Add interface to indicate TDLS peer's HE capability

Enhance enum nl80211_tdls_peer_capability to configure TDLS peer's
support for HE mode. Userspace decodes the TDLS setup response frame
and confugures the HE mode support to driver if the peer has advertized
HE mode support in TDLS setup response frame. The driver uses this
information to decide whether to include HE operation IE in TDLS setup
confirmation frame.

Signed-off-by: Vamsi Krishna <vamsin@codeaurora.org>
Link: https://lore.kernel.org/r/1614696636-30144-1-git-send-email-vamsin@codeaurora.org
Signed-off-by: Johannes Berg <johannes.berg@intel.com>
---
 include/uapi/linux/nl80211.h | 2 ++
 1 file changed, 2 insertions(+)

(limited to 'include/uapi/linux')

diff --git a/include/uapi/linux/nl80211.h b/include/uapi/linux/nl80211.h
index 5e30c7f6c484..18dfe744bcb5 100644
--- a/include/uapi/linux/nl80211.h
+++ b/include/uapi/linux/nl80211.h
@@ -6298,11 +6298,13 @@ struct nl80211_vendor_cmd_info {
  * @NL80211_TDLS_PEER_HT: TDLS peer is HT capable.
  * @NL80211_TDLS_PEER_VHT: TDLS peer is VHT capable.
  * @NL80211_TDLS_PEER_WMM: TDLS peer is WMM capable.
+ * @NL80211_TDLS_PEER_HE: TDLS peer is HE capable.
  */
 enum nl80211_tdls_peer_capability {
 	NL80211_TDLS_PEER_HT = 1<<0,
 	NL80211_TDLS_PEER_VHT = 1<<1,
 	NL80211_TDLS_PEER_WMM = 1<<2,
+	NL80211_TDLS_PEER_HE = 1<<3,
 };
 
 /**
-- 
cgit v1.2.3-71-gd317


From 53f111cbfac6f2000c604994ddf01d3299d7de6b Mon Sep 17 00:00:00 2001
From: Marek Behún <kabel@kernel.org>
Date: Wed, 7 Apr 2021 22:22:51 +0200
Subject: net: phy: add constants for 2.5G and 5G speed in PCS speed register
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

Add constants for 2.5G and 5G speed in PCS speed register into mdio.h.

Signed-off-by: Marek Behún <kabel@kernel.org>
Reviewed-by: Andrew Lunn <andrew@lunn.ch>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 include/uapi/linux/mdio.h | 2 ++
 1 file changed, 2 insertions(+)

(limited to 'include/uapi/linux')

diff --git a/include/uapi/linux/mdio.h b/include/uapi/linux/mdio.h
index 3f302e2523b2..bdf77dffa5a4 100644
--- a/include/uapi/linux/mdio.h
+++ b/include/uapi/linux/mdio.h
@@ -120,6 +120,8 @@
 #define MDIO_PMA_SPEED_100		0x0020	/* 100M capable */
 #define MDIO_PMA_SPEED_10		0x0040	/* 10M capable */
 #define MDIO_PCS_SPEED_10P2B		0x0002	/* 10PASS-TS/2BASE-TL capable */
+#define MDIO_PCS_SPEED_2_5G		0x0040	/* 2.5G capable */
+#define MDIO_PCS_SPEED_5G		0x0080	/* 5G capable */
 
 /* Device present registers. */
 #define MDIO_DEVS_PRESENT(devad)	(1 << (devad))
-- 
cgit v1.2.3-71-gd317


From d09845e98a05850a8094ea8fd6dd09a8e6824fff Mon Sep 17 00:00:00 2001
From: Johan Hovold <johan@kernel.org>
Date: Wed, 7 Apr 2021 11:52:01 +0200
Subject: tty: actually undefine superseded ASYNC flags

Some kernel-internal ASYNC flags have been superseded by tty-port flags
and should no longer be used by kernel drivers.

Fix the misspelled "__KERNEL__" compile guards which failed their sole
purpose to break out-of-tree drivers that have not yet been updated.

Fixes: 5c0517fefc92 ("tty: core: Undefine ASYNC_* flags superceded by TTY_PORT* flags")
Signed-off-by: Johan Hovold <johan@kernel.org>
Link: https://lore.kernel.org/r/20210407095208.31838-2-johan@kernel.org
Signed-off-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
---
 include/uapi/linux/tty_flags.h | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

(limited to 'include/uapi/linux')

diff --git a/include/uapi/linux/tty_flags.h b/include/uapi/linux/tty_flags.h
index 900a32e63424..6a3ac496a56c 100644
--- a/include/uapi/linux/tty_flags.h
+++ b/include/uapi/linux/tty_flags.h
@@ -39,7 +39,7 @@
  * WARNING: These flags are no longer used and have been superceded by the
  *	    TTY_PORT_ flags in the iflags field (and not userspace-visible)
  */
-#ifndef _KERNEL_
+#ifndef __KERNEL__
 #define ASYNCB_INITIALIZED	31 /* Serial port was initialized */
 #define ASYNCB_SUSPENDED	30 /* Serial port is suspended */
 #define ASYNCB_NORMAL_ACTIVE	29 /* Normal device is active */
@@ -81,7 +81,7 @@
 #define ASYNC_SPD_WARP		(ASYNC_SPD_HI|ASYNC_SPD_SHI)
 #define ASYNC_SPD_MASK		(ASYNC_SPD_HI|ASYNC_SPD_VHI|ASYNC_SPD_SHI)
 
-#ifndef _KERNEL_
+#ifndef __KERNEL__
 /* These flags are no longer used (and were always masked from userspace) */
 #define ASYNC_INITIALIZED	(1U << ASYNCB_INITIALIZED)
 #define ASYNC_NORMAL_ACTIVE	(1U << ASYNCB_NORMAL_ACTIVE)
-- 
cgit v1.2.3-71-gd317


From 5a35b040d0567f9dce6e801e6e3b575b9c463028 Mon Sep 17 00:00:00 2001
From: Johan Hovold <johan@kernel.org>
Date: Wed, 7 Apr 2021 11:52:06 +0200
Subject: tty: add ASYNC_SPLIT_TERMIOS to deprecation mask

Callout devices are long-gone, but the ASYNC_SPLIT_TERMIOS flag was
never added to the deprecation mask.

Add it so that a warning is printed if it is ever used.

Fixes: 8a8ae62f8296 ("tty: warn on deprecated serial flags")
Signed-off-by: Johan Hovold <johan@kernel.org>
Link: https://lore.kernel.org/r/20210407095208.31838-7-johan@kernel.org
Signed-off-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
---
 include/uapi/linux/tty_flags.h | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

(limited to 'include/uapi/linux')

diff --git a/include/uapi/linux/tty_flags.h b/include/uapi/linux/tty_flags.h
index 6a3ac496a56c..cf25056d4b27 100644
--- a/include/uapi/linux/tty_flags.h
+++ b/include/uapi/linux/tty_flags.h
@@ -73,8 +73,8 @@
 #define ASYNC_MAGIC_MULTIPLIER	(1U << ASYNCB_MAGIC_MULTIPLIER)
 
 #define ASYNC_FLAGS		((1U << (ASYNCB_LAST_USER + 1)) - 1)
-#define ASYNC_DEPRECATED	(ASYNC_SESSION_LOCKOUT | ASYNC_PGRP_LOCKOUT | \
-		ASYNC_CALLOUT_NOHUP | ASYNC_AUTOPROBE)
+#define ASYNC_DEPRECATED	(ASYNC_SPLIT_TERMIOS | ASYNC_SESSION_LOCKOUT | \
+		ASYNC_PGRP_LOCKOUT | ASYNC_CALLOUT_NOHUP | ASYNC_AUTOPROBE)
 #define ASYNC_USR_MASK		(ASYNC_SPD_MASK|ASYNC_CALLOUT_NOHUP| \
 		ASYNC_LOW_LATENCY)
 #define ASYNC_SPD_CUST		(ASYNC_SPD_HI|ASYNC_SPD_VHI)
-- 
cgit v1.2.3-71-gd317


From a7dc1e6f99df59799ab0128d9c4e47bbeceb934d Mon Sep 17 00:00:00 2001
From: Hang Lu <hangl@codeaurora.org>
Date: Fri, 9 Apr 2021 17:40:46 +0800
Subject: binder: tell userspace to dump current backtrace when detected oneway
 spamming

When async binder buffer got exhausted, some normal oneway transactions
will also be discarded and may cause system or application failures. By
that time, the binder debug information we dump may not be relevant to
the root cause. And this issue is difficult to debug if without the
backtrace of the thread sending spam.

This change will send BR_ONEWAY_SPAM_SUSPECT to userspace when oneway
spamming is detected, request to dump current backtrace. Oneway spamming
will be reported only once when exceeding the threshold (target process
dips below 80% of its oneway space, and current process is responsible for
either more than 50 transactions, or more than 50% of the oneway space).
And the detection will restart when the async buffer has returned to a
healthy state.

Acked-by: Todd Kjos <tkjos@google.com>
Signed-off-by: Hang Lu <hangl@codeaurora.org>
Link: https://lore.kernel.org/r/1617961246-4502-3-git-send-email-hangl@codeaurora.org
Signed-off-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
---
 drivers/android/binder.c            | 27 ++++++++++++++++++++++++---
 drivers/android/binder_alloc.c      | 15 ++++++++++++---
 drivers/android/binder_alloc.h      |  8 +++++++-
 drivers/android/binder_internal.h   |  6 +++++-
 include/uapi/linux/android/binder.h |  8 ++++++++
 5 files changed, 56 insertions(+), 8 deletions(-)

(limited to 'include/uapi/linux')

diff --git a/drivers/android/binder.c b/drivers/android/binder.c
index be34da3dd077..63d2c4339689 100644
--- a/drivers/android/binder.c
+++ b/drivers/android/binder.c
@@ -3020,7 +3020,10 @@ static void binder_transaction(struct binder_proc *proc,
 			goto err_bad_object_type;
 		}
 	}
-	tcomplete->type = BINDER_WORK_TRANSACTION_COMPLETE;
+	if (t->buffer->oneway_spam_suspect)
+		tcomplete->type = BINDER_WORK_TRANSACTION_ONEWAY_SPAM_SUSPECT;
+	else
+		tcomplete->type = BINDER_WORK_TRANSACTION_COMPLETE;
 	t->work.type = BINDER_WORK_TRANSACTION;
 
 	if (reply) {
@@ -3893,9 +3896,14 @@ retry:
 
 			binder_stat_br(proc, thread, cmd);
 		} break;
-		case BINDER_WORK_TRANSACTION_COMPLETE: {
+		case BINDER_WORK_TRANSACTION_COMPLETE:
+		case BINDER_WORK_TRANSACTION_ONEWAY_SPAM_SUSPECT: {
+			if (proc->oneway_spam_detection_enabled &&
+				   w->type == BINDER_WORK_TRANSACTION_ONEWAY_SPAM_SUSPECT)
+				cmd = BR_ONEWAY_SPAM_SUSPECT;
+			else
+				cmd = BR_TRANSACTION_COMPLETE;
 			binder_inner_proc_unlock(proc);
-			cmd = BR_TRANSACTION_COMPLETE;
 			kfree(w);
 			binder_stats_deleted(BINDER_STAT_TRANSACTION_COMPLETE);
 			if (put_user(cmd, (uint32_t __user *)ptr))
@@ -4897,6 +4905,18 @@ static long binder_ioctl(struct file *filp, unsigned int cmd, unsigned long arg)
 		}
 		break;
 	}
+	case BINDER_ENABLE_ONEWAY_SPAM_DETECTION: {
+		uint32_t enable;
+
+		if (copy_from_user(&enable, ubuf, sizeof(enable))) {
+			ret = -EINVAL;
+			goto err;
+		}
+		binder_inner_proc_lock(proc);
+		proc->oneway_spam_detection_enabled = (bool)enable;
+		binder_inner_proc_unlock(proc);
+		break;
+	}
 	default:
 		ret = -EINVAL;
 		goto err;
@@ -5561,6 +5581,7 @@ static const char * const binder_return_strings[] = {
 	"BR_CLEAR_DEATH_NOTIFICATION_DONE",
 	"BR_FAILED_REPLY",
 	"BR_FROZEN_REPLY",
+	"BR_ONEWAY_SPAM_SUSPECT",
 };
 
 static const char * const binder_command_strings[] = {
diff --git a/drivers/android/binder_alloc.c b/drivers/android/binder_alloc.c
index 7caf74ad2405..340515f54498 100644
--- a/drivers/android/binder_alloc.c
+++ b/drivers/android/binder_alloc.c
@@ -338,7 +338,7 @@ static inline struct vm_area_struct *binder_alloc_get_vma(
 	return vma;
 }
 
-static void debug_low_async_space_locked(struct binder_alloc *alloc, int pid)
+static bool debug_low_async_space_locked(struct binder_alloc *alloc, int pid)
 {
 	/*
 	 * Find the amount and size of buffers allocated by the current caller;
@@ -366,13 +366,19 @@ static void debug_low_async_space_locked(struct binder_alloc *alloc, int pid)
 
 	/*
 	 * Warn if this pid has more than 50 transactions, or more than 50% of
-	 * async space (which is 25% of total buffer size).
+	 * async space (which is 25% of total buffer size). Oneway spam is only
+	 * detected when the threshold is exceeded.
 	 */
 	if (num_buffers > 50 || total_alloc_size > alloc->buffer_size / 4) {
 		binder_alloc_debug(BINDER_DEBUG_USER_ERROR,
 			     "%d: pid %d spamming oneway? %zd buffers allocated for a total size of %zd\n",
 			      alloc->pid, pid, num_buffers, total_alloc_size);
+		if (!alloc->oneway_spam_detected) {
+			alloc->oneway_spam_detected = true;
+			return true;
+		}
 	}
+	return false;
 }
 
 static struct binder_buffer *binder_alloc_new_buf_locked(
@@ -525,6 +531,7 @@ static struct binder_buffer *binder_alloc_new_buf_locked(
 	buffer->async_transaction = is_async;
 	buffer->extra_buffers_size = extra_buffers_size;
 	buffer->pid = pid;
+	buffer->oneway_spam_suspect = false;
 	if (is_async) {
 		alloc->free_async_space -= size + sizeof(struct binder_buffer);
 		binder_alloc_debug(BINDER_DEBUG_BUFFER_ALLOC_ASYNC,
@@ -536,7 +543,9 @@ static struct binder_buffer *binder_alloc_new_buf_locked(
 			 * of async space left (which is less than 10% of total
 			 * buffer size).
 			 */
-			debug_low_async_space_locked(alloc, pid);
+			buffer->oneway_spam_suspect = debug_low_async_space_locked(alloc, pid);
+		} else {
+			alloc->oneway_spam_detected = false;
 		}
 	}
 	return buffer;
diff --git a/drivers/android/binder_alloc.h b/drivers/android/binder_alloc.h
index 6e8e001381af..7dea57a84c79 100644
--- a/drivers/android/binder_alloc.h
+++ b/drivers/android/binder_alloc.h
@@ -26,6 +26,8 @@ struct binder_transaction;
  * @clear_on_free:      %true if buffer must be zeroed after use
  * @allow_user_free:    %true if user is allowed to free buffer
  * @async_transaction:  %true if buffer is in use for an async txn
+ * @oneway_spam_suspect: %true if total async allocate size just exceed
+ * spamming detect threshold
  * @debug_id:           unique ID for debugging
  * @transaction:        pointer to associated struct binder_transaction
  * @target_node:        struct binder_node associated with this buffer
@@ -45,7 +47,8 @@ struct binder_buffer {
 	unsigned clear_on_free:1;
 	unsigned allow_user_free:1;
 	unsigned async_transaction:1;
-	unsigned debug_id:28;
+	unsigned oneway_spam_suspect:1;
+	unsigned debug_id:27;
 
 	struct binder_transaction *transaction;
 
@@ -87,6 +90,8 @@ struct binder_lru_page {
  * @buffer_size:        size of address space specified via mmap
  * @pid:                pid for associated binder_proc (invariant after init)
  * @pages_high:         high watermark of offset in @pages
+ * @oneway_spam_detected: %true if oneway spam detection fired, clear that
+ * flag once the async buffer has returned to a healthy state
  *
  * Bookkeeping structure for per-proc address space management for binder
  * buffers. It is normally initialized during binder_init() and binder_mmap()
@@ -107,6 +112,7 @@ struct binder_alloc {
 	uint32_t buffer_free;
 	int pid;
 	size_t pages_high;
+	bool oneway_spam_detected;
 };
 
 #ifdef CONFIG_ANDROID_BINDER_IPC_SELFTEST
diff --git a/drivers/android/binder_internal.h b/drivers/android/binder_internal.h
index a50716696fad..810c0b84d3f8 100644
--- a/drivers/android/binder_internal.h
+++ b/drivers/android/binder_internal.h
@@ -155,7 +155,7 @@ enum binder_stat_types {
 };
 
 struct binder_stats {
-	atomic_t br[_IOC_NR(BR_FROZEN_REPLY) + 1];
+	atomic_t br[_IOC_NR(BR_ONEWAY_SPAM_SUSPECT) + 1];
 	atomic_t bc[_IOC_NR(BC_REPLY_SG) + 1];
 	atomic_t obj_created[BINDER_STAT_COUNT];
 	atomic_t obj_deleted[BINDER_STAT_COUNT];
@@ -174,6 +174,7 @@ struct binder_work {
 	enum binder_work_type {
 		BINDER_WORK_TRANSACTION = 1,
 		BINDER_WORK_TRANSACTION_COMPLETE,
+		BINDER_WORK_TRANSACTION_ONEWAY_SPAM_SUSPECT,
 		BINDER_WORK_RETURN_ERROR,
 		BINDER_WORK_NODE,
 		BINDER_WORK_DEAD_BINDER,
@@ -409,6 +410,8 @@ struct binder_ref {
  * @outer_lock:           no nesting under innor or node lock
  *                        Lock order: 1) outer, 2) node, 3) inner
  * @binderfs_entry:       process-specific binderfs log file
+ * @oneway_spam_detection_enabled: process enabled oneway spam detection
+ *                        or not
  *
  * Bookkeeping structure for binder processes
  */
@@ -444,6 +447,7 @@ struct binder_proc {
 	spinlock_t inner_lock;
 	spinlock_t outer_lock;
 	struct dentry *binderfs_entry;
+	bool oneway_spam_detection_enabled;
 };
 
 /**
diff --git a/include/uapi/linux/android/binder.h b/include/uapi/linux/android/binder.h
index 156070d18c4f..20e435fe657a 100644
--- a/include/uapi/linux/android/binder.h
+++ b/include/uapi/linux/android/binder.h
@@ -241,6 +241,7 @@ struct binder_frozen_status_info {
 #define BINDER_SET_CONTEXT_MGR_EXT	_IOW('b', 13, struct flat_binder_object)
 #define BINDER_FREEZE			_IOW('b', 14, struct binder_freeze_info)
 #define BINDER_GET_FROZEN_INFO		_IOWR('b', 15, struct binder_frozen_status_info)
+#define BINDER_ENABLE_ONEWAY_SPAM_DETECTION	_IOW('b', 16, __u32)
 
 /*
  * NOTE: Two special error codes you should check for when calling
@@ -428,6 +429,13 @@ enum binder_driver_return_protocol {
 	 * The target of the last transaction (either a bcTRANSACTION or
 	 * a bcATTEMPT_ACQUIRE) is frozen.  No parameters.
 	 */
+
+	BR_ONEWAY_SPAM_SUSPECT = _IO('r', 19),
+	/*
+	 * Current process sent too many oneway calls to target, and the last
+	 * asynchronous transaction makes the allocated async buffer size exceed
+	 * detection threshold.  No parameters.
+	 */
 };
 
 enum binder_driver_command_protocol {
-- 
cgit v1.2.3-71-gd317


From c781ff12a2f37a9795e13bf328e5053d3e69f9e0 Mon Sep 17 00:00:00 2001
From: Vladyslav Tarasiuk <vladyslavt@nvidia.com>
Date: Fri, 9 Apr 2021 11:06:34 +0300
Subject: ethtool: Allow network drivers to dump arbitrary EEPROM data

Define get_module_eeprom_by_page() ethtool callback and implement
netlink infrastructure.

get_module_eeprom_by_page() allows network drivers to dump a part of
module's EEPROM specified by page and bank numbers along with offset and
length. It is effectively a netlink replacement for get_module_info()
and get_module_eeprom() pair, which is needed due to emergence of
complex non-linear EEPROM layouts.

Signed-off-by: Vladyslav Tarasiuk <vladyslavt@nvidia.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 Documentation/networking/ethtool-netlink.rst |  36 +++++-
 include/linux/ethtool.h                      |  33 +++++-
 include/uapi/linux/ethtool_netlink.h         |  19 +++
 net/ethtool/Makefile                         |   2 +-
 net/ethtool/eeprom.c                         | 171 +++++++++++++++++++++++++++
 net/ethtool/netlink.c                        |  11 ++
 net/ethtool/netlink.h                        |   2 +
 7 files changed, 270 insertions(+), 4 deletions(-)
 create mode 100644 net/ethtool/eeprom.c

(limited to 'include/uapi/linux')

diff --git a/Documentation/networking/ethtool-netlink.rst b/Documentation/networking/ethtool-netlink.rst
index ce4a69f8308f..bbecffc7b11a 100644
--- a/Documentation/networking/ethtool-netlink.rst
+++ b/Documentation/networking/ethtool-netlink.rst
@@ -1338,6 +1338,38 @@ in an implementation specific way.
 ``ETHTOOL_A_FEC_AUTO`` requests the driver to choose FEC mode based on SFP
 module parameters. This does not mean autonegotiation.
 
+MODULE_EEPROM
+=============
+
+Fetch module EEPROM data dump.
+This interface is designed to allow dumps of at most 1/2 page at once. This
+means only dumps of 128 (or less) bytes are allowed, without crossing half page
+boundary located at offset 128. For pages other than 0 only high 128 bytes are
+accessible.
+
+Request contents:
+
+  =======================================  ======  ==========================
+  ``ETHTOOL_A_MODULE_EEPROM_HEADER``       nested  request header
+  ``ETHTOOL_A_MODULE_EEPROM_OFFSET``       u32     offset within a page
+  ``ETHTOOL_A_MODULE_EEPROM_LENGTH``       u32     amount of bytes to read
+  ``ETHTOOL_A_MODULE_EEPROM_PAGE``         u8      page number
+  ``ETHTOOL_A_MODULE_EEPROM_BANK``         u8      bank number
+  ``ETHTOOL_A_MODULE_EEPROM_I2C_ADDRESS``  u8      page I2C address
+  =======================================  ======  ==========================
+
+Kernel response contents:
+
+ +---------------------------------------------+--------+---------------------+
+ | ``ETHTOOL_A_MODULE_EEPROM_HEADER``          | nested | reply header        |
+ +---------------------------------------------+--------+---------------------+
+ | ``ETHTOOL_A_MODULE_EEPROM_DATA``            | nested | array of bytes from |
+ |                                             |        | module EEPROM       |
+ +---------------------------------------------+--------+---------------------+
+
+``ETHTOOL_A_MODULE_EEPROM_DATA`` has an attribute length equal to the amount of
+bytes driver actually read.
+
 Request translation
 ===================
 
@@ -1415,8 +1447,8 @@ are netlink only.
   ``ETHTOOL_GET_DUMP_FLAG``           n/a
   ``ETHTOOL_GET_DUMP_DATA``           n/a
   ``ETHTOOL_GET_TS_INFO``             ``ETHTOOL_MSG_TSINFO_GET``
-  ``ETHTOOL_GMODULEINFO``             n/a
-  ``ETHTOOL_GMODULEEEPROM``           n/a
+  ``ETHTOOL_GMODULEINFO``             ``ETHTOOL_MSG_MODULE_EEPROM_GET``
+  ``ETHTOOL_GMODULEEEPROM``           ``ETHTOOL_MSG_MODULE_EEPROM_GET``
   ``ETHTOOL_GEEE``                    ``ETHTOOL_MSG_EEE_GET``
   ``ETHTOOL_SEEE``                    ``ETHTOOL_MSG_EEE_SET``
   ``ETHTOOL_GRSSH``                   n/a
diff --git a/include/linux/ethtool.h b/include/linux/ethtool.h
index 4290e2fa3117..9f6f323af59a 100644
--- a/include/linux/ethtool.h
+++ b/include/linux/ethtool.h
@@ -81,6 +81,7 @@ enum {
 #define ETH_RSS_HASH_NO_CHANGE	0
 
 struct net_device;
+struct netlink_ext_ack;
 
 /* Some generic methods drivers may use in their ethtool_ops */
 u32 ethtool_op_get_link(struct net_device *dev);
@@ -262,6 +263,31 @@ struct ethtool_pause_stats {
 	u64 rx_pause_frames;
 };
 
+#define ETH_MODULE_EEPROM_PAGE_LEN	128
+#define ETH_MODULE_MAX_I2C_ADDRESS	0x7f
+
+/**
+ * struct ethtool_module_eeprom - EEPROM dump from specified page
+ * @offset: Offset within the specified EEPROM page to begin read, in bytes.
+ * @length: Number of bytes to read.
+ * @page: Page number to read from.
+ * @bank: Page bank number to read from, if applicable by EEPROM spec.
+ * @i2c_address: I2C address of a page. Value less than 0x7f expected. Most
+ *	EEPROMs use 0x50 or 0x51.
+ * @data: Pointer to buffer with EEPROM data of @length size.
+ *
+ * This can be used to manage pages during EEPROM dump in ethtool and pass
+ * required information to the driver.
+ */
+struct ethtool_module_eeprom {
+	__u32	offset;
+	__u32	length;
+	__u8	page;
+	__u8	bank;
+	__u8	i2c_address;
+	__u8	*data;
+};
+
 /**
  * struct ethtool_ops - optional netdev operations
  * @cap_link_lanes_supported: indicates if the driver supports lanes
@@ -414,6 +440,9 @@ struct ethtool_pause_stats {
  *	cannot use the standard PHY library helpers.
  * @get_phy_tunable: Read the value of a PHY tunable.
  * @set_phy_tunable: Set the value of a PHY tunable.
+ * @get_module_eeprom_by_page: Get a region of plug-in module EEPROM data from
+ *	specified page. Returns a negative error code or the amount of bytes
+ *	read.
  *
  * All operations are optional (i.e. the function pointer may be set
  * to %NULL) and callers must take this into account.  Callers must
@@ -519,6 +548,9 @@ struct ethtool_ops {
 				   const struct ethtool_tunable *, void *);
 	int	(*set_phy_tunable)(struct net_device *,
 				   const struct ethtool_tunable *, const void *);
+	int	(*get_module_eeprom_by_page)(struct net_device *dev,
+					     const struct ethtool_module_eeprom *page,
+					     struct netlink_ext_ack *extack);
 };
 
 int ethtool_check_ops(const struct ethtool_ops *ops);
@@ -542,7 +574,6 @@ int ethtool_virtdev_set_link_ksettings(struct net_device *dev,
 				       const struct ethtool_link_ksettings *cmd,
 				       u32 *dev_speed, u8 *dev_duplex);
 
-struct netlink_ext_ack;
 struct phy_device;
 struct phy_tdr_config;
 
diff --git a/include/uapi/linux/ethtool_netlink.h b/include/uapi/linux/ethtool_netlink.h
index 7f1bdb5b31ba..9612dcd48a6a 100644
--- a/include/uapi/linux/ethtool_netlink.h
+++ b/include/uapi/linux/ethtool_netlink.h
@@ -44,6 +44,7 @@ enum {
 	ETHTOOL_MSG_TUNNEL_INFO_GET,
 	ETHTOOL_MSG_FEC_GET,
 	ETHTOOL_MSG_FEC_SET,
+	ETHTOOL_MSG_MODULE_EEPROM_GET,
 
 	/* add new constants above here */
 	__ETHTOOL_MSG_USER_CNT,
@@ -84,6 +85,7 @@ enum {
 	ETHTOOL_MSG_TUNNEL_INFO_GET_REPLY,
 	ETHTOOL_MSG_FEC_GET_REPLY,
 	ETHTOOL_MSG_FEC_NTF,
+	ETHTOOL_MSG_MODULE_EEPROM_GET_REPLY,
 
 	/* add new constants above here */
 	__ETHTOOL_MSG_KERNEL_CNT,
@@ -646,6 +648,23 @@ enum {
 	ETHTOOL_A_FEC_MAX = (__ETHTOOL_A_FEC_CNT - 1)
 };
 
+/* MODULE EEPROM */
+
+enum {
+	ETHTOOL_A_MODULE_EEPROM_UNSPEC,
+	ETHTOOL_A_MODULE_EEPROM_HEADER,			/* nest - _A_HEADER_* */
+
+	ETHTOOL_A_MODULE_EEPROM_OFFSET,			/* u32 */
+	ETHTOOL_A_MODULE_EEPROM_LENGTH,			/* u32 */
+	ETHTOOL_A_MODULE_EEPROM_PAGE,			/* u8 */
+	ETHTOOL_A_MODULE_EEPROM_BANK,			/* u8 */
+	ETHTOOL_A_MODULE_EEPROM_I2C_ADDRESS,		/* u8 */
+	ETHTOOL_A_MODULE_EEPROM_DATA,			/* nested */
+
+	__ETHTOOL_A_MODULE_EEPROM_CNT,
+	ETHTOOL_A_MODULE_EEPROM_MAX = (__ETHTOOL_A_MODULE_EEPROM_CNT - 1)
+};
+
 /* generic netlink info */
 #define ETHTOOL_GENL_NAME "ethtool"
 #define ETHTOOL_GENL_VERSION 1
diff --git a/net/ethtool/Makefile b/net/ethtool/Makefile
index c2dc9033a8f7..83842685fd8c 100644
--- a/net/ethtool/Makefile
+++ b/net/ethtool/Makefile
@@ -7,4 +7,4 @@ obj-$(CONFIG_ETHTOOL_NETLINK)	+= ethtool_nl.o
 ethtool_nl-y	:= netlink.o bitset.o strset.o linkinfo.o linkmodes.o \
 		   linkstate.o debug.o wol.o features.o privflags.o rings.o \
 		   channels.o coalesce.o pause.o eee.o tsinfo.o cabletest.o \
-		   tunnels.o fec.o
+		   tunnels.o fec.o eeprom.o
diff --git a/net/ethtool/eeprom.c b/net/ethtool/eeprom.c
new file mode 100644
index 000000000000..8536dd905da5
--- /dev/null
+++ b/net/ethtool/eeprom.c
@@ -0,0 +1,171 @@
+// SPDX-License-Identifier: GPL-2.0-only
+
+#include <linux/ethtool.h>
+#include "netlink.h"
+#include "common.h"
+
+struct eeprom_req_info {
+	struct ethnl_req_info	base;
+	u32			offset;
+	u32			length;
+	u8			page;
+	u8			bank;
+	u8			i2c_address;
+};
+
+struct eeprom_reply_data {
+	struct ethnl_reply_data base;
+	u32			length;
+	u8			*data;
+};
+
+#define MODULE_EEPROM_REQINFO(__req_base) \
+	container_of(__req_base, struct eeprom_req_info, base)
+
+#define MODULE_EEPROM_REPDATA(__reply_base) \
+	container_of(__reply_base, struct eeprom_reply_data, base)
+
+static int eeprom_prepare_data(const struct ethnl_req_info *req_base,
+			       struct ethnl_reply_data *reply_base,
+			       struct genl_info *info)
+{
+	struct eeprom_reply_data *reply = MODULE_EEPROM_REPDATA(reply_base);
+	struct eeprom_req_info *request = MODULE_EEPROM_REQINFO(req_base);
+	struct ethtool_module_eeprom page_data = {0};
+	struct net_device *dev = reply_base->dev;
+	int ret;
+
+	if (!dev->ethtool_ops->get_module_eeprom_by_page)
+		return -EOPNOTSUPP;
+
+	page_data.offset = request->offset;
+	page_data.length = request->length;
+	page_data.i2c_address = request->i2c_address;
+	page_data.page = request->page;
+	page_data.bank = request->bank;
+	page_data.data = kmalloc(page_data.length, GFP_KERNEL);
+	if (!page_data.data)
+		return -ENOMEM;
+
+	ret = ethnl_ops_begin(dev);
+	if (ret)
+		goto err_free;
+
+	ret = dev->ethtool_ops->get_module_eeprom_by_page(dev, &page_data,
+							  info->extack);
+	if (ret < 0)
+		goto err_ops;
+
+	reply->length = ret;
+	reply->data = page_data.data;
+
+	ethnl_ops_complete(dev);
+	return 0;
+
+err_ops:
+	ethnl_ops_complete(dev);
+err_free:
+	kfree(page_data.data);
+	return ret;
+}
+
+static int eeprom_parse_request(struct ethnl_req_info *req_info, struct nlattr **tb,
+				struct netlink_ext_ack *extack)
+{
+	struct eeprom_req_info *request = MODULE_EEPROM_REQINFO(req_info);
+
+	if (!tb[ETHTOOL_A_MODULE_EEPROM_OFFSET] ||
+	    !tb[ETHTOOL_A_MODULE_EEPROM_LENGTH] ||
+	    !tb[ETHTOOL_A_MODULE_EEPROM_PAGE] ||
+	    !tb[ETHTOOL_A_MODULE_EEPROM_I2C_ADDRESS])
+		return -EINVAL;
+
+	request->i2c_address = nla_get_u8(tb[ETHTOOL_A_MODULE_EEPROM_I2C_ADDRESS]);
+	request->offset = nla_get_u32(tb[ETHTOOL_A_MODULE_EEPROM_OFFSET]);
+	request->length = nla_get_u32(tb[ETHTOOL_A_MODULE_EEPROM_LENGTH]);
+
+	if (!request->length)
+		return -EINVAL;
+
+	/* The following set of conditions limit the API to only dump 1/2
+	 * EEPROM page without crossing low page boundary located at offset 128.
+	 * This means user may only request dumps of length limited to 128 from
+	 * either low 128 bytes or high 128 bytes.
+	 * For pages higher than 0 only high 128 bytes are accessible.
+	 */
+	request->page = nla_get_u8(tb[ETHTOOL_A_MODULE_EEPROM_PAGE]);
+	if (request->page && request->offset < ETH_MODULE_EEPROM_PAGE_LEN) {
+		NL_SET_ERR_MSG_ATTR(extack, tb[ETHTOOL_A_MODULE_EEPROM_PAGE],
+				    "reading from lower half page is allowed for page 0 only");
+		return -EINVAL;
+	}
+
+	if (request->offset < ETH_MODULE_EEPROM_PAGE_LEN &&
+	    request->offset + request->length > ETH_MODULE_EEPROM_PAGE_LEN) {
+		NL_SET_ERR_MSG_ATTR(extack, tb[ETHTOOL_A_MODULE_EEPROM_LENGTH],
+				    "reading cross half page boundary is illegal");
+		return -EINVAL;
+	} else if (request->offset >= ETH_MODULE_EEPROM_PAGE_LEN * 2) {
+		NL_SET_ERR_MSG_ATTR(extack, tb[ETHTOOL_A_MODULE_EEPROM_OFFSET],
+				    "offset is out of bounds");
+		return -EINVAL;
+	} else if (request->offset + request->length > ETH_MODULE_EEPROM_PAGE_LEN * 2) {
+		NL_SET_ERR_MSG_ATTR(extack, tb[ETHTOOL_A_MODULE_EEPROM_LENGTH],
+				    "reading cross page boundary is illegal");
+		return -EINVAL;
+	}
+
+	if (tb[ETHTOOL_A_MODULE_EEPROM_BANK])
+		request->bank = nla_get_u8(tb[ETHTOOL_A_MODULE_EEPROM_BANK]);
+
+	return 0;
+}
+
+static int eeprom_reply_size(const struct ethnl_req_info *req_base,
+			     const struct ethnl_reply_data *reply_base)
+{
+	const struct eeprom_req_info *request = MODULE_EEPROM_REQINFO(req_base);
+
+	return nla_total_size(sizeof(u8) * request->length); /* _EEPROM_DATA */
+}
+
+static int eeprom_fill_reply(struct sk_buff *skb,
+			     const struct ethnl_req_info *req_base,
+			     const struct ethnl_reply_data *reply_base)
+{
+	struct eeprom_reply_data *reply = MODULE_EEPROM_REPDATA(reply_base);
+
+	return nla_put(skb, ETHTOOL_A_MODULE_EEPROM_DATA, reply->length, reply->data);
+}
+
+static void eeprom_cleanup_data(struct ethnl_reply_data *reply_base)
+{
+	struct eeprom_reply_data *reply = MODULE_EEPROM_REPDATA(reply_base);
+
+	kfree(reply->data);
+}
+
+const struct ethnl_request_ops ethnl_module_eeprom_request_ops = {
+	.request_cmd		= ETHTOOL_MSG_MODULE_EEPROM_GET,
+	.reply_cmd		= ETHTOOL_MSG_MODULE_EEPROM_GET_REPLY,
+	.hdr_attr		= ETHTOOL_A_MODULE_EEPROM_HEADER,
+	.req_info_size		= sizeof(struct eeprom_req_info),
+	.reply_data_size	= sizeof(struct eeprom_reply_data),
+
+	.parse_request		= eeprom_parse_request,
+	.prepare_data		= eeprom_prepare_data,
+	.reply_size		= eeprom_reply_size,
+	.fill_reply		= eeprom_fill_reply,
+	.cleanup_data		= eeprom_cleanup_data,
+};
+
+const struct nla_policy ethnl_module_eeprom_get_policy[] = {
+	[ETHTOOL_A_MODULE_EEPROM_HEADER]	= NLA_POLICY_NESTED(ethnl_header_policy),
+	[ETHTOOL_A_MODULE_EEPROM_OFFSET]	= { .type = NLA_U32 },
+	[ETHTOOL_A_MODULE_EEPROM_LENGTH]	= { .type = NLA_U32 },
+	[ETHTOOL_A_MODULE_EEPROM_PAGE]		= { .type = NLA_U8 },
+	[ETHTOOL_A_MODULE_EEPROM_BANK]		= { .type = NLA_U8 },
+	[ETHTOOL_A_MODULE_EEPROM_I2C_ADDRESS]	=
+		NLA_POLICY_RANGE(NLA_U8, 0, ETH_MODULE_MAX_I2C_ADDRESS),
+};
+
diff --git a/net/ethtool/netlink.c b/net/ethtool/netlink.c
index 705a4b201564..5f5d7c4b3d4a 100644
--- a/net/ethtool/netlink.c
+++ b/net/ethtool/netlink.c
@@ -246,6 +246,7 @@ ethnl_default_requests[__ETHTOOL_MSG_USER_CNT] = {
 	[ETHTOOL_MSG_EEE_GET]		= &ethnl_eee_request_ops,
 	[ETHTOOL_MSG_FEC_GET]		= &ethnl_fec_request_ops,
 	[ETHTOOL_MSG_TSINFO_GET]	= &ethnl_tsinfo_request_ops,
+	[ETHTOOL_MSG_MODULE_EEPROM_GET]	= &ethnl_module_eeprom_request_ops,
 };
 
 static struct ethnl_dump_ctx *ethnl_dump_context(struct netlink_callback *cb)
@@ -931,6 +932,16 @@ static const struct genl_ops ethtool_genl_ops[] = {
 		.policy = ethnl_fec_set_policy,
 		.maxattr = ARRAY_SIZE(ethnl_fec_set_policy) - 1,
 	},
+	{
+		.cmd	= ETHTOOL_MSG_MODULE_EEPROM_GET,
+		.flags  = GENL_UNS_ADMIN_PERM,
+		.doit	= ethnl_default_doit,
+		.start	= ethnl_default_start,
+		.dumpit	= ethnl_default_dumpit,
+		.done	= ethnl_default_done,
+		.policy = ethnl_module_eeprom_get_policy,
+		.maxattr = ARRAY_SIZE(ethnl_module_eeprom_get_policy) - 1,
+	},
 };
 
 static const struct genl_multicast_group ethtool_nl_mcgrps[] = {
diff --git a/net/ethtool/netlink.h b/net/ethtool/netlink.h
index 785f7ee45930..4305ac971bb0 100644
--- a/net/ethtool/netlink.h
+++ b/net/ethtool/netlink.h
@@ -345,6 +345,7 @@ extern const struct ethnl_request_ops ethnl_pause_request_ops;
 extern const struct ethnl_request_ops ethnl_eee_request_ops;
 extern const struct ethnl_request_ops ethnl_tsinfo_request_ops;
 extern const struct ethnl_request_ops ethnl_fec_request_ops;
+extern const struct ethnl_request_ops ethnl_module_eeprom_request_ops;
 
 extern const struct nla_policy ethnl_header_policy[ETHTOOL_A_HEADER_FLAGS + 1];
 extern const struct nla_policy ethnl_header_policy_stats[ETHTOOL_A_HEADER_FLAGS + 1];
@@ -378,6 +379,7 @@ extern const struct nla_policy ethnl_cable_test_tdr_act_policy[ETHTOOL_A_CABLE_T
 extern const struct nla_policy ethnl_tunnel_info_get_policy[ETHTOOL_A_TUNNEL_INFO_HEADER + 1];
 extern const struct nla_policy ethnl_fec_get_policy[ETHTOOL_A_FEC_HEADER + 1];
 extern const struct nla_policy ethnl_fec_set_policy[ETHTOOL_A_FEC_AUTO + 1];
+extern const struct nla_policy ethnl_module_eeprom_get_policy[ETHTOOL_A_MODULE_EEPROM_DATA + 1];
 
 int ethnl_set_linkinfo(struct sk_buff *skb, struct genl_info *info);
 int ethnl_set_linkmodes(struct sk_buff *skb, struct genl_info *info);
-- 
cgit v1.2.3-71-gd317


From 88e41cf928a6e1a0eb5a9492e2d091ec6193cce4 Mon Sep 17 00:00:00 2001
From: Jens Axboe <axboe@kernel.dk>
Date: Mon, 22 Feb 2021 22:08:01 -0700
Subject: io_uring: add multishot mode for IORING_OP_POLL_ADD

The default io_uring poll mode is one-shot, where once the event triggers,
the poll command is completed and won't trigger any further events. If
we're doing repeated polling on the same file or socket, then it can be
more efficient to do multishot, where we keep triggering whenever the
event becomes true.

This deviates from the usual norm of having one CQE per SQE submitted. Add
a CQE flag, IORING_CQE_F_MORE, which tells the application to expect
further completion events from the submitted SQE. Right now the only user
of this is POLL_ADD in multishot mode.

Since sqe->poll_events is using the space that we normally use for adding
flags to commands, use sqe->len for the flag space for POLL_ADD. Multishot
mode is selected by setting IORING_POLL_ADD_MULTI in sqe->len. An
application should expect more CQEs for the specificed SQE if the CQE is
flagged with IORING_CQE_F_MORE. In multishot mode, only cancelation or an
error will terminate the poll request, in which case the flag will be
cleared.

Signed-off-by: Jens Axboe <axboe@kernel.dk>
---
 fs/io_uring.c                 | 64 +++++++++++++++++++++++++++++++------------
 include/uapi/linux/io_uring.h | 12 ++++++++
 2 files changed, 58 insertions(+), 18 deletions(-)

(limited to 'include/uapi/linux')

diff --git a/fs/io_uring.c b/fs/io_uring.c
index e5c2bb258db0..80db5898e119 100644
--- a/fs/io_uring.c
+++ b/fs/io_uring.c
@@ -4898,17 +4898,25 @@ static void io_poll_remove_double(struct io_kiocb *req)
 	}
 }
 
-static void io_poll_complete(struct io_kiocb *req, __poll_t mask, int error)
+static bool io_poll_complete(struct io_kiocb *req, __poll_t mask, int error)
 {
 	struct io_ring_ctx *ctx = req->ctx;
+	unsigned flags = IORING_CQE_F_MORE;
 
-	if (!error && req->poll.canceled)
+	if (!error && req->poll.canceled) {
 		error = -ECANCELED;
-
-	io_poll_remove_double(req);
-	req->poll.done = true;
-	io_cqring_fill_event(req, error ? error : mangle_poll(mask));
+		req->poll.events |= EPOLLONESHOT;
+	}
+	if (error || (req->poll.events & EPOLLONESHOT)) {
+		io_poll_remove_double(req);
+		req->poll.done = true;
+		flags = 0;
+	}
+	if (!error)
+		error = mangle_poll(mask);
+	__io_cqring_fill_event(req, error, flags);
 	io_commit_cqring(ctx);
+	return !(flags & IORING_CQE_F_MORE);
 }
 
 static void io_poll_task_func(struct callback_head *cb)
@@ -4920,14 +4928,25 @@ static void io_poll_task_func(struct callback_head *cb)
 	if (io_poll_rewait(req, &req->poll)) {
 		spin_unlock_irq(&ctx->completion_lock);
 	} else {
-		hash_del(&req->hash_node);
-		io_poll_complete(req, req->result, 0);
+		bool done, post_ev;
+
+		post_ev = done = io_poll_complete(req, req->result, 0);
+		if (done) {
+			hash_del(&req->hash_node);
+		} else if (!(req->poll.events & EPOLLONESHOT)) {
+			post_ev = true;
+			req->result = 0;
+			add_wait_queue(req->poll.head, &req->poll.wait);
+		}
 		spin_unlock_irq(&ctx->completion_lock);
 
-		nxt = io_put_req_find_next(req);
-		io_cqring_ev_posted(ctx);
-		if (nxt)
-			__io_req_task_submit(nxt);
+		if (post_ev)
+			io_cqring_ev_posted(ctx);
+		if (done) {
+			nxt = io_put_req_find_next(req);
+			if (nxt)
+				__io_req_task_submit(nxt);
+		}
 	}
 }
 
@@ -4941,6 +4960,8 @@ static int io_poll_double_wake(struct wait_queue_entry *wait, unsigned mode,
 	/* for instances that support it check for an event match first: */
 	if (mask && !(mask & poll->events))
 		return 0;
+	if (!(poll->events & EPOLLONESHOT))
+		return poll->wait.func(&poll->wait, mode, sync, key);
 
 	list_del_init(&wait->entry);
 
@@ -5106,7 +5127,7 @@ static __poll_t __io_arm_poll_handler(struct io_kiocb *req,
 			ipt->error = 0;
 			mask = 0;
 		}
-		if (mask || ipt->error)
+		if ((mask && (poll->events & EPOLLONESHOT)) || ipt->error)
 			list_del_init(&poll->wait.entry);
 		else if (cancel)
 			WRITE_ONCE(poll->canceled, true);
@@ -5149,7 +5170,7 @@ static bool io_arm_poll_handler(struct io_kiocb *req)
 	req->flags |= REQ_F_POLLED;
 	req->apoll = apoll;
 
-	mask = 0;
+	mask = EPOLLONESHOT;
 	if (def->pollin)
 		mask |= POLLIN | POLLRDNORM;
 	if (def->pollout)
@@ -5322,18 +5343,24 @@ static void io_poll_queue_proc(struct file *file, struct wait_queue_head *head,
 static int io_poll_add_prep(struct io_kiocb *req, const struct io_uring_sqe *sqe)
 {
 	struct io_poll_iocb *poll = &req->poll;
-	u32 events;
+	u32 events, flags;
 
 	if (unlikely(req->ctx->flags & IORING_SETUP_IOPOLL))
 		return -EINVAL;
-	if (sqe->addr || sqe->ioprio || sqe->off || sqe->len || sqe->buf_index)
+	if (sqe->addr || sqe->ioprio || sqe->off || sqe->buf_index)
+		return -EINVAL;
+	flags = READ_ONCE(sqe->len);
+	if (flags & ~IORING_POLL_ADD_MULTI)
 		return -EINVAL;
 
 	events = READ_ONCE(sqe->poll32_events);
 #ifdef __BIG_ENDIAN
 	events = swahw32(events);
 #endif
-	poll->events = demangle_poll(events) | (events & EPOLLEXCLUSIVE);
+	if (!flags)
+		events |= EPOLLONESHOT;
+	poll->events = demangle_poll(events) |
+				(events & (EPOLLEXCLUSIVE|EPOLLONESHOT));
 	return 0;
 }
 
@@ -5357,7 +5384,8 @@ static int io_poll_add(struct io_kiocb *req, unsigned int issue_flags)
 
 	if (mask) {
 		io_cqring_ev_posted(ctx);
-		io_put_req(req);
+		if (poll->events & EPOLLONESHOT)
+			io_put_req(req);
 	}
 	return ipt.error;
 }
diff --git a/include/uapi/linux/io_uring.h b/include/uapi/linux/io_uring.h
index 2514eb6b1cf2..76c967621601 100644
--- a/include/uapi/linux/io_uring.h
+++ b/include/uapi/linux/io_uring.h
@@ -159,6 +159,16 @@ enum {
  */
 #define SPLICE_F_FD_IN_FIXED	(1U << 31) /* the last bit of __u32 */
 
+/*
+ * POLL_ADD flags. Note that since sqe->poll_events is the flag space, the
+ * command flags for POLL_ADD are stored in sqe->len.
+ *
+ * IORING_POLL_ADD_MULTI	Multishot poll. Sets IORING_CQE_F_MORE if
+ *				the poll handler will continue to report
+ *				CQEs on behalf of the same SQE.
+ */
+#define IORING_POLL_ADD_MULTI	(1U << 0)
+
 /*
  * IO completion data structure (Completion Queue Entry)
  */
@@ -172,8 +182,10 @@ struct io_uring_cqe {
  * cqe->flags
  *
  * IORING_CQE_F_BUFFER	If set, the upper 16 bits are the buffer ID
+ * IORING_CQE_F_MORE	If set, parent SQE will generate more CQE entries
  */
 #define IORING_CQE_F_BUFFER		(1U << 0)
+#define IORING_CQE_F_MORE		(1U << 1)
 
 enum {
 	IORING_CQE_BUFFER_SHIFT		= 16,
-- 
cgit v1.2.3-71-gd317


From b69de288e913030082bed3a324ddc58be6c1e983 Mon Sep 17 00:00:00 2001
From: Jens Axboe <axboe@kernel.dk>
Date: Wed, 17 Mar 2021 08:37:41 -0600
Subject: io_uring: allow events and user_data update of running poll requests

This adds two new POLL_ADD flags, IORING_POLL_UPDATE_EVENTS and
IORING_POLL_UPDATE_USER_DATA. As with the other POLL_ADD flag, these are
masked into sqe->len. If set, the POLL_ADD will have the following
behavior:

- sqe->addr must contain the the user_data of the poll request that
  needs to be modified. This field is otherwise invalid for a POLL_ADD
  command.

- If IORING_POLL_UPDATE_EVENTS is set, sqe->poll_events must contain the
  new mask for the existing poll request. There are no checks for whether
  these are identical or not, if a matching poll request is found, then it
  is re-armed with the new mask.

- If IORING_POLL_UPDATE_USER_DATA is set, sqe->off must contain the new
  user_data for the existing poll request.

A POLL_ADD with any of these flags set may complete with any of the
following results:

1) 0, which means that we successfully found the existing poll request
   specified, and performed the re-arm procedure. Any error from that
   re-arm will be exposed as a completion event for that original poll
   request, not for the update request.
2) -ENOENT, if no existing poll request was found with the given
   user_data.
3) -EALREADY, if the existing poll request was already in the process of
   being removed/canceled/completing.
4) -EACCES, if an attempt was made to modify an internal poll request
   (eg not one originally issued ass IORING_OP_POLL_ADD).

The usual -EINVAL cases apply as well, if any invalid fields are set
in the sqe for this command type.

Signed-off-by: Jens Axboe <axboe@kernel.dk>
---
 fs/io_uring.c                 | 95 +++++++++++++++++++++++++++++++++++++++----
 include/uapi/linux/io_uring.h |  5 +++
 2 files changed, 92 insertions(+), 8 deletions(-)

(limited to 'include/uapi/linux')

diff --git a/fs/io_uring.c b/fs/io_uring.c
index 3c54e8c9f81f..eeb165253491 100644
--- a/fs/io_uring.c
+++ b/fs/io_uring.c
@@ -486,7 +486,15 @@ struct io_poll_iocb {
 	__poll_t			events;
 	bool				done;
 	bool				canceled;
-	struct wait_queue_entry		wait;
+	bool				update_events;
+	bool				update_user_data;
+	union {
+		struct wait_queue_entry	wait;
+		struct {
+			u64		old_user_data;
+			u64		new_user_data;
+		};
+	};
 };
 
 struct io_poll_remove {
@@ -4911,8 +4919,9 @@ static bool io_poll_complete(struct io_kiocb *req, __poll_t mask, int error)
 	}
 	if (!error)
 		error = mangle_poll(mask);
-	if (!__io_cqring_fill_event(req, error, flags) ||
-	    (req->poll.events & EPOLLONESHOT)) {
+	if (req->poll.events & EPOLLONESHOT)
+		flags = 0;
+	if (!__io_cqring_fill_event(req, error, flags)) {
 		io_poll_remove_waitqs(req);
 		req->poll.done = true;
 		flags = 0;
@@ -4992,6 +5001,7 @@ static void io_init_poll_iocb(struct io_poll_iocb *poll, __poll_t events,
 	poll->head = NULL;
 	poll->done = false;
 	poll->canceled = false;
+	poll->update_events = poll->update_user_data = false;
 #define IO_POLL_UNMASK	(EPOLLERR|EPOLLHUP|EPOLLNVAL|EPOLLRDHUP)
 	/* mask in events that we always want/need */
 	poll->events = events | IO_POLL_UNMASK;
@@ -5370,24 +5380,36 @@ static int io_poll_add_prep(struct io_kiocb *req, const struct io_uring_sqe *sqe
 
 	if (unlikely(req->ctx->flags & IORING_SETUP_IOPOLL))
 		return -EINVAL;
-	if (sqe->addr || sqe->ioprio || sqe->off || sqe->buf_index)
+	if (sqe->ioprio || sqe->buf_index)
 		return -EINVAL;
 	flags = READ_ONCE(sqe->len);
-	if (flags & ~IORING_POLL_ADD_MULTI)
+	if (flags & ~(IORING_POLL_ADD_MULTI | IORING_POLL_UPDATE_EVENTS |
+			IORING_POLL_UPDATE_USER_DATA))
 		return -EINVAL;
-
 	events = READ_ONCE(sqe->poll32_events);
 #ifdef __BIG_ENDIAN
 	events = swahw32(events);
 #endif
-	if (!flags)
+	if (!(flags & IORING_POLL_ADD_MULTI))
 		events |= EPOLLONESHOT;
+	poll->update_events = poll->update_user_data = false;
+	if (flags & IORING_POLL_UPDATE_EVENTS) {
+		poll->update_events = true;
+		poll->old_user_data = READ_ONCE(sqe->addr);
+	}
+	if (flags & IORING_POLL_UPDATE_USER_DATA) {
+		poll->update_user_data = true;
+		poll->new_user_data = READ_ONCE(sqe->off);
+	}
+	if (!(poll->update_events || poll->update_user_data) &&
+	     (sqe->off || sqe->addr))
+		return -EINVAL;
 	poll->events = demangle_poll(events) |
 				(events & (EPOLLEXCLUSIVE|EPOLLONESHOT));
 	return 0;
 }
 
-static int io_poll_add(struct io_kiocb *req, unsigned int issue_flags)
+static int __io_poll_add(struct io_kiocb *req)
 {
 	struct io_poll_iocb *poll = &req->poll;
 	struct io_ring_ctx *ctx = req->ctx;
@@ -5413,6 +5435,63 @@ static int io_poll_add(struct io_kiocb *req, unsigned int issue_flags)
 	return ipt.error;
 }
 
+static int io_poll_update(struct io_kiocb *req)
+{
+	struct io_ring_ctx *ctx = req->ctx;
+	struct io_kiocb *preq;
+	int ret;
+
+	spin_lock_irq(&ctx->completion_lock);
+	preq = io_poll_find(ctx, req->poll.old_user_data);
+	if (!preq) {
+		ret = -ENOENT;
+		goto err;
+	} else if (preq->opcode != IORING_OP_POLL_ADD) {
+		/* don't allow internal poll updates */
+		ret = -EACCES;
+		goto err;
+	}
+	if (!__io_poll_remove_one(preq, &preq->poll)) {
+		/* in process of completing/removal */
+		ret = -EALREADY;
+		goto err;
+	}
+	/* we now have a detached poll request. reissue. */
+	ret = 0;
+err:
+	spin_unlock_irq(&ctx->completion_lock);
+	if (ret < 0) {
+		req_set_fail_links(req);
+		io_req_complete(req, ret);
+		return 0;
+	}
+	/* only mask one event flags, keep behavior flags */
+	if (req->poll.update_events) {
+		preq->poll.events &= ~0xffff;
+		preq->poll.events |= req->poll.events & 0xffff;
+		preq->poll.events |= IO_POLL_UNMASK;
+	}
+	if (req->poll.update_user_data)
+		preq->user_data = req->poll.new_user_data;
+
+	/* complete update request, we're done with it */
+	io_req_complete(req, ret);
+
+	ret = __io_poll_add(preq);
+	if (ret < 0) {
+		req_set_fail_links(preq);
+		io_req_complete(preq, ret);
+	}
+	return 0;
+}
+
+static int io_poll_add(struct io_kiocb *req, unsigned int issue_flags)
+{
+	if (!req->poll.update_events && !req->poll.update_user_data)
+		return __io_poll_add(req);
+	return io_poll_update(req);
+}
+
 static enum hrtimer_restart io_timeout_fn(struct hrtimer *timer)
 {
 	struct io_timeout_data *data = container_of(timer,
diff --git a/include/uapi/linux/io_uring.h b/include/uapi/linux/io_uring.h
index 76c967621601..5beaa6bbc6db 100644
--- a/include/uapi/linux/io_uring.h
+++ b/include/uapi/linux/io_uring.h
@@ -166,8 +166,13 @@ enum {
  * IORING_POLL_ADD_MULTI	Multishot poll. Sets IORING_CQE_F_MORE if
  *				the poll handler will continue to report
  *				CQEs on behalf of the same SQE.
+ *
+ * IORING_POLL_UPDATE		Update existing poll request, matching
+ *				sqe->addr as the old user_data field.
  */
 #define IORING_POLL_ADD_MULTI	(1U << 0)
+#define IORING_POLL_UPDATE_EVENTS	(1U << 1)
+#define IORING_POLL_UPDATE_USER_DATA	(1U << 2)
 
 /*
  * IO completion data structure (Completion Queue Entry)
-- 
cgit v1.2.3-71-gd317


From 4ac823e9cd85f66da274c951d21bf9f6b714b729 Mon Sep 17 00:00:00 2001
From: Dave Jiang <dave.jiang@intel.com>
Date: Mon, 22 Mar 2021 16:36:25 -0700
Subject: dmaengine: idxd: fix delta_rec and crc size field for completion
 record

The delta_rec_size and crc_val in the completion record should
be 32bits and not 16bits.

Fixes: bfe1d56091c1 ("dmaengine: idxd: Init and probe for Intel data accelerators")
Reported-by: Nikhil Rao <nikhil.rao@intel.com>
Signed-off-by: Dave Jiang <dave.jiang@intel.com>
Link: https://lore.kernel.org/r/161645618572.2003490.14466173451736323035.stgit@djiang5-desk3.ch.intel.com
Signed-off-by: Vinod Koul <vkoul@kernel.org>
---
 include/uapi/linux/idxd.h | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

(limited to 'include/uapi/linux')

diff --git a/include/uapi/linux/idxd.h b/include/uapi/linux/idxd.h
index 236d437947bc..e33997b4d750 100644
--- a/include/uapi/linux/idxd.h
+++ b/include/uapi/linux/idxd.h
@@ -247,8 +247,8 @@ struct dsa_completion_record {
 			uint32_t	rsvd2:8;
 		};
 
-		uint16_t	delta_rec_size;
-		uint16_t	crc_val;
+		uint32_t	delta_rec_size;
+		uint32_t	crc_val;
 
 		/* DIF check & strip */
 		struct {
-- 
cgit v1.2.3-71-gd317


From f3c45326ee71d1d3ec11e9ddb5afc04bca9ae492 Mon Sep 17 00:00:00 2001
From: Joe Stringer <joe@cilium.io>
Date: Sat, 10 Apr 2021 10:45:48 -0700
Subject: bpf: Document PROG_TEST_RUN limitations

Per net/bpf/test_run.c, particular prog types have additional
restrictions around the parameters that can be provided, so document
these in the header.

I didn't bother documenting the limitation on duration for raw
tracepoints since that's an output parameter anyway.

Tested with ./tools/testing/selftests/bpf/test_doc_build.sh.

Suggested-by: Yonghong Song <yhs@fb.com>
Signed-off-by: Joe Stringer <joe@cilium.io>
Signed-off-by: Daniel Borkmann <daniel@iogearbox.net>
Acked-by: Yonghong Song <yhs@fb.com>
Acked-by: Lorenz Bauer <lmb@cloudflare.com>
Link: https://lore.kernel.org/bpf/20210410174549.816482-1-joe@cilium.io
---
 include/uapi/linux/bpf.h | 21 +++++++++++++++++++++
 1 file changed, 21 insertions(+)

(limited to 'include/uapi/linux')

diff --git a/include/uapi/linux/bpf.h b/include/uapi/linux/bpf.h
index 49371eba98ba..e1ee1be7e49b 100644
--- a/include/uapi/linux/bpf.h
+++ b/include/uapi/linux/bpf.h
@@ -312,6 +312,27 @@ union bpf_iter_link_info {
  *		*ctx_out*, *data_out* (for example, packet data), result of the
  *		execution *retval*, and *duration* of the test run.
  *
+ *		The sizes of the buffers provided as input and output
+ *		parameters *ctx_in*, *ctx_out*, *data_in*, and *data_out* must
+ *		be provided in the corresponding variables *ctx_size_in*,
+ *		*ctx_size_out*, *data_size_in*, and/or *data_size_out*. If any
+ *		of these parameters are not provided (ie set to NULL), the
+ *		corresponding size field must be zero.
+ *
+ *		Some program types have particular requirements:
+ *
+ *		**BPF_PROG_TYPE_SK_LOOKUP**
+ *			*data_in* and *data_out* must be NULL.
+ *
+ *		**BPF_PROG_TYPE_XDP**
+ *			*ctx_in* and *ctx_out* must be NULL.
+ *
+ *		**BPF_PROG_TYPE_RAW_TRACEPOINT**,
+ *		**BPF_PROG_TYPE_RAW_TRACEPOINT_WRITABLE**
+ *
+ *			*ctx_out*, *data_in* and *data_out* must be NULL.
+ *			*repeat* must be zero.
+ *
  *	Return
  *		Returns zero on success. On error, -1 is returned and *errno*
  *		is set appropriately.
-- 
cgit v1.2.3-71-gd317


From 5c507329000e282dce91e6c98ee6ffa61a8a5e49 Mon Sep 17 00:00:00 2001
From: Pedro Tammela <pctammela@gmail.com>
Date: Mon, 12 Apr 2021 16:24:32 -0300
Subject: libbpf: Clarify flags in ringbuf helpers

In 'bpf_ringbuf_reserve()' we require the flag to '0' at the moment.

For 'bpf_ringbuf_{discard,submit,output}' a flag of '0' might send a
notification to the process if needed.

Signed-off-by: Pedro Tammela <pctammela@mojatatu.com>
Signed-off-by: Andrii Nakryiko <andrii@kernel.org>
Link: https://lore.kernel.org/bpf/20210412192434.944343-1-pctammela@mojatatu.com
---
 include/uapi/linux/bpf.h       | 16 ++++++++++++++++
 tools/include/uapi/linux/bpf.h | 16 ++++++++++++++++
 2 files changed, 32 insertions(+)

(limited to 'include/uapi/linux')

diff --git a/include/uapi/linux/bpf.h b/include/uapi/linux/bpf.h
index e1ee1be7e49b..85c924bc21b1 100644
--- a/include/uapi/linux/bpf.h
+++ b/include/uapi/linux/bpf.h
@@ -4082,12 +4082,20 @@ union bpf_attr {
  * 		of new data availability is sent.
  * 		If **BPF_RB_FORCE_WAKEUP** is specified in *flags*, notification
  * 		of new data availability is sent unconditionally.
+ * 		If **0** is specified in *flags*, an adaptive notification
+ * 		of new data availability is sent.
+ *
+ * 		An adaptive notification is a notification sent whenever the user-space
+ * 		process has caught up and consumed all available payloads. In case the user-space
+ * 		process is still processing a previous payload, then no notification is needed
+ * 		as it will process the newly added payload automatically.
  * 	Return
  * 		0 on success, or a negative error in case of failure.
  *
  * void *bpf_ringbuf_reserve(void *ringbuf, u64 size, u64 flags)
  * 	Description
  * 		Reserve *size* bytes of payload in a ring buffer *ringbuf*.
+ * 		*flags* must be 0.
  * 	Return
  * 		Valid pointer with *size* bytes of memory available; NULL,
  * 		otherwise.
@@ -4099,6 +4107,10 @@ union bpf_attr {
  * 		of new data availability is sent.
  * 		If **BPF_RB_FORCE_WAKEUP** is specified in *flags*, notification
  * 		of new data availability is sent unconditionally.
+ * 		If **0** is specified in *flags*, an adaptive notification
+ * 		of new data availability is sent.
+ *
+ * 		See 'bpf_ringbuf_output()' for the definition of adaptive notification.
  * 	Return
  * 		Nothing. Always succeeds.
  *
@@ -4109,6 +4121,10 @@ union bpf_attr {
  * 		of new data availability is sent.
  * 		If **BPF_RB_FORCE_WAKEUP** is specified in *flags*, notification
  * 		of new data availability is sent unconditionally.
+ * 		If **0** is specified in *flags*, an adaptive notification
+ * 		of new data availability is sent.
+ *
+ * 		See 'bpf_ringbuf_output()' for the definition of adaptive notification.
  * 	Return
  * 		Nothing. Always succeeds.
  *
diff --git a/tools/include/uapi/linux/bpf.h b/tools/include/uapi/linux/bpf.h
index e1ee1be7e49b..85c924bc21b1 100644
--- a/tools/include/uapi/linux/bpf.h
+++ b/tools/include/uapi/linux/bpf.h
@@ -4082,12 +4082,20 @@ union bpf_attr {
  * 		of new data availability is sent.
  * 		If **BPF_RB_FORCE_WAKEUP** is specified in *flags*, notification
  * 		of new data availability is sent unconditionally.
+ * 		If **0** is specified in *flags*, an adaptive notification
+ * 		of new data availability is sent.
+ *
+ * 		An adaptive notification is a notification sent whenever the user-space
+ * 		process has caught up and consumed all available payloads. In case the user-space
+ * 		process is still processing a previous payload, then no notification is needed
+ * 		as it will process the newly added payload automatically.
  * 	Return
  * 		0 on success, or a negative error in case of failure.
  *
  * void *bpf_ringbuf_reserve(void *ringbuf, u64 size, u64 flags)
  * 	Description
  * 		Reserve *size* bytes of payload in a ring buffer *ringbuf*.
+ * 		*flags* must be 0.
  * 	Return
  * 		Valid pointer with *size* bytes of memory available; NULL,
  * 		otherwise.
@@ -4099,6 +4107,10 @@ union bpf_attr {
  * 		of new data availability is sent.
  * 		If **BPF_RB_FORCE_WAKEUP** is specified in *flags*, notification
  * 		of new data availability is sent unconditionally.
+ * 		If **0** is specified in *flags*, an adaptive notification
+ * 		of new data availability is sent.
+ *
+ * 		See 'bpf_ringbuf_output()' for the definition of adaptive notification.
  * 	Return
  * 		Nothing. Always succeeds.
  *
@@ -4109,6 +4121,10 @@ union bpf_attr {
  * 		of new data availability is sent.
  * 		If **BPF_RB_FORCE_WAKEUP** is specified in *flags*, notification
  * 		of new data availability is sent unconditionally.
+ * 		If **0** is specified in *flags*, an adaptive notification
+ * 		of new data availability is sent.
+ *
+ * 		See 'bpf_ringbuf_output()' for the definition of adaptive notification.
  * 	Return
  * 		Nothing. Always succeeds.
  *
-- 
cgit v1.2.3-71-gd317


From 655cdafdec1105d0552aa19ffb5ffef7aead1548 Mon Sep 17 00:00:00 2001
From: Zhang Yunkai <zhang.yunkai@zte.com.cn>
Date: Tue, 13 Apr 2021 10:52:56 +0000
Subject: lightnvm: remove duplicate include in lightnvm.h
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

'linux/blkdev.h' and 'uapi/linux/lightnvm.h' included in 'lightnvm.h'
is duplicated.It is also included in the 5th and 7th line.

Signed-off-by: Zhang Yunkai <zhang.yunkai@zte.com.cn>
Signed-off-by: Matias Bjørling <matias.bjorling@wdc.com>
Link: https://lore.kernel.org/r/20210413105257.159260-4-matias.bjorling@wdc.com
Signed-off-by: Jens Axboe <axboe@kernel.dk>
---
 include/linux/lightnvm.h      | 2 --
 include/uapi/linux/lightnvm.h | 1 -
 2 files changed, 3 deletions(-)

(limited to 'include/uapi/linux')

diff --git a/include/linux/lightnvm.h b/include/linux/lightnvm.h
index 1db223710b28..0908abda9c1b 100644
--- a/include/linux/lightnvm.h
+++ b/include/linux/lightnvm.h
@@ -112,10 +112,8 @@ struct nvm_dev_ops {
 
 #ifdef CONFIG_NVM
 
-#include <linux/blkdev.h>
 #include <linux/file.h>
 #include <linux/dmapool.h>
-#include <uapi/linux/lightnvm.h>
 
 enum {
 	/* HW Responsibilities */
diff --git a/include/uapi/linux/lightnvm.h b/include/uapi/linux/lightnvm.h
index ead2e72e5c88..2745afd9b8fa 100644
--- a/include/uapi/linux/lightnvm.h
+++ b/include/uapi/linux/lightnvm.h
@@ -22,7 +22,6 @@
 
 #ifdef __KERNEL__
 #include <linux/const.h>
-#include <linux/ioctl.h>
 #else /* __KERNEL__ */
 #include <stdio.h>
 #include <sys/ioctl.h>
-- 
cgit v1.2.3-71-gd317


From 201698626fbca1cf1a3b686ba14cf2a056500716 Mon Sep 17 00:00:00 2001
From: Peter Collingbourne <pcc@google.com>
Date: Thu, 18 Mar 2021 20:10:53 -0700
Subject: arm64: Introduce prctl(PR_PAC_{SET,GET}_ENABLED_KEYS)

This change introduces a prctl that allows the user program to control
which PAC keys are enabled in a particular task. The main reason
why this is useful is to enable a userspace ABI that uses PAC to
sign and authenticate function pointers and other pointers exposed
outside of the function, while still allowing binaries conforming
to the ABI to interoperate with legacy binaries that do not sign or
authenticate pointers.

The idea is that a dynamic loader or early startup code would issue
this prctl very early after establishing that a process may load legacy
binaries, but before executing any PAC instructions.

This change adds a small amount of overhead to kernel entry and exit
due to additional required instruction sequences.

On a DragonBoard 845c (Cortex-A75) with the powersave governor, the
overhead of similar instruction sequences was measured as 4.9ns when
simulating the common case where IA is left enabled, or 43.7ns when
simulating the uncommon case where IA is disabled. These numbers can
be seen as the worst case scenario, since in more realistic scenarios
a better performing governor would be used and a newer chip would be
used that would support PAC unlike Cortex-A75 and would be expected
to be faster than Cortex-A75.

On an Apple M1 under a hypervisor, the overhead of the entry/exit
instruction sequences introduced by this patch was measured as 0.3ns
in the case where IA is left enabled, and 33.0ns in the case where
IA is disabled.

Signed-off-by: Peter Collingbourne <pcc@google.com>
Reviewed-by: Dave Martin <Dave.Martin@arm.com>
Link: https://linux-review.googlesource.com/id/Ibc41a5e6a76b275efbaa126b31119dc197b927a5
Link: https://lore.kernel.org/r/d6609065f8f40397a4124654eb68c9f490b4d477.1616123271.git.pcc@google.com
Signed-off-by: Catalin Marinas <catalin.marinas@arm.com>
---
 Documentation/arm64/pointer-authentication.rst | 34 ++++++++++++++
 arch/arm64/include/asm/mte.h                   |  4 +-
 arch/arm64/include/asm/pointer_auth.h          | 25 +++++++++--
 arch/arm64/include/asm/processor.h             |  9 +++-
 arch/arm64/include/asm/sysreg.h                |  4 +-
 arch/arm64/kernel/asm-offsets.c                |  1 +
 arch/arm64/kernel/entry.S                      | 39 +++++++++++++++-
 arch/arm64/kernel/mte.c                        |  2 +-
 arch/arm64/kernel/pointer_auth.c               | 62 ++++++++++++++++++++++++++
 arch/arm64/kernel/process.c                    | 10 +++--
 arch/arm64/kernel/ptrace.c                     | 41 +++++++++++++++++
 include/uapi/linux/elf.h                       |  1 +
 include/uapi/linux/prctl.h                     |  4 ++
 kernel/sys.c                                   | 16 +++++++
 14 files changed, 239 insertions(+), 13 deletions(-)

(limited to 'include/uapi/linux')

diff --git a/Documentation/arm64/pointer-authentication.rst b/Documentation/arm64/pointer-authentication.rst
index 30b2ab06526b..f127666ea3a8 100644
--- a/Documentation/arm64/pointer-authentication.rst
+++ b/Documentation/arm64/pointer-authentication.rst
@@ -107,3 +107,37 @@ filter out the Pointer Authentication system key registers from
 KVM_GET/SET_REG_* ioctls and mask those features from cpufeature ID
 register. Any attempt to use the Pointer Authentication instructions will
 result in an UNDEFINED exception being injected into the guest.
+
+
+Enabling and disabling keys
+---------------------------
+
+The prctl PR_PAC_SET_ENABLED_KEYS allows the user program to control which
+PAC keys are enabled in a particular task. It takes two arguments, the
+first being a bitmask of PR_PAC_APIAKEY, PR_PAC_APIBKEY, PR_PAC_APDAKEY
+and PR_PAC_APDBKEY specifying which keys shall be affected by this prctl,
+and the second being a bitmask of the same bits specifying whether the key
+should be enabled or disabled. For example::
+
+  prctl(PR_PAC_SET_ENABLED_KEYS,
+        PR_PAC_APIAKEY | PR_PAC_APIBKEY | PR_PAC_APDAKEY | PR_PAC_APDBKEY,
+        PR_PAC_APIBKEY, 0, 0);
+
+disables all keys except the IB key.
+
+The main reason why this is useful is to enable a userspace ABI that uses PAC
+instructions to sign and authenticate function pointers and other pointers
+exposed outside of the function, while still allowing binaries conforming to
+the ABI to interoperate with legacy binaries that do not sign or authenticate
+pointers.
+
+The idea is that a dynamic loader or early startup code would issue this
+prctl very early after establishing that a process may load legacy binaries,
+but before executing any PAC instructions.
+
+For compatibility with previous kernel versions, processes start up with IA,
+IB, DA and DB enabled, and are reset to this state on exec(). Processes created
+via fork() and clone() inherit the key enabled state from the calling process.
+
+It is recommended to avoid disabling the IA key, as this has higher performance
+overhead than disabling any of the other keys.
diff --git a/arch/arm64/include/asm/mte.h b/arch/arm64/include/asm/mte.h
index a38abc15186c..5d6a53cca5d5 100644
--- a/arch/arm64/include/asm/mte.h
+++ b/arch/arm64/include/asm/mte.h
@@ -39,7 +39,7 @@ void mte_free_tag_storage(char *storage);
 
 void mte_sync_tags(pte_t *ptep, pte_t pte);
 void mte_copy_page_tags(void *kto, const void *kfrom);
-void flush_mte_state(void);
+void mte_thread_init_user(void);
 void mte_thread_switch(struct task_struct *next);
 void mte_suspend_enter(void);
 void mte_suspend_exit(void);
@@ -61,7 +61,7 @@ static inline void mte_sync_tags(pte_t *ptep, pte_t pte)
 static inline void mte_copy_page_tags(void *kto, const void *kfrom)
 {
 }
-static inline void flush_mte_state(void)
+static inline void mte_thread_init_user(void)
 {
 }
 static inline void mte_thread_switch(struct task_struct *next)
diff --git a/arch/arm64/include/asm/pointer_auth.h b/arch/arm64/include/asm/pointer_auth.h
index b112a11e9302..cefe7e9b6fa9 100644
--- a/arch/arm64/include/asm/pointer_auth.h
+++ b/arch/arm64/include/asm/pointer_auth.h
@@ -3,6 +3,7 @@
 #define __ASM_POINTER_AUTH_H
 
 #include <linux/bitops.h>
+#include <linux/prctl.h>
 #include <linux/random.h>
 
 #include <asm/cpufeature.h>
@@ -71,6 +72,10 @@ static __always_inline void ptrauth_keys_switch_kernel(struct ptrauth_keys_kerne
 
 extern int ptrauth_prctl_reset_keys(struct task_struct *tsk, unsigned long arg);
 
+extern int ptrauth_set_enabled_keys(struct task_struct *tsk, unsigned long keys,
+				    unsigned long enabled);
+extern int ptrauth_get_enabled_keys(struct task_struct *tsk);
+
 static inline unsigned long ptrauth_strip_insn_pac(unsigned long ptr)
 {
 	return ptrauth_clear_pac(ptr);
@@ -85,8 +90,17 @@ static __always_inline void ptrauth_enable(void)
 	isb();
 }
 
-#define ptrauth_thread_init_user(tsk)					\
-	ptrauth_keys_init_user(&(tsk)->thread.keys_user)
+#define ptrauth_thread_init_user()                                             \
+	do {                                                                   \
+		ptrauth_keys_init_user(&current->thread.keys_user);            \
+									       \
+		/* enable all keys */                                          \
+		if (system_supports_address_auth())                            \
+			set_task_sctlr_el1(current->thread.sctlr_user |        \
+					   SCTLR_ELx_ENIA | SCTLR_ELx_ENIB |   \
+					   SCTLR_ELx_ENDA | SCTLR_ELx_ENDB);   \
+	} while (0)
+
 #define ptrauth_thread_init_kernel(tsk)					\
 	ptrauth_keys_init_kernel(&(tsk)->thread.keys_kernel)
 #define ptrauth_thread_switch_kernel(tsk)				\
@@ -95,10 +109,15 @@ static __always_inline void ptrauth_enable(void)
 #else /* CONFIG_ARM64_PTR_AUTH */
 #define ptrauth_enable()
 #define ptrauth_prctl_reset_keys(tsk, arg)	(-EINVAL)
+#define ptrauth_set_enabled_keys(tsk, keys, enabled)	(-EINVAL)
+#define ptrauth_get_enabled_keys(tsk)	(-EINVAL)
 #define ptrauth_strip_insn_pac(lr)	(lr)
-#define ptrauth_thread_init_user(tsk)
+#define ptrauth_thread_init_user()
 #define ptrauth_thread_init_kernel(tsk)
 #define ptrauth_thread_switch_kernel(tsk)
 #endif /* CONFIG_ARM64_PTR_AUTH */
 
+#define PR_PAC_ENABLED_KEYS_MASK                                               \
+	(PR_PAC_APIAKEY | PR_PAC_APIBKEY | PR_PAC_APDAKEY | PR_PAC_APDBKEY)
+
 #endif /* __ASM_POINTER_AUTH_H */
diff --git a/arch/arm64/include/asm/processor.h b/arch/arm64/include/asm/processor.h
index 80895bb30490..2f21c76324bb 100644
--- a/arch/arm64/include/asm/processor.h
+++ b/arch/arm64/include/asm/processor.h
@@ -156,7 +156,9 @@ struct thread_struct {
 	u64			sctlr_user;
 };
 
-#define SCTLR_USER_MASK SCTLR_EL1_TCF0_MASK
+#define SCTLR_USER_MASK                                                        \
+	(SCTLR_ELx_ENIA | SCTLR_ELx_ENIB | SCTLR_ELx_ENDA | SCTLR_ELx_ENDB |   \
+	 SCTLR_EL1_TCF0_MASK)
 
 static inline void arch_thread_struct_whitelist(unsigned long *offset,
 						unsigned long *size)
@@ -305,6 +307,11 @@ extern void __init minsigstksz_setup(void);
 /* PR_PAC_RESET_KEYS prctl */
 #define PAC_RESET_KEYS(tsk, arg)	ptrauth_prctl_reset_keys(tsk, arg)
 
+/* PR_PAC_{SET,GET}_ENABLED_KEYS prctl */
+#define PAC_SET_ENABLED_KEYS(tsk, keys, enabled)				\
+	ptrauth_set_enabled_keys(tsk, keys, enabled)
+#define PAC_GET_ENABLED_KEYS(tsk) ptrauth_get_enabled_keys(tsk)
+
 #ifdef CONFIG_ARM64_TAGGED_ADDR_ABI
 /* PR_{SET,GET}_TAGGED_ADDR_CTRL prctl */
 long set_tagged_addr_ctrl(struct task_struct *task, unsigned long arg);
diff --git a/arch/arm64/include/asm/sysreg.h b/arch/arm64/include/asm/sysreg.h
index d4a5fca984c3..9dc2e88b10c6 100644
--- a/arch/arm64/include/asm/sysreg.h
+++ b/arch/arm64/include/asm/sysreg.h
@@ -565,8 +565,10 @@
 #define SCTLR_ELx_TCF_ASYNC	(UL(0x2) << SCTLR_ELx_TCF_SHIFT)
 #define SCTLR_ELx_TCF_MASK	(UL(0x3) << SCTLR_ELx_TCF_SHIFT)
 
+#define SCTLR_ELx_ENIA_SHIFT	31
+
 #define SCTLR_ELx_ITFSB	(BIT(37))
-#define SCTLR_ELx_ENIA	(BIT(31))
+#define SCTLR_ELx_ENIA	(BIT(SCTLR_ELx_ENIA_SHIFT))
 #define SCTLR_ELx_ENIB	(BIT(30))
 #define SCTLR_ELx_ENDA	(BIT(27))
 #define SCTLR_ELx_EE    (BIT(25))
diff --git a/arch/arm64/kernel/asm-offsets.c b/arch/arm64/kernel/asm-offsets.c
index a36e2fc330d4..8b4fbad427b3 100644
--- a/arch/arm64/kernel/asm-offsets.c
+++ b/arch/arm64/kernel/asm-offsets.c
@@ -43,6 +43,7 @@ int main(void)
 #endif
   BLANK();
   DEFINE(THREAD_CPU_CONTEXT,	offsetof(struct task_struct, thread.cpu_context));
+  DEFINE(THREAD_SCTLR_USER,	offsetof(struct task_struct, thread.sctlr_user));
 #ifdef CONFIG_ARM64_PTR_AUTH
   DEFINE(THREAD_KEYS_USER,	offsetof(struct task_struct, thread.keys_user));
   DEFINE(THREAD_KEYS_KERNEL,	offsetof(struct task_struct, thread.keys_kernel));
diff --git a/arch/arm64/kernel/entry.S b/arch/arm64/kernel/entry.S
index a31a0a713c85..36ae1cdfaeb6 100644
--- a/arch/arm64/kernel/entry.S
+++ b/arch/arm64/kernel/entry.S
@@ -247,7 +247,24 @@ alternative_else_nop_endif
 	check_mte_async_tcf x19, x22
 	apply_ssbd 1, x22, x23
 
-	ptrauth_keys_install_kernel tsk, x20, x22, x23
+	ptrauth_keys_install_kernel_nosync tsk, x20, x22, x23
+
+#ifdef CONFIG_ARM64_PTR_AUTH
+alternative_if ARM64_HAS_ADDRESS_AUTH
+	/*
+	 * Enable IA for in-kernel PAC if the task had it disabled. Although
+	 * this could be implemented with an unconditional MRS which would avoid
+	 * a load, this was measured to be slower on Cortex-A75 and Cortex-A76.
+	 */
+	ldr	x0, [tsk, THREAD_SCTLR_USER]
+	tbnz	x0, SCTLR_ELx_ENIA_SHIFT, 1f
+	mrs	x0, sctlr_el1
+	orr	x0, x0, SCTLR_ELx_ENIA
+	msr	sctlr_el1, x0
+1:
+	isb
+alternative_else_nop_endif
+#endif
 
 	mte_set_kernel_gcr x22, x23
 
@@ -351,9 +368,27 @@ alternative_else_nop_endif
 3:
 	scs_save tsk, x0
 
-	/* No kernel C function calls after this as user keys are set. */
+	/*
+	 * No kernel C function calls after this as user keys are set and IA may
+	 * be disabled.
+	 */
 	ptrauth_keys_install_user tsk, x0, x1, x2
 
+#ifdef CONFIG_ARM64_PTR_AUTH
+alternative_if ARM64_HAS_ADDRESS_AUTH
+	/*
+	 * IA was enabled for in-kernel PAC. Disable it now if needed.
+	 * All other per-task SCTLR bits were updated on task switch.
+	 */
+	ldr	x0, [tsk, THREAD_SCTLR_USER]
+	tbnz	x0, SCTLR_ELx_ENIA_SHIFT, 1f
+	mrs	x0, sctlr_el1
+	bic	x0, x0, SCTLR_ELx_ENIA
+	msr	sctlr_el1, x0
+1:
+alternative_else_nop_endif
+#endif
+
 	mte_set_user_gcr tsk, x0, x1
 
 	apply_ssbd 0, x0, x1
diff --git a/arch/arm64/kernel/mte.c b/arch/arm64/kernel/mte.c
index 8da597e41795..125a10e413e9 100644
--- a/arch/arm64/kernel/mte.c
+++ b/arch/arm64/kernel/mte.c
@@ -207,7 +207,7 @@ static void set_gcr_el1_excl(u64 excl)
 	 */
 }
 
-void flush_mte_state(void)
+void mte_thread_init_user(void)
 {
 	if (!system_supports_mte())
 		return;
diff --git a/arch/arm64/kernel/pointer_auth.c b/arch/arm64/kernel/pointer_auth.c
index adb955fd9bdd..f03e5bfe4490 100644
--- a/arch/arm64/kernel/pointer_auth.c
+++ b/arch/arm64/kernel/pointer_auth.c
@@ -46,3 +46,65 @@ int ptrauth_prctl_reset_keys(struct task_struct *tsk, unsigned long arg)
 
 	return 0;
 }
+
+static u64 arg_to_enxx_mask(unsigned long arg)
+{
+	u64 sctlr_enxx_mask = 0;
+
+	WARN_ON(arg & ~PR_PAC_ENABLED_KEYS_MASK);
+	if (arg & PR_PAC_APIAKEY)
+		sctlr_enxx_mask |= SCTLR_ELx_ENIA;
+	if (arg & PR_PAC_APIBKEY)
+		sctlr_enxx_mask |= SCTLR_ELx_ENIB;
+	if (arg & PR_PAC_APDAKEY)
+		sctlr_enxx_mask |= SCTLR_ELx_ENDA;
+	if (arg & PR_PAC_APDBKEY)
+		sctlr_enxx_mask |= SCTLR_ELx_ENDB;
+	return sctlr_enxx_mask;
+}
+
+int ptrauth_set_enabled_keys(struct task_struct *tsk, unsigned long keys,
+			     unsigned long enabled)
+{
+	u64 sctlr = tsk->thread.sctlr_user;
+
+	if (!system_supports_address_auth())
+		return -EINVAL;
+
+	if (is_compat_thread(task_thread_info(tsk)))
+		return -EINVAL;
+
+	if ((keys & ~PR_PAC_ENABLED_KEYS_MASK) || (enabled & ~keys))
+		return -EINVAL;
+
+	sctlr &= ~arg_to_enxx_mask(keys);
+	sctlr |= arg_to_enxx_mask(enabled);
+	if (tsk == current)
+		set_task_sctlr_el1(sctlr);
+	else
+		tsk->thread.sctlr_user = sctlr;
+
+	return 0;
+}
+
+int ptrauth_get_enabled_keys(struct task_struct *tsk)
+{
+	int retval = 0;
+
+	if (!system_supports_address_auth())
+		return -EINVAL;
+
+	if (is_compat_thread(task_thread_info(tsk)))
+		return -EINVAL;
+
+	if (tsk->thread.sctlr_user & SCTLR_ELx_ENIA)
+		retval |= PR_PAC_APIAKEY;
+	if (tsk->thread.sctlr_user & SCTLR_ELx_ENIB)
+		retval |= PR_PAC_APIBKEY;
+	if (tsk->thread.sctlr_user & SCTLR_ELx_ENDA)
+		retval |= PR_PAC_APDAKEY;
+	if (tsk->thread.sctlr_user & SCTLR_ELx_ENDB)
+		retval |= PR_PAC_APDBKEY;
+
+	return retval;
+}
diff --git a/arch/arm64/kernel/process.c b/arch/arm64/kernel/process.c
index de0ab08cb260..d583cbae32f6 100644
--- a/arch/arm64/kernel/process.c
+++ b/arch/arm64/kernel/process.c
@@ -339,7 +339,6 @@ void flush_thread(void)
 	tls_thread_flush();
 	flush_ptrace_hw_breakpoint(current);
 	flush_tagged_addr_state();
-	flush_mte_state();
 }
 
 void release_thread(struct task_struct *dead_task)
@@ -531,7 +530,11 @@ static void erratum_1418040_thread_switch(struct task_struct *prev,
 
 static void update_sctlr_el1(u64 sctlr)
 {
-	sysreg_clear_set(sctlr_el1, SCTLR_USER_MASK, sctlr);
+	/*
+	 * EnIA must not be cleared while in the kernel as this is necessary for
+	 * in-kernel PAC. It will be cleared on kernel exit if needed.
+	 */
+	sysreg_clear_set(sctlr_el1, SCTLR_USER_MASK & ~SCTLR_ELx_ENIA, sctlr);
 
 	/* ISB required for the kernel uaccess routines when setting TCF0. */
 	isb();
@@ -632,7 +635,8 @@ void arch_setup_new_exec(void)
 {
 	current->mm->context.flags = is_compat_task() ? MMCF_AARCH32 : 0;
 
-	ptrauth_thread_init_user(current);
+	ptrauth_thread_init_user();
+	mte_thread_init_user();
 
 	if (task_spec_ssb_noexec(current)) {
 		arch_prctl_spec_ctrl_set(current, PR_SPEC_STORE_BYPASS,
diff --git a/arch/arm64/kernel/ptrace.c b/arch/arm64/kernel/ptrace.c
index 170f42fd6101..eb2f73939b7b 100644
--- a/arch/arm64/kernel/ptrace.c
+++ b/arch/arm64/kernel/ptrace.c
@@ -909,6 +909,38 @@ static int pac_mask_get(struct task_struct *target,
 	return membuf_write(&to, &uregs, sizeof(uregs));
 }
 
+static int pac_enabled_keys_get(struct task_struct *target,
+				const struct user_regset *regset,
+				struct membuf to)
+{
+	long enabled_keys = ptrauth_get_enabled_keys(target);
+
+	if (IS_ERR_VALUE(enabled_keys))
+		return enabled_keys;
+
+	return membuf_write(&to, &enabled_keys, sizeof(enabled_keys));
+}
+
+static int pac_enabled_keys_set(struct task_struct *target,
+				const struct user_regset *regset,
+				unsigned int pos, unsigned int count,
+				const void *kbuf, const void __user *ubuf)
+{
+	int ret;
+	long enabled_keys = ptrauth_get_enabled_keys(target);
+
+	if (IS_ERR_VALUE(enabled_keys))
+		return enabled_keys;
+
+	ret = user_regset_copyin(&pos, &count, &kbuf, &ubuf, &enabled_keys, 0,
+				 sizeof(long));
+	if (ret)
+		return ret;
+
+	return ptrauth_set_enabled_keys(target, PR_PAC_ENABLED_KEYS_MASK,
+					enabled_keys);
+}
+
 #ifdef CONFIG_CHECKPOINT_RESTORE
 static __uint128_t pac_key_to_user(const struct ptrauth_key *key)
 {
@@ -1074,6 +1106,7 @@ enum aarch64_regset {
 #endif
 #ifdef CONFIG_ARM64_PTR_AUTH
 	REGSET_PAC_MASK,
+	REGSET_PAC_ENABLED_KEYS,
 #ifdef CONFIG_CHECKPOINT_RESTORE
 	REGSET_PACA_KEYS,
 	REGSET_PACG_KEYS,
@@ -1160,6 +1193,14 @@ static const struct user_regset aarch64_regsets[] = {
 		.regset_get = pac_mask_get,
 		/* this cannot be set dynamically */
 	},
+	[REGSET_PAC_ENABLED_KEYS] = {
+		.core_note_type = NT_ARM_PAC_ENABLED_KEYS,
+		.n = 1,
+		.size = sizeof(long),
+		.align = sizeof(long),
+		.regset_get = pac_enabled_keys_get,
+		.set = pac_enabled_keys_set,
+	},
 #ifdef CONFIG_CHECKPOINT_RESTORE
 	[REGSET_PACA_KEYS] = {
 		.core_note_type = NT_ARM_PACA_KEYS,
diff --git a/include/uapi/linux/elf.h b/include/uapi/linux/elf.h
index 30f68b42eeb5..61bf4774b8f2 100644
--- a/include/uapi/linux/elf.h
+++ b/include/uapi/linux/elf.h
@@ -426,6 +426,7 @@ typedef struct elf64_shdr {
 #define NT_ARM_PACA_KEYS	0x407	/* ARM pointer authentication address keys */
 #define NT_ARM_PACG_KEYS	0x408	/* ARM pointer authentication generic key */
 #define NT_ARM_TAGGED_ADDR_CTRL	0x409	/* arm64 tagged address control (prctl()) */
+#define NT_ARM_PAC_ENABLED_KEYS	0x40a	/* arm64 ptr auth enabled keys (prctl()) */
 #define NT_ARC_V2	0x600		/* ARCv2 accumulator/extra registers */
 #define NT_VMCOREDD	0x700		/* Vmcore Device Dump Note */
 #define NT_MIPS_DSP	0x800		/* MIPS DSP ASE registers */
diff --git a/include/uapi/linux/prctl.h b/include/uapi/linux/prctl.h
index 667f1aed091c..18a9f59dc067 100644
--- a/include/uapi/linux/prctl.h
+++ b/include/uapi/linux/prctl.h
@@ -255,4 +255,8 @@ struct prctl_mm_map {
 # define SYSCALL_DISPATCH_FILTER_ALLOW	0
 # define SYSCALL_DISPATCH_FILTER_BLOCK	1
 
+/* Set/get enabled arm64 pointer authentication keys */
+#define PR_PAC_SET_ENABLED_KEYS		60
+#define PR_PAC_GET_ENABLED_KEYS		61
+
 #endif /* _LINUX_PRCTL_H */
diff --git a/kernel/sys.c b/kernel/sys.c
index 2e2e3f378d97..3d62c9599dc0 100644
--- a/kernel/sys.c
+++ b/kernel/sys.c
@@ -119,6 +119,12 @@
 #ifndef PAC_RESET_KEYS
 # define PAC_RESET_KEYS(a, b)	(-EINVAL)
 #endif
+#ifndef PAC_SET_ENABLED_KEYS
+# define PAC_SET_ENABLED_KEYS(a, b, c)	(-EINVAL)
+#endif
+#ifndef PAC_GET_ENABLED_KEYS
+# define PAC_GET_ENABLED_KEYS(a)	(-EINVAL)
+#endif
 #ifndef SET_TAGGED_ADDR_CTRL
 # define SET_TAGGED_ADDR_CTRL(a)	(-EINVAL)
 #endif
@@ -2497,6 +2503,16 @@ SYSCALL_DEFINE5(prctl, int, option, unsigned long, arg2, unsigned long, arg3,
 			return -EINVAL;
 		error = PAC_RESET_KEYS(me, arg2);
 		break;
+	case PR_PAC_SET_ENABLED_KEYS:
+		if (arg4 || arg5)
+			return -EINVAL;
+		error = PAC_SET_ENABLED_KEYS(me, arg2, arg3);
+		break;
+	case PR_PAC_GET_ENABLED_KEYS:
+		if (arg2 || arg3 || arg4 || arg5)
+			return -EINVAL;
+		error = PAC_GET_ENABLED_KEYS(me);
+		break;
 	case PR_SET_TAGGED_ADDR_CTRL:
 		if (arg3 || arg4 || arg5)
 			return -EINVAL;
-- 
cgit v1.2.3-71-gd317


From 441e8c66b23e027c00ccebd70df9fd933918eefe Mon Sep 17 00:00:00 2001
From: Toke Høiland-Jørgensen <toke@redhat.com>
Date: Tue, 13 Apr 2021 11:16:06 +0200
Subject: bpf: Return target info when a tracing bpf_link is queried
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

There is currently no way to discover the target of a tracing program
attachment after the fact. Add this information to bpf_link_info and return
it when querying the bpf_link fd.

Signed-off-by: Toke Høiland-Jørgensen <toke@redhat.com>
Signed-off-by: Alexei Starovoitov <ast@kernel.org>
Acked-by: Andrii Nakryiko <andrii@kernel.org>
Link: https://lore.kernel.org/bpf/20210413091607.58945-1-toke@redhat.com
---
 include/linux/bpf_verifier.h   | 9 +++++++++
 include/uapi/linux/bpf.h       | 2 ++
 kernel/bpf/syscall.c           | 3 +++
 tools/include/uapi/linux/bpf.h | 2 ++
 4 files changed, 16 insertions(+)

(limited to 'include/uapi/linux')

diff --git a/include/linux/bpf_verifier.h b/include/linux/bpf_verifier.h
index 51c2ffa3d901..6023a1367853 100644
--- a/include/linux/bpf_verifier.h
+++ b/include/linux/bpf_verifier.h
@@ -487,6 +487,15 @@ static inline u64 bpf_trampoline_compute_key(const struct bpf_prog *tgt_prog,
 		return ((u64)btf_obj_id(btf) << 32) | 0x80000000 | btf_id;
 }
 
+/* unpack the IDs from the key as constructed above */
+static inline void bpf_trampoline_unpack_key(u64 key, u32 *obj_id, u32 *btf_id)
+{
+	if (obj_id)
+		*obj_id = key >> 32;
+	if (btf_id)
+		*btf_id = key & 0x7FFFFFFF;
+}
+
 int bpf_check_attach_target(struct bpf_verifier_log *log,
 			    const struct bpf_prog *prog,
 			    const struct bpf_prog *tgt_prog,
diff --git a/include/uapi/linux/bpf.h b/include/uapi/linux/bpf.h
index 85c924bc21b1..df164a44bb41 100644
--- a/include/uapi/linux/bpf.h
+++ b/include/uapi/linux/bpf.h
@@ -5416,6 +5416,8 @@ struct bpf_link_info {
 		} raw_tracepoint;
 		struct {
 			__u32 attach_type;
+			__u32 target_obj_id; /* prog_id for PROG_EXT, otherwise btf object id */
+			__u32 target_btf_id; /* BTF type id inside the object */
 		} tracing;
 		struct {
 			__u64 cgroup_id;
diff --git a/kernel/bpf/syscall.c b/kernel/bpf/syscall.c
index 6428634da57e..fd495190115e 100644
--- a/kernel/bpf/syscall.c
+++ b/kernel/bpf/syscall.c
@@ -2551,6 +2551,9 @@ static int bpf_tracing_link_fill_link_info(const struct bpf_link *link,
 		container_of(link, struct bpf_tracing_link, link);
 
 	info->tracing.attach_type = tr_link->attach_type;
+	bpf_trampoline_unpack_key(tr_link->trampoline->key,
+				  &info->tracing.target_obj_id,
+				  &info->tracing.target_btf_id);
 
 	return 0;
 }
diff --git a/tools/include/uapi/linux/bpf.h b/tools/include/uapi/linux/bpf.h
index 85c924bc21b1..df164a44bb41 100644
--- a/tools/include/uapi/linux/bpf.h
+++ b/tools/include/uapi/linux/bpf.h
@@ -5416,6 +5416,8 @@ struct bpf_link_info {
 		} raw_tracepoint;
 		struct {
 			__u32 attach_type;
+			__u32 target_obj_id; /* prog_id for PROG_EXT, otherwise btf object id */
+			__u32 target_btf_id; /* BTF type id inside the object */
 		} tracing;
 		struct {
 			__u64 cgroup_id;
-- 
cgit v1.2.3-71-gd317


From 52a4c95f4d24b8bcb50745732f7b9f8513c49c5f Mon Sep 17 00:00:00 2001
From: Vivek Goyal <vgoyal@redhat.com>
Date: Thu, 25 Mar 2021 11:18:22 -0400
Subject: fuse: extend FUSE_SETXATTR request

Fuse client needs to send additional information to file server when it
calls SETXATTR(system.posix_acl_access), so add extra flags field to the
structure.

Signed-off-by: Vivek Goyal <vgoyal@redhat.com>
Signed-off-by: Miklos Szeredi <mszeredi@redhat.com>
---
 fs/fuse/acl.c             | 2 +-
 fs/fuse/fuse_i.h          | 5 ++++-
 fs/fuse/inode.c           | 4 +++-
 fs/fuse/xattr.c           | 9 ++++++---
 include/uapi/linux/fuse.h | 7 +++++++
 5 files changed, 21 insertions(+), 6 deletions(-)

(limited to 'include/uapi/linux')

diff --git a/fs/fuse/acl.c b/fs/fuse/acl.c
index e9c0f916349d..d31260a139d4 100644
--- a/fs/fuse/acl.c
+++ b/fs/fuse/acl.c
@@ -94,7 +94,7 @@ int fuse_set_acl(struct user_namespace *mnt_userns, struct inode *inode,
 			return ret;
 		}
 
-		ret = fuse_setxattr(inode, name, value, size, 0);
+		ret = fuse_setxattr(inode, name, value, size, 0, 0);
 		kfree(value);
 	} else {
 		ret = fuse_removexattr(inode, name);
diff --git a/fs/fuse/fuse_i.h b/fs/fuse/fuse_i.h
index e7775d9c3314..979b680bb47e 100644
--- a/fs/fuse/fuse_i.h
+++ b/fs/fuse/fuse_i.h
@@ -668,6 +668,9 @@ struct fuse_conn {
 	/** Is setxattr not implemented by fs? */
 	unsigned no_setxattr:1;
 
+	/** Does file server support extended setxattr */
+	unsigned setxattr_ext:1;
+
 	/** Is getxattr not implemented by fs? */
 	unsigned no_getxattr:1;
 
@@ -1171,7 +1174,7 @@ void fuse_unlock_inode(struct inode *inode, bool locked);
 bool fuse_lock_inode(struct inode *inode);
 
 int fuse_setxattr(struct inode *inode, const char *name, const void *value,
-		  size_t size, int flags);
+		  size_t size, int flags, unsigned int extra_flags);
 ssize_t fuse_getxattr(struct inode *inode, const char *name, void *value,
 		      size_t size);
 ssize_t fuse_listxattr(struct dentry *entry, char *list, size_t size);
diff --git a/fs/fuse/inode.c b/fs/fuse/inode.c
index b0e18b470e91..06a68cfa76d8 100644
--- a/fs/fuse/inode.c
+++ b/fs/fuse/inode.c
@@ -1052,6 +1052,8 @@ static void process_init_reply(struct fuse_mount *fm, struct fuse_args *args,
 				fc->handle_killpriv_v2 = 1;
 				fm->sb->s_flags |= SB_NOSEC;
 			}
+			if (arg->flags & FUSE_SETXATTR_EXT)
+				fc->setxattr_ext = 1;
 		} else {
 			ra_pages = fc->max_read / PAGE_SIZE;
 			fc->no_lock = 1;
@@ -1095,7 +1097,7 @@ void fuse_send_init(struct fuse_mount *fm)
 		FUSE_PARALLEL_DIROPS | FUSE_HANDLE_KILLPRIV | FUSE_POSIX_ACL |
 		FUSE_ABORT_ERROR | FUSE_MAX_PAGES | FUSE_CACHE_SYMLINKS |
 		FUSE_NO_OPENDIR_SUPPORT | FUSE_EXPLICIT_INVAL_DATA |
-		FUSE_HANDLE_KILLPRIV_V2;
+		FUSE_HANDLE_KILLPRIV_V2 | FUSE_SETXATTR_EXT;
 #ifdef CONFIG_FUSE_DAX
 	if (fm->fc->dax)
 		ia->in.flags |= FUSE_MAP_ALIGNMENT;
diff --git a/fs/fuse/xattr.c b/fs/fuse/xattr.c
index 1a7d7ace54e1..61dfaf7b7d20 100644
--- a/fs/fuse/xattr.c
+++ b/fs/fuse/xattr.c
@@ -12,7 +12,7 @@
 #include <linux/posix_acl_xattr.h>
 
 int fuse_setxattr(struct inode *inode, const char *name, const void *value,
-		  size_t size, int flags)
+		  size_t size, int flags, unsigned int extra_flags)
 {
 	struct fuse_mount *fm = get_fuse_mount(inode);
 	FUSE_ARGS(args);
@@ -25,10 +25,13 @@ int fuse_setxattr(struct inode *inode, const char *name, const void *value,
 	memset(&inarg, 0, sizeof(inarg));
 	inarg.size = size;
 	inarg.flags = flags;
+	inarg.setxattr_flags = extra_flags;
+
 	args.opcode = FUSE_SETXATTR;
 	args.nodeid = get_node_id(inode);
 	args.in_numargs = 3;
-	args.in_args[0].size = sizeof(inarg);
+	args.in_args[0].size = fm->fc->setxattr_ext ?
+		sizeof(inarg) : FUSE_COMPAT_SETXATTR_IN_SIZE;
 	args.in_args[0].value = &inarg;
 	args.in_args[1].size = strlen(name) + 1;
 	args.in_args[1].value = name;
@@ -199,7 +202,7 @@ static int fuse_xattr_set(const struct xattr_handler *handler,
 	if (!value)
 		return fuse_removexattr(inode, name);
 
-	return fuse_setxattr(inode, name, value, size, flags);
+	return fuse_setxattr(inode, name, value, size, flags, 0);
 }
 
 static bool no_xattr_list(struct dentry *dentry)
diff --git a/include/uapi/linux/fuse.h b/include/uapi/linux/fuse.h
index 54442612c48b..390f7b4c889d 100644
--- a/include/uapi/linux/fuse.h
+++ b/include/uapi/linux/fuse.h
@@ -179,6 +179,7 @@
  *  7.33
  *  - add FUSE_HANDLE_KILLPRIV_V2, FUSE_WRITE_KILL_SUIDGID, FATTR_KILL_SUIDGID
  *  - add FUSE_OPEN_KILL_SUIDGID
+ *  - extend fuse_setxattr_in, add FUSE_SETXATTR_EXT
  */
 
 #ifndef _LINUX_FUSE_H
@@ -330,6 +331,7 @@ struct fuse_file_lock {
  *			does not have CAP_FSETID. Additionally upon
  *			write/truncate sgid is killed only if file has group
  *			execute permission. (Same as Linux VFS behavior).
+ * FUSE_SETXATTR_EXT:	Server supports extended struct fuse_setxattr_in
  */
 #define FUSE_ASYNC_READ		(1 << 0)
 #define FUSE_POSIX_LOCKS	(1 << 1)
@@ -360,6 +362,7 @@ struct fuse_file_lock {
 #define FUSE_MAP_ALIGNMENT	(1 << 26)
 #define FUSE_SUBMOUNTS		(1 << 27)
 #define FUSE_HANDLE_KILLPRIV_V2	(1 << 28)
+#define FUSE_SETXATTR_EXT	(1 << 29)
 
 /**
  * CUSE INIT request/reply flags
@@ -681,9 +684,13 @@ struct fuse_fsync_in {
 	uint32_t	padding;
 };
 
+#define FUSE_COMPAT_SETXATTR_IN_SIZE 8
+
 struct fuse_setxattr_in {
 	uint32_t	size;
 	uint32_t	flags;
+	uint32_t	setxattr_flags;
+	uint32_t	padding;
 };
 
 struct fuse_getxattr_in {
-- 
cgit v1.2.3-71-gd317


From 550a7d3bc0c4049ef8d36ff4d9ed7082ee8cb5ec Mon Sep 17 00:00:00 2001
From: Vivek Goyal <vgoyal@redhat.com>
Date: Thu, 25 Mar 2021 11:18:23 -0400
Subject: fuse: add a flag FUSE_SETXATTR_ACL_KILL_SGID to kill SGID

When posix access ACL is set, it can have an effect on file mode and it can
also need to clear SGID if.

- None of caller's group/supplementary groups match file owner group.
AND
- Caller is not priviliged (No CAP_FSETID).

As of now fuser server is responsible for changing the file mode as
well. But it does not know whether to clear SGID or not.

So add a flag FUSE_SETXATTR_ACL_KILL_SGID and send this info with SETXATTR
to let file server know that sgid needs to be cleared as well.

Reported-by: Luis Henriques <lhenriques@suse.de>
Signed-off-by: Vivek Goyal <vgoyal@redhat.com>
Signed-off-by: Miklos Szeredi <mszeredi@redhat.com>
---
 fs/fuse/acl.c             | 7 ++++++-
 include/uapi/linux/fuse.h | 7 +++++++
 2 files changed, 13 insertions(+), 1 deletion(-)

(limited to 'include/uapi/linux')

diff --git a/fs/fuse/acl.c b/fs/fuse/acl.c
index d31260a139d4..52b165319be1 100644
--- a/fs/fuse/acl.c
+++ b/fs/fuse/acl.c
@@ -71,6 +71,7 @@ int fuse_set_acl(struct user_namespace *mnt_userns, struct inode *inode,
 		return -EINVAL;
 
 	if (acl) {
+		unsigned int extra_flags = 0;
 		/*
 		 * Fuse userspace is responsible for updating access
 		 * permissions in the inode, if needed. fuse_setxattr
@@ -94,7 +95,11 @@ int fuse_set_acl(struct user_namespace *mnt_userns, struct inode *inode,
 			return ret;
 		}
 
-		ret = fuse_setxattr(inode, name, value, size, 0, 0);
+		if (!in_group_p(i_gid_into_mnt(&init_user_ns, inode)) &&
+		    !capable_wrt_inode_uidgid(&init_user_ns, inode, CAP_FSETID))
+			extra_flags |= FUSE_SETXATTR_ACL_KILL_SGID;
+
+		ret = fuse_setxattr(inode, name, value, size, 0, extra_flags);
 		kfree(value);
 	} else {
 		ret = fuse_removexattr(inode, name);
diff --git a/include/uapi/linux/fuse.h b/include/uapi/linux/fuse.h
index 390f7b4c889d..271ae90a9bb7 100644
--- a/include/uapi/linux/fuse.h
+++ b/include/uapi/linux/fuse.h
@@ -180,6 +180,7 @@
  *  - add FUSE_HANDLE_KILLPRIV_V2, FUSE_WRITE_KILL_SUIDGID, FATTR_KILL_SUIDGID
  *  - add FUSE_OPEN_KILL_SUIDGID
  *  - extend fuse_setxattr_in, add FUSE_SETXATTR_EXT
+ *  - add FUSE_SETXATTR_ACL_KILL_SGID
  */
 
 #ifndef _LINUX_FUSE_H
@@ -454,6 +455,12 @@ struct fuse_file_lock {
  */
 #define FUSE_OPEN_KILL_SUIDGID	(1 << 0)
 
+/**
+ * setxattr flags
+ * FUSE_SETXATTR_ACL_KILL_SGID: Clear SGID when system.posix_acl_access is set
+ */
+#define FUSE_SETXATTR_ACL_KILL_SGID	(1 << 0)
+
 enum fuse_opcode {
 	FUSE_LOOKUP		= 1,
 	FUSE_FORGET		= 2,  /* no reply */
-- 
cgit v1.2.3-71-gd317


From be85dbfeb37c8c4d4344da2ee594d78034b82489 Mon Sep 17 00:00:00 2001
From: Jakub Kicinski <kuba@kernel.org>
Date: Thu, 15 Apr 2021 15:53:15 -0700
Subject: ethtool: add FEC statistics

Similarly to pause statistics add stats for FEC.

The IEEE standard mandates two sets of counters:
 - 30.5.1.1.17 aFECCorrectedBlocks
 - 30.5.1.1.18 aFECUncorrectableBlocks
where block is a block of bits FEC operates on.
Each of these counters is defined per lane (PCS instance).

Multiple vendors provide number of corrected _bits_ rather
than/as well as blocks.

This set adds the 2 standard-based block counters and a extra
one for corrected bits.

Counters are exposed to user space via netlink in new attributes.
Each attribute carries an array of u64s, first element is
the total count, and the following ones are a per-lane break down.

Much like with pause stats the operation will not fail when driver
does not implement the get_fec_stats callback (nor can the driver
fail the operation by returning an error). If stats can't be
reported the relevant attributes will be empty.

Signed-off-by: Jakub Kicinski <kuba@kernel.org>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 Documentation/networking/ethtool-netlink.rst | 21 ++++++++
 Documentation/networking/statistics.rst      |  2 +
 include/linux/ethtool.h                      | 40 +++++++++++++++
 include/uapi/linux/ethtool_netlink.h         | 14 ++++++
 net/ethtool/fec.c                            | 73 +++++++++++++++++++++++++++-
 5 files changed, 149 insertions(+), 1 deletion(-)

(limited to 'include/uapi/linux')

diff --git a/Documentation/networking/ethtool-netlink.rst b/Documentation/networking/ethtool-netlink.rst
index bbecffc7b11a..f8219e2f489e 100644
--- a/Documentation/networking/ethtool-netlink.rst
+++ b/Documentation/networking/ethtool-netlink.rst
@@ -1302,6 +1302,7 @@ Kernel response contents:
   ``ETHTOOL_A_FEC_MODES``                bitset  configured modes
   ``ETHTOOL_A_FEC_AUTO``                 bool    FEC mode auto selection
   ``ETHTOOL_A_FEC_ACTIVE``               u32     index of active FEC mode
+  ``ETHTOOL_A_FEC_STATS``                nested  FEC statistics
   =====================================  ======  ==========================
 
 ``ETHTOOL_A_FEC_ACTIVE`` is the bit index of the FEC link mode currently
@@ -1315,6 +1316,26 @@ This is equivalent to the ``ETHTOOL_FEC_AUTO`` bit of the ioctl interface.
 ``ETHTOOL_A_FEC_MODES`` carry the current FEC configuration using link mode
 bits (rather than old ``ETHTOOL_FEC_*`` bits).
 
+``ETHTOOL_A_FEC_STATS`` are reported if ``ETHTOOL_FLAG_STATS`` was set in
+``ETHTOOL_A_HEADER_FLAGS``.
+Each attribute carries an array of 64bit statistics. First entry in the array
+contains the total number of events on the port, while the following entries
+are counters corresponding to lanes/PCS instances. The number of entries in
+the array will be:
+
++--------------+---------------------------------------------+
+| `0`          | device does not support FEC statistics      |
++--------------+---------------------------------------------+
+| `1`          | device does not support per-lane break down |
++--------------+---------------------------------------------+
+| `1 + #lanes` | device has full support for FEC stats       |
++--------------+---------------------------------------------+
+
+Drivers fill in the statistics in the following structure:
+
+.. kernel-doc:: include/linux/ethtool.h
+    :identifiers: ethtool_fec_stats
+
 FEC_SET
 =======
 
diff --git a/Documentation/networking/statistics.rst b/Documentation/networking/statistics.rst
index 234abedc29b2..b748fe44ee02 100644
--- a/Documentation/networking/statistics.rst
+++ b/Documentation/networking/statistics.rst
@@ -130,6 +130,7 @@ the `ETHTOOL_FLAG_STATS` flag in `ETHTOOL_A_HEADER_FLAGS`. Currently
 statistics are supported in the following commands:
 
   - `ETHTOOL_MSG_PAUSE_GET`
+  - `ETHTOOL_MSG_FEC_GET`
 
 debugfs
 -------
@@ -176,3 +177,4 @@ translated to netlink attributes when dumped. Drivers must not overwrite
 the statistics they don't report with 0.
 
 - ethtool_pause_stats()
+- ethtool_fec_stats()
diff --git a/include/linux/ethtool.h b/include/linux/ethtool.h
index 069100b252bd..112a85b57f1f 100644
--- a/include/linux/ethtool.h
+++ b/include/linux/ethtool.h
@@ -269,6 +269,39 @@ struct ethtool_pause_stats {
 	u64 rx_pause_frames;
 };
 
+#define ETHTOOL_MAX_LANES	8
+
+/**
+ * struct ethtool_fec_stats - statistics for IEEE 802.3 FEC
+ * @corrected_blocks: number of received blocks corrected by FEC
+ *	Reported to user space as %ETHTOOL_A_FEC_STAT_CORRECTED.
+ *
+ *	Equivalent to `30.5.1.1.17 aFECCorrectedBlocks` from the standard.
+ *
+ * @uncorrectable_blocks: number of received blocks FEC was not able to correct
+ *	Reported to user space as %ETHTOOL_A_FEC_STAT_UNCORR.
+ *
+ *	Equivalent to `30.5.1.1.18 aFECUncorrectableBlocks` from the standard.
+ *
+ * @corrected_bits: number of bits corrected by FEC
+ *	Similar to @corrected_blocks but counts individual bit changes,
+ *	not entire FEC data blocks. This is a non-standard statistic.
+ *	Reported to user space as %ETHTOOL_A_FEC_STAT_CORR_BITS.
+ *
+ * @lane: per-lane/PCS-instance counts as defined by the standard
+ * @total: error counts for the entire port, for drivers incapable of reporting
+ *	per-lane stats
+ *
+ * Drivers should fill in either only total or per-lane statistics, core
+ * will take care of adding lane values up to produce the total.
+ */
+struct ethtool_fec_stats {
+	struct ethtool_fec_stat {
+		u64 total;
+		u64 lanes[ETHTOOL_MAX_LANES];
+	} corrected_blocks, uncorrectable_blocks, corrected_bits;
+};
+
 #define ETH_MODULE_EEPROM_PAGE_LEN	128
 #define ETH_MODULE_MAX_I2C_ADDRESS	0x7f
 
@@ -439,6 +472,11 @@ struct ethtool_module_eeprom {
  *	ignored (use %__ETHTOOL_LINK_MODE_MASK_NBITS instead of the latter),
  *	any change to them will be overwritten by kernel. Returns a negative
  *	error code or zero.
+ * @get_fec_stats: Report FEC statistics.
+ *	Core will sum up per-lane stats to get the total.
+ *	Drivers must not zero statistics which they don't report. The stats
+ *	structure is initialized to ETHTOOL_STAT_NOT_SET indicating driver does
+ *	not report statistics.
  * @get_fecparam: Get the network device Forward Error Correction parameters.
  * @set_fecparam: Set the network device Forward Error Correction parameters.
  * @get_ethtool_phy_stats: Return extended statistics about the PHY device.
@@ -544,6 +582,8 @@ struct ethtool_ops {
 				      struct ethtool_link_ksettings *);
 	int	(*set_link_ksettings)(struct net_device *,
 				      const struct ethtool_link_ksettings *);
+	void	(*get_fec_stats)(struct net_device *dev,
+				 struct ethtool_fec_stats *fec_stats);
 	int	(*get_fecparam)(struct net_device *,
 				      struct ethtool_fecparam *);
 	int	(*set_fecparam)(struct net_device *,
diff --git a/include/uapi/linux/ethtool_netlink.h b/include/uapi/linux/ethtool_netlink.h
index 9612dcd48a6a..3a2b31ccbc5b 100644
--- a/include/uapi/linux/ethtool_netlink.h
+++ b/include/uapi/linux/ethtool_netlink.h
@@ -643,11 +643,25 @@ enum {
 	ETHTOOL_A_FEC_MODES,				/* bitset */
 	ETHTOOL_A_FEC_AUTO,				/* u8 */
 	ETHTOOL_A_FEC_ACTIVE,				/* u32 */
+	ETHTOOL_A_FEC_STATS,				/* nest - _A_FEC_STAT */
 
 	__ETHTOOL_A_FEC_CNT,
 	ETHTOOL_A_FEC_MAX = (__ETHTOOL_A_FEC_CNT - 1)
 };
 
+enum {
+	ETHTOOL_A_FEC_STAT_UNSPEC,
+	ETHTOOL_A_FEC_STAT_PAD,
+
+	ETHTOOL_A_FEC_STAT_CORRECTED,			/* array, u64 */
+	ETHTOOL_A_FEC_STAT_UNCORR,			/* array, u64 */
+	ETHTOOL_A_FEC_STAT_CORR_BITS,			/* array, u64 */
+
+	/* add new constants above here */
+	__ETHTOOL_A_FEC_STAT_CNT,
+	ETHTOOL_A_FEC_STAT_MAX = (__ETHTOOL_A_FEC_STAT_CNT - 1)
+};
+
 /* MODULE EEPROM */
 
 enum {
diff --git a/net/ethtool/fec.c b/net/ethtool/fec.c
index 3e7d091ee7aa..8738dafd5417 100644
--- a/net/ethtool/fec.c
+++ b/net/ethtool/fec.c
@@ -13,6 +13,10 @@ struct fec_reply_data {
 	__ETHTOOL_DECLARE_LINK_MODE_MASK(fec_link_modes);
 	u32 active_fec;
 	u8 fec_auto;
+	struct fec_stat_grp {
+		u64 stats[1 + ETHTOOL_MAX_LANES];
+		u8 cnt;
+	} corr, uncorr, corr_bits;
 };
 
 #define FEC_REPDATA(__reply_base) \
@@ -21,7 +25,7 @@ struct fec_reply_data {
 #define ETHTOOL_FEC_MASK	((ETHTOOL_FEC_LLRS << 1) - 1)
 
 const struct nla_policy ethnl_fec_get_policy[ETHTOOL_A_FEC_HEADER + 1] = {
-	[ETHTOOL_A_FEC_HEADER]	= NLA_POLICY_NESTED(ethnl_header_policy),
+	[ETHTOOL_A_FEC_HEADER]	= NLA_POLICY_NESTED(ethnl_header_policy_stats),
 };
 
 static void
@@ -64,6 +68,28 @@ ethtool_link_modes_to_fecparam(struct ethtool_fecparam *fec,
 	return 0;
 }
 
+static void
+fec_stats_recalc(struct fec_stat_grp *grp, struct ethtool_fec_stat *stats)
+{
+	int i;
+
+	if (stats->lanes[0] == ETHTOOL_STAT_NOT_SET) {
+		grp->stats[0] = stats->total;
+		grp->cnt = stats->total != ETHTOOL_STAT_NOT_SET;
+		return;
+	}
+
+	grp->cnt = 1;
+	grp->stats[0] = 0;
+	for (i = 0; i < ETHTOOL_MAX_LANES; i++) {
+		if (stats->lanes[i] == ETHTOOL_STAT_NOT_SET)
+			break;
+
+		grp->stats[0] += stats->lanes[i];
+		grp->stats[grp->cnt++] = stats->lanes[i];
+	}
+}
+
 static int fec_prepare_data(const struct ethnl_req_info *req_base,
 			    struct ethnl_reply_data *reply_base,
 			    struct genl_info *info)
@@ -82,6 +108,17 @@ static int fec_prepare_data(const struct ethnl_req_info *req_base,
 	ret = dev->ethtool_ops->get_fecparam(dev, &fec);
 	if (ret)
 		goto out_complete;
+	if (req_base->flags & ETHTOOL_FLAG_STATS &&
+	    dev->ethtool_ops->get_fec_stats) {
+		struct ethtool_fec_stats stats;
+
+		ethtool_stats_init((u64 *)&stats, sizeof(stats) / 8);
+		dev->ethtool_ops->get_fec_stats(dev, &stats);
+
+		fec_stats_recalc(&data->corr, &stats.corrected_blocks);
+		fec_stats_recalc(&data->uncorr, &stats.uncorrectable_blocks);
+		fec_stats_recalc(&data->corr_bits, &stats.corrected_bits);
+	}
 
 	WARN_ON_ONCE(fec.reserved);
 
@@ -120,9 +157,40 @@ static int fec_reply_size(const struct ethnl_req_info *req_base,
 	len += nla_total_size(sizeof(u8)) +	/* _FEC_AUTO */
 	       nla_total_size(sizeof(u32));	/* _FEC_ACTIVE */
 
+	if (req_base->flags & ETHTOOL_FLAG_STATS)
+		len += 3 * nla_total_size_64bit(sizeof(u64) *
+						(1 + ETHTOOL_MAX_LANES));
+
 	return len;
 }
 
+static int fec_put_stats(struct sk_buff *skb, const struct fec_reply_data *data)
+{
+	struct nlattr *nest;
+
+	nest = nla_nest_start(skb, ETHTOOL_A_FEC_STATS);
+	if (!nest)
+		return -EMSGSIZE;
+
+	if (nla_put_64bit(skb, ETHTOOL_A_FEC_STAT_CORRECTED,
+			  sizeof(u64) * data->corr.cnt,
+			  data->corr.stats, ETHTOOL_A_FEC_STAT_PAD) ||
+	    nla_put_64bit(skb, ETHTOOL_A_FEC_STAT_UNCORR,
+			  sizeof(u64) * data->uncorr.cnt,
+			  data->uncorr.stats, ETHTOOL_A_FEC_STAT_PAD) ||
+	    nla_put_64bit(skb, ETHTOOL_A_FEC_STAT_CORR_BITS,
+			  sizeof(u64) * data->corr_bits.cnt,
+			  data->corr_bits.stats, ETHTOOL_A_FEC_STAT_PAD))
+		goto err_cancel;
+
+	nla_nest_end(skb, nest);
+	return 0;
+
+err_cancel:
+	nla_nest_cancel(skb, nest);
+	return -EMSGSIZE;
+}
+
 static int fec_fill_reply(struct sk_buff *skb,
 			  const struct ethnl_req_info *req_base,
 			  const struct ethnl_reply_data *reply_base)
@@ -143,6 +211,9 @@ static int fec_fill_reply(struct sk_buff *skb,
 	     nla_put_u32(skb, ETHTOOL_A_FEC_ACTIVE, data->active_fec)))
 		return -EMSGSIZE;
 
+	if (req_base->flags & ETHTOOL_FLAG_STATS && fec_put_stats(skb, data))
+		return -EMSGSIZE;
+
 	return 0;
 }
 
-- 
cgit v1.2.3-71-gd317


From 2b26f0aa004995f49f7b6f4100dd0e4c39a9ed5f Mon Sep 17 00:00:00 2001
From: Marco Elver <elver@google.com>
Date: Thu, 8 Apr 2021 12:35:58 +0200
Subject: perf: Support only inheriting events if cloned with CLONE_THREAD

Adds bit perf_event_attr::inherit_thread, to restricting inheriting
events only if the child was cloned with CLONE_THREAD.

This option supports the case where an event is supposed to be
process-wide only (including subthreads), but should not propagate
beyond the current process's shared environment.

Suggested-by: Peter Zijlstra <peterz@infradead.org>
Signed-off-by: Marco Elver <elver@google.com>
Signed-off-by: Peter Zijlstra (Intel) <peterz@infradead.org>
Link: https://lore.kernel.org/lkml/YBvj6eJR%2FDY2TsEB@hirez.programming.kicks-ass.net/
---
 include/linux/perf_event.h      |  5 +++--
 include/uapi/linux/perf_event.h |  3 ++-
 kernel/events/core.c            | 21 ++++++++++++++-------
 kernel/fork.c                   |  2 +-
 4 files changed, 20 insertions(+), 11 deletions(-)

(limited to 'include/uapi/linux')

diff --git a/include/linux/perf_event.h b/include/linux/perf_event.h
index 3d478abf411c..1660039199b2 100644
--- a/include/linux/perf_event.h
+++ b/include/linux/perf_event.h
@@ -958,7 +958,7 @@ extern void __perf_event_task_sched_in(struct task_struct *prev,
 				       struct task_struct *task);
 extern void __perf_event_task_sched_out(struct task_struct *prev,
 					struct task_struct *next);
-extern int perf_event_init_task(struct task_struct *child);
+extern int perf_event_init_task(struct task_struct *child, u64 clone_flags);
 extern void perf_event_exit_task(struct task_struct *child);
 extern void perf_event_free_task(struct task_struct *task);
 extern void perf_event_delayed_put(struct task_struct *task);
@@ -1449,7 +1449,8 @@ perf_event_task_sched_in(struct task_struct *prev,
 static inline void
 perf_event_task_sched_out(struct task_struct *prev,
 			  struct task_struct *next)			{ }
-static inline int perf_event_init_task(struct task_struct *child)	{ return 0; }
+static inline int perf_event_init_task(struct task_struct *child,
+				       u64 clone_flags)			{ return 0; }
 static inline void perf_event_exit_task(struct task_struct *child)	{ }
 static inline void perf_event_free_task(struct task_struct *task)	{ }
 static inline void perf_event_delayed_put(struct task_struct *task)	{ }
diff --git a/include/uapi/linux/perf_event.h b/include/uapi/linux/perf_event.h
index ad15e40d7f5d..813efb65fea8 100644
--- a/include/uapi/linux/perf_event.h
+++ b/include/uapi/linux/perf_event.h
@@ -389,7 +389,8 @@ struct perf_event_attr {
 				cgroup         :  1, /* include cgroup events */
 				text_poke      :  1, /* include text poke events */
 				build_id       :  1, /* use build id in mmap2 events */
-				__reserved_1   : 29;
+				inherit_thread :  1, /* children only inherit if cloned with CLONE_THREAD */
+				__reserved_1   : 28;
 
 	union {
 		__u32		wakeup_events;	  /* wakeup every n events */
diff --git a/kernel/events/core.c b/kernel/events/core.c
index 10ed2cd434dc..3e3c00fd0b2e 100644
--- a/kernel/events/core.c
+++ b/kernel/events/core.c
@@ -11653,6 +11653,9 @@ static int perf_copy_attr(struct perf_event_attr __user *uattr,
 	    (attr->sample_type & PERF_SAMPLE_WEIGHT_STRUCT))
 		return -EINVAL;
 
+	if (!attr->inherit && attr->inherit_thread)
+		return -EINVAL;
+
 out:
 	return ret;
 
@@ -12873,12 +12876,13 @@ static int
 inherit_task_group(struct perf_event *event, struct task_struct *parent,
 		   struct perf_event_context *parent_ctx,
 		   struct task_struct *child, int ctxn,
-		   int *inherited_all)
+		   u64 clone_flags, int *inherited_all)
 {
 	int ret;
 	struct perf_event_context *child_ctx;
 
-	if (!event->attr.inherit) {
+	if (!event->attr.inherit ||
+	    (event->attr.inherit_thread && !(clone_flags & CLONE_THREAD))) {
 		*inherited_all = 0;
 		return 0;
 	}
@@ -12910,7 +12914,8 @@ inherit_task_group(struct perf_event *event, struct task_struct *parent,
 /*
  * Initialize the perf_event context in task_struct
  */
-static int perf_event_init_context(struct task_struct *child, int ctxn)
+static int perf_event_init_context(struct task_struct *child, int ctxn,
+				   u64 clone_flags)
 {
 	struct perf_event_context *child_ctx, *parent_ctx;
 	struct perf_event_context *cloned_ctx;
@@ -12950,7 +12955,8 @@ static int perf_event_init_context(struct task_struct *child, int ctxn)
 	 */
 	perf_event_groups_for_each(event, &parent_ctx->pinned_groups) {
 		ret = inherit_task_group(event, parent, parent_ctx,
-					 child, ctxn, &inherited_all);
+					 child, ctxn, clone_flags,
+					 &inherited_all);
 		if (ret)
 			goto out_unlock;
 	}
@@ -12966,7 +12972,8 @@ static int perf_event_init_context(struct task_struct *child, int ctxn)
 
 	perf_event_groups_for_each(event, &parent_ctx->flexible_groups) {
 		ret = inherit_task_group(event, parent, parent_ctx,
-					 child, ctxn, &inherited_all);
+					 child, ctxn, clone_flags,
+					 &inherited_all);
 		if (ret)
 			goto out_unlock;
 	}
@@ -13008,7 +13015,7 @@ out_unlock:
 /*
  * Initialize the perf_event context in task_struct
  */
-int perf_event_init_task(struct task_struct *child)
+int perf_event_init_task(struct task_struct *child, u64 clone_flags)
 {
 	int ctxn, ret;
 
@@ -13017,7 +13024,7 @@ int perf_event_init_task(struct task_struct *child)
 	INIT_LIST_HEAD(&child->perf_event_list);
 
 	for_each_task_context_nr(ctxn) {
-		ret = perf_event_init_context(child, ctxn);
+		ret = perf_event_init_context(child, ctxn, clone_flags);
 		if (ret) {
 			perf_event_free_task(child);
 			return ret;
diff --git a/kernel/fork.c b/kernel/fork.c
index 0acc8ed1076b..3728a645771c 100644
--- a/kernel/fork.c
+++ b/kernel/fork.c
@@ -2078,7 +2078,7 @@ static __latent_entropy struct task_struct *copy_process(
 	if (retval)
 		goto bad_fork_cleanup_policy;
 
-	retval = perf_event_init_task(p);
+	retval = perf_event_init_task(p, clone_flags);
 	if (retval)
 		goto bad_fork_cleanup_policy;
 	retval = audit_alloc(p);
-- 
cgit v1.2.3-71-gd317


From 2e498d0a74e5b88a6689ae1b811f247f91ff188e Mon Sep 17 00:00:00 2001
From: Marco Elver <elver@google.com>
Date: Thu, 8 Apr 2021 12:35:59 +0200
Subject: perf: Add support for event removal on exec

Adds bit perf_event_attr::remove_on_exec, to support removing an event
from a task on exec.

This option supports the case where an event is supposed to be
process-wide only, and should not propagate beyond exec, to limit
monitoring to the original process image only.

Suggested-by: Peter Zijlstra <peterz@infradead.org>
Signed-off-by: Marco Elver <elver@google.com>
Signed-off-by: Peter Zijlstra (Intel) <peterz@infradead.org>
Link: https://lkml.kernel.org/r/20210408103605.1676875-5-elver@google.com
---
 include/uapi/linux/perf_event.h |  3 +-
 kernel/events/core.c            | 70 ++++++++++++++++++++++++++++++++++++-----
 2 files changed, 64 insertions(+), 9 deletions(-)

(limited to 'include/uapi/linux')

diff --git a/include/uapi/linux/perf_event.h b/include/uapi/linux/perf_event.h
index 813efb65fea8..8c5b9f5ad63f 100644
--- a/include/uapi/linux/perf_event.h
+++ b/include/uapi/linux/perf_event.h
@@ -390,7 +390,8 @@ struct perf_event_attr {
 				text_poke      :  1, /* include text poke events */
 				build_id       :  1, /* use build id in mmap2 events */
 				inherit_thread :  1, /* children only inherit if cloned with CLONE_THREAD */
-				__reserved_1   : 28;
+				remove_on_exec :  1, /* event is removed from task on exec */
+				__reserved_1   : 27;
 
 	union {
 		__u32		wakeup_events;	  /* wakeup every n events */
diff --git a/kernel/events/core.c b/kernel/events/core.c
index 3e3c00fd0b2e..e4a584bceb7a 100644
--- a/kernel/events/core.c
+++ b/kernel/events/core.c
@@ -4248,6 +4248,57 @@ out:
 		put_ctx(clone_ctx);
 }
 
+static void perf_remove_from_owner(struct perf_event *event);
+static void perf_event_exit_event(struct perf_event *event,
+				  struct perf_event_context *ctx);
+
+/*
+ * Removes all events from the current task that have been marked
+ * remove-on-exec, and feeds their values back to parent events.
+ */
+static void perf_event_remove_on_exec(int ctxn)
+{
+	struct perf_event_context *ctx, *clone_ctx = NULL;
+	struct perf_event *event, *next;
+	LIST_HEAD(free_list);
+	unsigned long flags;
+	bool modified = false;
+
+	ctx = perf_pin_task_context(current, ctxn);
+	if (!ctx)
+		return;
+
+	mutex_lock(&ctx->mutex);
+
+	if (WARN_ON_ONCE(ctx->task != current))
+		goto unlock;
+
+	list_for_each_entry_safe(event, next, &ctx->event_list, event_entry) {
+		if (!event->attr.remove_on_exec)
+			continue;
+
+		if (!is_kernel_event(event))
+			perf_remove_from_owner(event);
+
+		modified = true;
+
+		perf_event_exit_event(event, ctx);
+	}
+
+	raw_spin_lock_irqsave(&ctx->lock, flags);
+	if (modified)
+		clone_ctx = unclone_ctx(ctx);
+	--ctx->pin_count;
+	raw_spin_unlock_irqrestore(&ctx->lock, flags);
+
+unlock:
+	mutex_unlock(&ctx->mutex);
+
+	put_ctx(ctx);
+	if (clone_ctx)
+		put_ctx(clone_ctx);
+}
+
 struct perf_read_data {
 	struct perf_event *event;
 	bool group;
@@ -7560,18 +7611,18 @@ void perf_event_exec(void)
 	struct perf_event_context *ctx;
 	int ctxn;
 
-	rcu_read_lock();
 	for_each_task_context_nr(ctxn) {
-		ctx = current->perf_event_ctxp[ctxn];
-		if (!ctx)
-			continue;
-
 		perf_event_enable_on_exec(ctxn);
+		perf_event_remove_on_exec(ctxn);
 
-		perf_iterate_ctx(ctx, perf_event_addr_filters_exec, NULL,
-				   true);
+		rcu_read_lock();
+		ctx = rcu_dereference(current->perf_event_ctxp[ctxn]);
+		if (ctx) {
+			perf_iterate_ctx(ctx, perf_event_addr_filters_exec,
+					 NULL, true);
+		}
+		rcu_read_unlock();
 	}
-	rcu_read_unlock();
 }
 
 struct remote_output {
@@ -11656,6 +11707,9 @@ static int perf_copy_attr(struct perf_event_attr __user *uattr,
 	if (!attr->inherit && attr->inherit_thread)
 		return -EINVAL;
 
+	if (attr->remove_on_exec && attr->enable_on_exec)
+		return -EINVAL;
+
 out:
 	return ret;
 
-- 
cgit v1.2.3-71-gd317


From fb6cc127e0b6e629252cdd0f77d5a1f49db95b92 Mon Sep 17 00:00:00 2001
From: Marco Elver <elver@google.com>
Date: Thu, 8 Apr 2021 12:36:00 +0200
Subject: signal: Introduce TRAP_PERF si_code and si_perf to siginfo

Introduces the TRAP_PERF si_code, and associated siginfo_t field
si_perf. These will be used by the perf event subsystem to send signals
(if requested) to the task where an event occurred.

Signed-off-by: Marco Elver <elver@google.com>
Signed-off-by: Peter Zijlstra (Intel) <peterz@infradead.org>
Acked-by: Geert Uytterhoeven <geert@linux-m68k.org> # m68k
Acked-by: Arnd Bergmann <arnd@arndb.de> # asm-generic
Link: https://lkml.kernel.org/r/20210408103605.1676875-6-elver@google.com
---
 arch/m68k/kernel/signal.c          |  3 +++
 arch/x86/kernel/signal_compat.c    |  5 ++++-
 fs/signalfd.c                      |  4 ++++
 include/linux/compat.h             |  2 ++
 include/linux/signal.h             |  1 +
 include/uapi/asm-generic/siginfo.h |  6 +++++-
 include/uapi/linux/signalfd.h      |  4 +++-
 kernel/signal.c                    | 11 +++++++++++
 8 files changed, 33 insertions(+), 3 deletions(-)

(limited to 'include/uapi/linux')

diff --git a/arch/m68k/kernel/signal.c b/arch/m68k/kernel/signal.c
index 349570f16a78..a4b7ee1df211 100644
--- a/arch/m68k/kernel/signal.c
+++ b/arch/m68k/kernel/signal.c
@@ -622,6 +622,9 @@ static inline void siginfo_build_tests(void)
 	/* _sigfault._addr_pkey */
 	BUILD_BUG_ON(offsetof(siginfo_t, si_pkey) != 0x12);
 
+	/* _sigfault._perf */
+	BUILD_BUG_ON(offsetof(siginfo_t, si_perf) != 0x10);
+
 	/* _sigpoll */
 	BUILD_BUG_ON(offsetof(siginfo_t, si_band)   != 0x0c);
 	BUILD_BUG_ON(offsetof(siginfo_t, si_fd)     != 0x10);
diff --git a/arch/x86/kernel/signal_compat.c b/arch/x86/kernel/signal_compat.c
index a5330ff498f0..0e5d0a7e203b 100644
--- a/arch/x86/kernel/signal_compat.c
+++ b/arch/x86/kernel/signal_compat.c
@@ -29,7 +29,7 @@ static inline void signal_compat_build_tests(void)
 	BUILD_BUG_ON(NSIGFPE  != 15);
 	BUILD_BUG_ON(NSIGSEGV != 9);
 	BUILD_BUG_ON(NSIGBUS  != 5);
-	BUILD_BUG_ON(NSIGTRAP != 5);
+	BUILD_BUG_ON(NSIGTRAP != 6);
 	BUILD_BUG_ON(NSIGCHLD != 6);
 	BUILD_BUG_ON(NSIGSYS  != 2);
 
@@ -138,6 +138,9 @@ static inline void signal_compat_build_tests(void)
 	BUILD_BUG_ON(offsetof(siginfo_t, si_pkey) != 0x20);
 	BUILD_BUG_ON(offsetof(compat_siginfo_t, si_pkey) != 0x14);
 
+	BUILD_BUG_ON(offsetof(siginfo_t, si_perf) != 0x18);
+	BUILD_BUG_ON(offsetof(compat_siginfo_t, si_perf) != 0x10);
+
 	CHECK_CSI_OFFSET(_sigpoll);
 	CHECK_CSI_SIZE  (_sigpoll, 2*sizeof(int));
 	CHECK_SI_SIZE   (_sigpoll, 4*sizeof(int));
diff --git a/fs/signalfd.c b/fs/signalfd.c
index 456046e15873..040a1142915f 100644
--- a/fs/signalfd.c
+++ b/fs/signalfd.c
@@ -134,6 +134,10 @@ static int signalfd_copyinfo(struct signalfd_siginfo __user *uinfo,
 #endif
 		new.ssi_addr_lsb = (short) kinfo->si_addr_lsb;
 		break;
+	case SIL_PERF_EVENT:
+		new.ssi_addr = (long) kinfo->si_addr;
+		new.ssi_perf = kinfo->si_perf;
+		break;
 	case SIL_CHLD:
 		new.ssi_pid    = kinfo->si_pid;
 		new.ssi_uid    = kinfo->si_uid;
diff --git a/include/linux/compat.h b/include/linux/compat.h
index 6e65be753603..c8821d966812 100644
--- a/include/linux/compat.h
+++ b/include/linux/compat.h
@@ -236,6 +236,8 @@ typedef struct compat_siginfo {
 					char _dummy_pkey[__COMPAT_ADDR_BND_PKEY_PAD];
 					u32 _pkey;
 				} _addr_pkey;
+				/* used when si_code=TRAP_PERF */
+				compat_u64 _perf;
 			};
 		} _sigfault;
 
diff --git a/include/linux/signal.h b/include/linux/signal.h
index 205526c4003a..1e98548d7cf6 100644
--- a/include/linux/signal.h
+++ b/include/linux/signal.h
@@ -43,6 +43,7 @@ enum siginfo_layout {
 	SIL_FAULT_MCEERR,
 	SIL_FAULT_BNDERR,
 	SIL_FAULT_PKUERR,
+	SIL_PERF_EVENT,
 	SIL_CHLD,
 	SIL_RT,
 	SIL_SYS,
diff --git a/include/uapi/asm-generic/siginfo.h b/include/uapi/asm-generic/siginfo.h
index d2597000407a..d0bb9125c853 100644
--- a/include/uapi/asm-generic/siginfo.h
+++ b/include/uapi/asm-generic/siginfo.h
@@ -91,6 +91,8 @@ union __sifields {
 				char _dummy_pkey[__ADDR_BND_PKEY_PAD];
 				__u32 _pkey;
 			} _addr_pkey;
+			/* used when si_code=TRAP_PERF */
+			__u64 _perf;
 		};
 	} _sigfault;
 
@@ -155,6 +157,7 @@ typedef struct siginfo {
 #define si_lower	_sifields._sigfault._addr_bnd._lower
 #define si_upper	_sifields._sigfault._addr_bnd._upper
 #define si_pkey		_sifields._sigfault._addr_pkey._pkey
+#define si_perf		_sifields._sigfault._perf
 #define si_band		_sifields._sigpoll._band
 #define si_fd		_sifields._sigpoll._fd
 #define si_call_addr	_sifields._sigsys._call_addr
@@ -253,7 +256,8 @@ typedef struct siginfo {
 #define TRAP_BRANCH     3	/* process taken branch trap */
 #define TRAP_HWBKPT     4	/* hardware breakpoint/watchpoint */
 #define TRAP_UNK	5	/* undiagnosed trap */
-#define NSIGTRAP	5
+#define TRAP_PERF	6	/* perf event with sigtrap=1 */
+#define NSIGTRAP	6
 
 /*
  * There is an additional set of SIGTRAP si_codes used by ptrace
diff --git a/include/uapi/linux/signalfd.h b/include/uapi/linux/signalfd.h
index 83429a05b698..7e333042c7e3 100644
--- a/include/uapi/linux/signalfd.h
+++ b/include/uapi/linux/signalfd.h
@@ -39,6 +39,8 @@ struct signalfd_siginfo {
 	__s32 ssi_syscall;
 	__u64 ssi_call_addr;
 	__u32 ssi_arch;
+	__u32 __pad3;
+	__u64 ssi_perf;
 
 	/*
 	 * Pad strcture to 128 bytes. Remember to update the
@@ -49,7 +51,7 @@ struct signalfd_siginfo {
 	 * comes out of a read(2) and we really don't want to have
 	 * a compat on read(2).
 	 */
-	__u8 __pad[28];
+	__u8 __pad[16];
 };
 
 
diff --git a/kernel/signal.c b/kernel/signal.c
index ba4d1ef39a9e..f68351825e5e 100644
--- a/kernel/signal.c
+++ b/kernel/signal.c
@@ -1199,6 +1199,7 @@ static inline bool has_si_pid_and_uid(struct kernel_siginfo *info)
 	case SIL_FAULT_MCEERR:
 	case SIL_FAULT_BNDERR:
 	case SIL_FAULT_PKUERR:
+	case SIL_PERF_EVENT:
 	case SIL_SYS:
 		ret = false;
 		break;
@@ -2531,6 +2532,7 @@ static void hide_si_addr_tag_bits(struct ksignal *ksig)
 	case SIL_FAULT_MCEERR:
 	case SIL_FAULT_BNDERR:
 	case SIL_FAULT_PKUERR:
+	case SIL_PERF_EVENT:
 		ksig->info.si_addr = arch_untagged_si_addr(
 			ksig->info.si_addr, ksig->sig, ksig->info.si_code);
 		break;
@@ -3333,6 +3335,10 @@ void copy_siginfo_to_external32(struct compat_siginfo *to,
 #endif
 		to->si_pkey = from->si_pkey;
 		break;
+	case SIL_PERF_EVENT:
+		to->si_addr = ptr_to_compat(from->si_addr);
+		to->si_perf = from->si_perf;
+		break;
 	case SIL_CHLD:
 		to->si_pid = from->si_pid;
 		to->si_uid = from->si_uid;
@@ -3413,6 +3419,10 @@ static int post_copy_siginfo_from_user32(kernel_siginfo_t *to,
 #endif
 		to->si_pkey = from->si_pkey;
 		break;
+	case SIL_PERF_EVENT:
+		to->si_addr = compat_ptr(from->si_addr);
+		to->si_perf = from->si_perf;
+		break;
 	case SIL_CHLD:
 		to->si_pid    = from->si_pid;
 		to->si_uid    = from->si_uid;
@@ -4593,6 +4603,7 @@ static inline void siginfo_buildtime_checks(void)
 	CHECK_OFFSET(si_lower);
 	CHECK_OFFSET(si_upper);
 	CHECK_OFFSET(si_pkey);
+	CHECK_OFFSET(si_perf);
 
 	/* sigpoll */
 	CHECK_OFFSET(si_band);
-- 
cgit v1.2.3-71-gd317


From 97ba62b278674293762c3d91f724f1bb922f04e0 Mon Sep 17 00:00:00 2001
From: Marco Elver <elver@google.com>
Date: Thu, 8 Apr 2021 12:36:01 +0200
Subject: perf: Add support for SIGTRAP on perf events

Adds bit perf_event_attr::sigtrap, which can be set to cause events to
send SIGTRAP (with si_code TRAP_PERF) to the task where the event
occurred. The primary motivation is to support synchronous signals on
perf events in the task where an event (such as breakpoints) triggered.

To distinguish perf events based on the event type, the type is set in
si_errno. For events that are associated with an address, si_addr is
copied from perf_sample_data.

The new field perf_event_attr::sig_data is copied to si_perf, which
allows user space to disambiguate which event (of the same type)
triggered the signal. For example, user space could encode the relevant
information it cares about in sig_data.

We note that the choice of an opaque u64 provides the simplest and most
flexible option. Alternatives where a reference to some user space data
is passed back suffer from the problem that modification of referenced
data (be it the event fd, or the perf_event_attr) can race with the
signal being delivered (of course, the same caveat applies if user space
decides to store a pointer in sig_data, but the ABI explicitly avoids
prescribing such a design).

Suggested-by: Peter Zijlstra <peterz@infradead.org>
Signed-off-by: Marco Elver <elver@google.com>
Signed-off-by: Peter Zijlstra (Intel) <peterz@infradead.org>
Acked-by: Dmitry Vyukov <dvyukov@google.com>
Link: https://lore.kernel.org/lkml/YBv3rAT566k+6zjg@hirez.programming.kicks-ass.net/
---
 include/linux/perf_event.h      |  1 +
 include/uapi/linux/perf_event.h | 10 ++++++++-
 kernel/events/core.c            | 49 ++++++++++++++++++++++++++++++++++++++++-
 3 files changed, 58 insertions(+), 2 deletions(-)

(limited to 'include/uapi/linux')

diff --git a/include/linux/perf_event.h b/include/linux/perf_event.h
index 1660039199b2..7d7280aa4e22 100644
--- a/include/linux/perf_event.h
+++ b/include/linux/perf_event.h
@@ -735,6 +735,7 @@ struct perf_event {
 	int				pending_wakeup;
 	int				pending_kill;
 	int				pending_disable;
+	unsigned long			pending_addr;	/* SIGTRAP */
 	struct irq_work			pending;
 
 	atomic_t			event_limit;
diff --git a/include/uapi/linux/perf_event.h b/include/uapi/linux/perf_event.h
index 8c5b9f5ad63f..31b00e3b69c9 100644
--- a/include/uapi/linux/perf_event.h
+++ b/include/uapi/linux/perf_event.h
@@ -311,6 +311,7 @@ enum perf_event_read_format {
 #define PERF_ATTR_SIZE_VER4	104	/* add: sample_regs_intr */
 #define PERF_ATTR_SIZE_VER5	112	/* add: aux_watermark */
 #define PERF_ATTR_SIZE_VER6	120	/* add: aux_sample_size */
+#define PERF_ATTR_SIZE_VER7	128	/* add: sig_data */
 
 /*
  * Hardware event_id to monitor via a performance monitoring event:
@@ -391,7 +392,8 @@ struct perf_event_attr {
 				build_id       :  1, /* use build id in mmap2 events */
 				inherit_thread :  1, /* children only inherit if cloned with CLONE_THREAD */
 				remove_on_exec :  1, /* event is removed from task on exec */
-				__reserved_1   : 27;
+				sigtrap        :  1, /* send synchronous SIGTRAP on event */
+				__reserved_1   : 26;
 
 	union {
 		__u32		wakeup_events;	  /* wakeup every n events */
@@ -443,6 +445,12 @@ struct perf_event_attr {
 	__u16	__reserved_2;
 	__u32	aux_sample_size;
 	__u32	__reserved_3;
+
+	/*
+	 * User provided data if sigtrap=1, passed back to user via
+	 * siginfo_t::si_perf, e.g. to permit user to identify the event.
+	 */
+	__u64	sig_data;
 };
 
 /*
diff --git a/kernel/events/core.c b/kernel/events/core.c
index e4a584bceb7a..6f0723c711a9 100644
--- a/kernel/events/core.c
+++ b/kernel/events/core.c
@@ -6392,6 +6392,33 @@ void perf_event_wakeup(struct perf_event *event)
 	}
 }
 
+static void perf_sigtrap(struct perf_event *event)
+{
+	struct kernel_siginfo info;
+
+	/*
+	 * We'd expect this to only occur if the irq_work is delayed and either
+	 * ctx->task or current has changed in the meantime. This can be the
+	 * case on architectures that do not implement arch_irq_work_raise().
+	 */
+	if (WARN_ON_ONCE(event->ctx->task != current))
+		return;
+
+	/*
+	 * perf_pending_event() can race with the task exiting.
+	 */
+	if (current->flags & PF_EXITING)
+		return;
+
+	clear_siginfo(&info);
+	info.si_signo = SIGTRAP;
+	info.si_code = TRAP_PERF;
+	info.si_errno = event->attr.type;
+	info.si_perf = event->attr.sig_data;
+	info.si_addr = (void __user *)event->pending_addr;
+	force_sig_info(&info);
+}
+
 static void perf_pending_event_disable(struct perf_event *event)
 {
 	int cpu = READ_ONCE(event->pending_disable);
@@ -6401,6 +6428,13 @@ static void perf_pending_event_disable(struct perf_event *event)
 
 	if (cpu == smp_processor_id()) {
 		WRITE_ONCE(event->pending_disable, -1);
+
+		if (event->attr.sigtrap) {
+			perf_sigtrap(event);
+			atomic_set_release(&event->event_limit, 1); /* rearm event */
+			return;
+		}
+
 		perf_event_disable_local(event);
 		return;
 	}
@@ -9103,6 +9137,7 @@ static int __perf_event_overflow(struct perf_event *event,
 	if (events && atomic_dec_and_test(&event->event_limit)) {
 		ret = 1;
 		event->pending_kill = POLL_HUP;
+		event->pending_addr = data->addr;
 
 		perf_event_disable_inatomic(event);
 	}
@@ -11384,6 +11419,10 @@ perf_event_alloc(struct perf_event_attr *attr, int cpu,
 		if (!task || cpu != -1)
 			return ERR_PTR(-EINVAL);
 	}
+	if (attr->sigtrap && !task) {
+		/* Requires a task: avoid signalling random tasks. */
+		return ERR_PTR(-EINVAL);
+	}
 
 	node = (cpu >= 0) ? cpu_to_node(cpu) : -1;
 	event = kmem_cache_alloc_node(perf_event_cache, GFP_KERNEL | __GFP_ZERO,
@@ -11432,6 +11471,9 @@ perf_event_alloc(struct perf_event_attr *attr, int cpu,
 
 	event->state		= PERF_EVENT_STATE_INACTIVE;
 
+	if (event->attr.sigtrap)
+		atomic_set(&event->event_limit, 1);
+
 	if (task) {
 		event->attach_state = PERF_ATTACH_TASK;
 		/*
@@ -11710,6 +11752,9 @@ static int perf_copy_attr(struct perf_event_attr __user *uattr,
 	if (attr->remove_on_exec && attr->enable_on_exec)
 		return -EINVAL;
 
+	if (attr->sigtrap && !attr->remove_on_exec)
+		return -EINVAL;
+
 out:
 	return ret;
 
@@ -12936,7 +12981,9 @@ inherit_task_group(struct perf_event *event, struct task_struct *parent,
 	struct perf_event_context *child_ctx;
 
 	if (!event->attr.inherit ||
-	    (event->attr.inherit_thread && !(clone_flags & CLONE_THREAD))) {
+	    (event->attr.inherit_thread && !(clone_flags & CLONE_THREAD)) ||
+	    /* Do not inherit if sigtrap and signal handlers were cleared. */
+	    (event->attr.sigtrap && (clone_flags & CLONE_CLEAR_SIGHAND))) {
 		*inherited_all = 0;
 		return 0;
 	}
-- 
cgit v1.2.3-71-gd317


From d0d1dd628527c77db2391ce0293c1ed344b2365f Mon Sep 17 00:00:00 2001
From: Namhyung Kim <namhyung@kernel.org>
Date: Wed, 10 Feb 2021 17:33:26 +0900
Subject: perf core: Add PERF_COUNT_SW_CGROUP_SWITCHES event

This patch adds a new software event to count context switches
involving cgroup switches.  So it's counted only if cgroups of
previous and next tasks are different.  Note that it only checks the
cgroups in the perf_event subsystem.  For cgroup v2, it shouldn't
matter anyway.

One can argue that we can do this by using existing sched_switch event
with eBPF.  But some systems might not have eBPF for some reason so
I'd like to add this as a simple way.

Signed-off-by: Namhyung Kim <namhyung@kernel.org>
Signed-off-by: Peter Zijlstra (Intel) <peterz@infradead.org>
Link: https://lkml.kernel.org/r/20210210083327.22726-2-namhyung@kernel.org
---
 include/linux/perf_event.h      | 7 +++++++
 include/uapi/linux/perf_event.h | 1 +
 2 files changed, 8 insertions(+)

(limited to 'include/uapi/linux')

diff --git a/include/linux/perf_event.h b/include/linux/perf_event.h
index 92d51a7beaea..8989b2b7268d 100644
--- a/include/linux/perf_event.h
+++ b/include/linux/perf_event.h
@@ -1218,6 +1218,13 @@ static inline void perf_event_task_sched_out(struct task_struct *prev,
 	if (__perf_sw_enabled(PERF_COUNT_SW_CONTEXT_SWITCHES))
 		__perf_sw_event_sched(PERF_COUNT_SW_CONTEXT_SWITCHES, 1, 0);
 
+#ifdef CONFIG_CGROUP_PERF
+	if (__perf_sw_enabled(PERF_COUNT_SW_CGROUP_SWITCHES) &&
+	    perf_cgroup_from_task(prev, NULL) !=
+	    perf_cgroup_from_task(next, NULL))
+		__perf_sw_event_sched(PERF_COUNT_SW_CGROUP_SWITCHES, 1, 0);
+#endif
+
 	if (static_branch_unlikely(&perf_sched_events))
 		__perf_event_task_sched_out(prev, next);
 }
diff --git a/include/uapi/linux/perf_event.h b/include/uapi/linux/perf_event.h
index 31b00e3b69c9..0b58970bab6b 100644
--- a/include/uapi/linux/perf_event.h
+++ b/include/uapi/linux/perf_event.h
@@ -112,6 +112,7 @@ enum perf_sw_ids {
 	PERF_COUNT_SW_EMULATION_FAULTS		= 8,
 	PERF_COUNT_SW_DUMMY			= 9,
 	PERF_COUNT_SW_BPF_OUTPUT		= 10,
+	PERF_COUNT_SW_CGROUP_SWITCHES		= 11,
 
 	PERF_COUNT_SW_MAX,			/* non-ABI */
 };
-- 
cgit v1.2.3-71-gd317


From f09ea6fb12723d6726293d68de00b6307368bd76 Mon Sep 17 00:00:00 2001
From: Jakub Kicinski <kuba@kernel.org>
Date: Fri, 16 Apr 2021 12:27:39 -0700
Subject: ethtool: add a new command for reading standard stats

Add an interface for reading standard stats, including
stats which don't have a corresponding control interface.

Start with IEEE 802.3 PHY stats. There seems to be only
one stat to expose there.

Define API to not require user space changes when new
stats or groups are added. Groups are based on bitset,
stats have a string set associated.

v1: wrap stats in a nest

Signed-off-by: Jakub Kicinski <kuba@kernel.org>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 include/linux/ethtool.h              |  10 ++
 include/uapi/linux/ethtool.h         |   4 +
 include/uapi/linux/ethtool_netlink.h |  47 ++++++++
 net/ethtool/Makefile                 |   2 +-
 net/ethtool/netlink.c                |  10 ++
 net/ethtool/netlink.h                |   5 +
 net/ethtool/stats.c                  | 200 +++++++++++++++++++++++++++++++++++
 net/ethtool/strset.c                 |  10 ++
 8 files changed, 287 insertions(+), 1 deletion(-)
 create mode 100644 net/ethtool/stats.c

(limited to 'include/uapi/linux')

diff --git a/include/linux/ethtool.h b/include/linux/ethtool.h
index 112a85b57f1f..2d5455eedbf4 100644
--- a/include/linux/ethtool.h
+++ b/include/linux/ethtool.h
@@ -250,6 +250,13 @@ static inline void ethtool_stats_init(u64 *stats, unsigned int n)
 		stats[n] = ETHTOOL_STAT_NOT_SET;
 }
 
+/* Basic IEEE 802.3 PHY statistics (30.3.2.1.*), not otherwise exposed
+ * via a more targeted API.
+ */
+struct ethtool_eth_phy_stats {
+	u64 SymbolErrorDuringCarrier;
+};
+
 /**
  * struct ethtool_pause_stats - statistics for IEEE 802.3x pause frames
  * @tx_pause_frames: transmitted pause frame count. Reported to user space
@@ -487,6 +494,7 @@ struct ethtool_module_eeprom {
  * @get_module_eeprom_by_page: Get a region of plug-in module EEPROM data from
  *	specified page. Returns a negative error code or the amount of bytes
  *	read.
+ * @get_eth_phy_stats: Query some of the IEEE 802.3 PHY statistics.
  *
  * All operations are optional (i.e. the function pointer may be set
  * to %NULL) and callers must take this into account.  Callers must
@@ -597,6 +605,8 @@ struct ethtool_ops {
 	int	(*get_module_eeprom_by_page)(struct net_device *dev,
 					     const struct ethtool_module_eeprom *page,
 					     struct netlink_ext_ack *extack);
+	void	(*get_eth_phy_stats)(struct net_device *dev,
+				     struct ethtool_eth_phy_stats *phy_stats);
 };
 
 int ethtool_check_ops(const struct ethtool_ops *ops);
diff --git a/include/uapi/linux/ethtool.h b/include/uapi/linux/ethtool.h
index f91e079e3108..190ae6e03918 100644
--- a/include/uapi/linux/ethtool.h
+++ b/include/uapi/linux/ethtool.h
@@ -669,6 +669,8 @@ enum ethtool_link_ext_substate_cable_issue {
  * @ETH_SS_TS_TX_TYPES: timestamping Tx types
  * @ETH_SS_TS_RX_FILTERS: timestamping Rx filters
  * @ETH_SS_UDP_TUNNEL_TYPES: UDP tunnel types
+ * @ETH_SS_STATS_STD: standardized stats
+ * @ETH_SS_STATS_ETH_PHY: names of IEEE 802.3 PHY statistics
  *
  * @ETH_SS_COUNT: number of defined string sets
  */
@@ -689,6 +691,8 @@ enum ethtool_stringset {
 	ETH_SS_TS_TX_TYPES,
 	ETH_SS_TS_RX_FILTERS,
 	ETH_SS_UDP_TUNNEL_TYPES,
+	ETH_SS_STATS_STD,
+	ETH_SS_STATS_ETH_PHY,
 
 	/* add new constants above here */
 	ETH_SS_COUNT
diff --git a/include/uapi/linux/ethtool_netlink.h b/include/uapi/linux/ethtool_netlink.h
index 3a2b31ccbc5b..a54cfe625f34 100644
--- a/include/uapi/linux/ethtool_netlink.h
+++ b/include/uapi/linux/ethtool_netlink.h
@@ -45,6 +45,7 @@ enum {
 	ETHTOOL_MSG_FEC_GET,
 	ETHTOOL_MSG_FEC_SET,
 	ETHTOOL_MSG_MODULE_EEPROM_GET,
+	ETHTOOL_MSG_STATS_GET,
 
 	/* add new constants above here */
 	__ETHTOOL_MSG_USER_CNT,
@@ -86,6 +87,7 @@ enum {
 	ETHTOOL_MSG_FEC_GET_REPLY,
 	ETHTOOL_MSG_FEC_NTF,
 	ETHTOOL_MSG_MODULE_EEPROM_GET_REPLY,
+	ETHTOOL_MSG_STATS_GET_REPLY,
 
 	/* add new constants above here */
 	__ETHTOOL_MSG_KERNEL_CNT,
@@ -679,6 +681,51 @@ enum {
 	ETHTOOL_A_MODULE_EEPROM_MAX = (__ETHTOOL_A_MODULE_EEPROM_CNT - 1)
 };
 
+/* STATS */
+
+enum {
+	ETHTOOL_A_STATS_UNSPEC,
+	ETHTOOL_A_STATS_PAD,
+	ETHTOOL_A_STATS_HEADER,			/* nest - _A_HEADER_* */
+	ETHTOOL_A_STATS_GROUPS,			/* bitset */
+
+	ETHTOOL_A_STATS_GRP,			/* nest - _A_STATS_GRP_* */
+
+	/* add new constants above here */
+	__ETHTOOL_A_STATS_CNT,
+	ETHTOOL_A_STATS_MAX = (__ETHTOOL_A_STATS_CNT - 1)
+};
+
+enum {
+	ETHTOOL_STATS_ETH_PHY,
+
+	/* add new constants above here */
+	__ETHTOOL_STATS_CNT
+};
+
+enum {
+	ETHTOOL_A_STATS_GRP_UNSPEC,
+	ETHTOOL_A_STATS_GRP_PAD,
+
+	ETHTOOL_A_STATS_GRP_ID,			/* u32 */
+	ETHTOOL_A_STATS_GRP_SS_ID,		/* u32 */
+
+	ETHTOOL_A_STATS_GRP_STAT,		/* nest */
+
+	/* add new constants above here */
+	__ETHTOOL_A_STATS_GRP_CNT,
+	ETHTOOL_A_STATS_GRP_MAX = (__ETHTOOL_A_STATS_CNT - 1)
+};
+
+enum {
+	/* 30.3.2.1.5 aSymbolErrorDuringCarrier */
+	ETHTOOL_A_STATS_ETH_PHY_5_SYM_ERR,
+
+	/* add new constants above here */
+	__ETHTOOL_A_STATS_ETH_PHY_CNT,
+	ETHTOOL_A_STATS_ETH_PHY_MAX = (__ETHTOOL_A_STATS_ETH_PHY_CNT - 1)
+};
+
 /* generic netlink info */
 #define ETHTOOL_GENL_NAME "ethtool"
 #define ETHTOOL_GENL_VERSION 1
diff --git a/net/ethtool/Makefile b/net/ethtool/Makefile
index 83842685fd8c..723c9a8a8cdf 100644
--- a/net/ethtool/Makefile
+++ b/net/ethtool/Makefile
@@ -7,4 +7,4 @@ obj-$(CONFIG_ETHTOOL_NETLINK)	+= ethtool_nl.o
 ethtool_nl-y	:= netlink.o bitset.o strset.o linkinfo.o linkmodes.o \
 		   linkstate.o debug.o wol.o features.o privflags.o rings.o \
 		   channels.o coalesce.o pause.o eee.o tsinfo.o cabletest.o \
-		   tunnels.o fec.o eeprom.o
+		   tunnels.o fec.o eeprom.o stats.o
diff --git a/net/ethtool/netlink.c b/net/ethtool/netlink.c
index 5f5d7c4b3d4a..290012d0d11d 100644
--- a/net/ethtool/netlink.c
+++ b/net/ethtool/netlink.c
@@ -247,6 +247,7 @@ ethnl_default_requests[__ETHTOOL_MSG_USER_CNT] = {
 	[ETHTOOL_MSG_FEC_GET]		= &ethnl_fec_request_ops,
 	[ETHTOOL_MSG_TSINFO_GET]	= &ethnl_tsinfo_request_ops,
 	[ETHTOOL_MSG_MODULE_EEPROM_GET]	= &ethnl_module_eeprom_request_ops,
+	[ETHTOOL_MSG_STATS_GET]		= &ethnl_stats_request_ops,
 };
 
 static struct ethnl_dump_ctx *ethnl_dump_context(struct netlink_callback *cb)
@@ -942,6 +943,15 @@ static const struct genl_ops ethtool_genl_ops[] = {
 		.policy = ethnl_module_eeprom_get_policy,
 		.maxattr = ARRAY_SIZE(ethnl_module_eeprom_get_policy) - 1,
 	},
+	{
+		.cmd	= ETHTOOL_MSG_STATS_GET,
+		.doit	= ethnl_default_doit,
+		.start	= ethnl_default_start,
+		.dumpit	= ethnl_default_dumpit,
+		.done	= ethnl_default_done,
+		.policy = ethnl_stats_get_policy,
+		.maxattr = ARRAY_SIZE(ethnl_stats_get_policy) - 1,
+	},
 };
 
 static const struct genl_multicast_group ethtool_nl_mcgrps[] = {
diff --git a/net/ethtool/netlink.h b/net/ethtool/netlink.h
index 4305ac971bb0..9d88983b6597 100644
--- a/net/ethtool/netlink.h
+++ b/net/ethtool/netlink.h
@@ -346,6 +346,7 @@ extern const struct ethnl_request_ops ethnl_eee_request_ops;
 extern const struct ethnl_request_ops ethnl_tsinfo_request_ops;
 extern const struct ethnl_request_ops ethnl_fec_request_ops;
 extern const struct ethnl_request_ops ethnl_module_eeprom_request_ops;
+extern const struct ethnl_request_ops ethnl_stats_request_ops;
 
 extern const struct nla_policy ethnl_header_policy[ETHTOOL_A_HEADER_FLAGS + 1];
 extern const struct nla_policy ethnl_header_policy_stats[ETHTOOL_A_HEADER_FLAGS + 1];
@@ -380,6 +381,7 @@ extern const struct nla_policy ethnl_tunnel_info_get_policy[ETHTOOL_A_TUNNEL_INF
 extern const struct nla_policy ethnl_fec_get_policy[ETHTOOL_A_FEC_HEADER + 1];
 extern const struct nla_policy ethnl_fec_set_policy[ETHTOOL_A_FEC_AUTO + 1];
 extern const struct nla_policy ethnl_module_eeprom_get_policy[ETHTOOL_A_MODULE_EEPROM_DATA + 1];
+extern const struct nla_policy ethnl_stats_get_policy[ETHTOOL_A_STATS_GROUPS + 1];
 
 int ethnl_set_linkinfo(struct sk_buff *skb, struct genl_info *info);
 int ethnl_set_linkmodes(struct sk_buff *skb, struct genl_info *info);
@@ -399,4 +401,7 @@ int ethnl_tunnel_info_start(struct netlink_callback *cb);
 int ethnl_tunnel_info_dumpit(struct sk_buff *skb, struct netlink_callback *cb);
 int ethnl_set_fec(struct sk_buff *skb, struct genl_info *info);
 
+extern const char stats_std_names[__ETHTOOL_STATS_CNT][ETH_GSTRING_LEN];
+extern const char stats_eth_phy_names[__ETHTOOL_A_STATS_ETH_PHY_CNT][ETH_GSTRING_LEN];
+
 #endif /* _NET_ETHTOOL_NETLINK_H */
diff --git a/net/ethtool/stats.c b/net/ethtool/stats.c
new file mode 100644
index 000000000000..fd8f47178c06
--- /dev/null
+++ b/net/ethtool/stats.c
@@ -0,0 +1,200 @@
+// SPDX-License-Identifier: GPL-2.0-only
+
+#include "netlink.h"
+#include "common.h"
+#include "bitset.h"
+
+struct stats_req_info {
+	struct ethnl_req_info		base;
+	DECLARE_BITMAP(stat_mask, __ETHTOOL_STATS_CNT);
+};
+
+#define STATS_REQINFO(__req_base) \
+	container_of(__req_base, struct stats_req_info, base)
+
+struct stats_reply_data {
+	struct ethnl_reply_data		base;
+	struct ethtool_eth_phy_stats	phy_stats;
+};
+
+#define STATS_REPDATA(__reply_base) \
+	container_of(__reply_base, struct stats_reply_data, base)
+
+const char stats_std_names[__ETHTOOL_STATS_CNT][ETH_GSTRING_LEN] = {
+	[ETHTOOL_STATS_ETH_PHY]			= "eth-phy",
+};
+
+const char stats_eth_phy_names[__ETHTOOL_A_STATS_ETH_PHY_CNT][ETH_GSTRING_LEN] = {
+	[ETHTOOL_A_STATS_ETH_PHY_5_SYM_ERR]	= "SymbolErrorDuringCarrier",
+};
+
+const struct nla_policy ethnl_stats_get_policy[ETHTOOL_A_STATS_GROUPS + 1] = {
+	[ETHTOOL_A_STATS_HEADER]	=
+		NLA_POLICY_NESTED(ethnl_header_policy),
+	[ETHTOOL_A_STATS_GROUPS]	= { .type = NLA_NESTED },
+};
+
+static int stats_parse_request(struct ethnl_req_info *req_base,
+			       struct nlattr **tb,
+			       struct netlink_ext_ack *extack)
+{
+	struct stats_req_info *req_info = STATS_REQINFO(req_base);
+	bool mod = false;
+	int err;
+
+	err = ethnl_update_bitset(req_info->stat_mask, __ETHTOOL_STATS_CNT,
+				  tb[ETHTOOL_A_STATS_GROUPS], stats_std_names,
+				  extack, &mod);
+	if (err)
+		return err;
+
+	if (!mod) {
+		NL_SET_ERR_MSG(extack, "no stats requested");
+		return -EINVAL;
+	}
+
+	return 0;
+}
+
+static int stats_prepare_data(const struct ethnl_req_info *req_base,
+			      struct ethnl_reply_data *reply_base,
+			      struct genl_info *info)
+{
+	const struct stats_req_info *req_info = STATS_REQINFO(req_base);
+	struct stats_reply_data *data = STATS_REPDATA(reply_base);
+	struct net_device *dev = reply_base->dev;
+	int ret;
+
+	ret = ethnl_ops_begin(dev);
+	if (ret < 0)
+		return ret;
+
+	memset(&data->phy_stats, 0xff, sizeof(data->phy_stats));
+
+	if (test_bit(ETHTOOL_STATS_ETH_PHY, req_info->stat_mask) &&
+	    dev->ethtool_ops->get_eth_phy_stats)
+		dev->ethtool_ops->get_eth_phy_stats(dev, &data->phy_stats);
+
+	ethnl_ops_complete(dev);
+	return 0;
+}
+
+static int stats_reply_size(const struct ethnl_req_info *req_base,
+			    const struct ethnl_reply_data *reply_base)
+{
+	const struct stats_req_info *req_info = STATS_REQINFO(req_base);
+	unsigned int n_grps = 0, n_stats = 0;
+	int len = 0;
+
+	if (test_bit(ETHTOOL_STATS_ETH_PHY, req_info->stat_mask)) {
+		n_stats += sizeof(struct ethtool_eth_phy_stats) / sizeof(u64);
+		n_grps++;
+	}
+
+	len += n_grps * (nla_total_size(0) + /* _A_STATS_GRP */
+			 nla_total_size(4) + /* _A_STATS_GRP_ID */
+			 nla_total_size(4)); /* _A_STATS_GRP_SS_ID */
+	len += n_stats * (nla_total_size(0) + /* _A_STATS_GRP_STAT */
+			  nla_total_size_64bit(sizeof(u64)));
+
+	return len;
+}
+
+static int stat_put(struct sk_buff *skb, u16 attrtype, u64 val)
+{
+	struct nlattr *nest;
+	int ret;
+
+	if (val == ETHTOOL_STAT_NOT_SET)
+		return 0;
+
+	/* We want to start stats attr types from 0, so we don't have a type
+	 * for pad inside ETHTOOL_A_STATS_GRP_STAT. Pad things on the outside
+	 * of ETHTOOL_A_STATS_GRP_STAT. Since we're one nest away from the
+	 * actual attr we're 4B off - nla_need_padding_for_64bit() & co.
+	 * can't be used.
+	 */
+#ifndef CONFIG_HAVE_EFFICIENT_UNALIGNED_ACCESS
+	if (!IS_ALIGNED((unsigned long)skb_tail_pointer(skb), 8))
+		if (!nla_reserve(skb, ETHTOOL_A_STATS_GRP_PAD, 0))
+			return -EMSGSIZE;
+#endif
+
+	nest = nla_nest_start(skb, ETHTOOL_A_STATS_GRP_STAT);
+	if (!nest)
+		return -EMSGSIZE;
+
+	ret = nla_put_u64_64bit(skb, attrtype, val, -1 /* not used */);
+	if (ret) {
+		nla_nest_cancel(skb, nest);
+		return ret;
+	}
+
+	nla_nest_end(skb, nest);
+	return 0;
+}
+
+static int stats_put_phy_stats(struct sk_buff *skb,
+			       const struct stats_reply_data *data)
+{
+	if (stat_put(skb, ETHTOOL_A_STATS_ETH_PHY_5_SYM_ERR,
+		     data->phy_stats.SymbolErrorDuringCarrier))
+		return -EMSGSIZE;
+	return 0;
+}
+
+static int stats_put_stats(struct sk_buff *skb,
+			   const struct stats_reply_data *data,
+			   u32 id, u32 ss_id,
+			   int (*cb)(struct sk_buff *skb,
+				     const struct stats_reply_data *data))
+{
+	struct nlattr *nest;
+
+	nest = nla_nest_start(skb, ETHTOOL_A_STATS_GRP);
+	if (!nest)
+		return -EMSGSIZE;
+
+	if (nla_put_u32(skb, ETHTOOL_A_STATS_GRP_ID, id) ||
+	    nla_put_u32(skb, ETHTOOL_A_STATS_GRP_SS_ID, ss_id))
+		goto err_cancel;
+
+	if (cb(skb, data))
+		goto err_cancel;
+
+	nla_nest_end(skb, nest);
+	return 0;
+
+err_cancel:
+	nla_nest_cancel(skb, nest);
+	return -EMSGSIZE;
+}
+
+static int stats_fill_reply(struct sk_buff *skb,
+			    const struct ethnl_req_info *req_base,
+			    const struct ethnl_reply_data *reply_base)
+{
+	const struct stats_req_info *req_info = STATS_REQINFO(req_base);
+	const struct stats_reply_data *data = STATS_REPDATA(reply_base);
+	int ret = 0;
+
+	if (!ret && test_bit(ETHTOOL_STATS_ETH_PHY, req_info->stat_mask))
+		ret = stats_put_stats(skb, data, ETHTOOL_STATS_ETH_PHY,
+				      ETH_SS_STATS_ETH_PHY,
+				      stats_put_phy_stats);
+
+	return ret;
+}
+
+const struct ethnl_request_ops ethnl_stats_request_ops = {
+	.request_cmd		= ETHTOOL_MSG_STATS_GET,
+	.reply_cmd		= ETHTOOL_MSG_STATS_GET_REPLY,
+	.hdr_attr		= ETHTOOL_A_STATS_HEADER,
+	.req_info_size		= sizeof(struct stats_req_info),
+	.reply_data_size	= sizeof(struct stats_reply_data),
+
+	.parse_request		= stats_parse_request,
+	.prepare_data		= stats_prepare_data,
+	.reply_size		= stats_reply_size,
+	.fill_reply		= stats_fill_reply,
+};
diff --git a/net/ethtool/strset.c b/net/ethtool/strset.c
index c3a5489964cd..5f3c73587ff4 100644
--- a/net/ethtool/strset.c
+++ b/net/ethtool/strset.c
@@ -80,6 +80,16 @@ static const struct strset_info info_template[] = {
 		.count		= __ETHTOOL_UDP_TUNNEL_TYPE_CNT,
 		.strings	= udp_tunnel_type_names,
 	},
+	[ETH_SS_STATS_STD] = {
+		.per_dev	= false,
+		.count		= __ETHTOOL_STATS_CNT,
+		.strings	= stats_std_names,
+	},
+	[ETH_SS_STATS_ETH_PHY] = {
+		.per_dev	= false,
+		.count		= __ETHTOOL_A_STATS_ETH_PHY_CNT,
+		.strings	= stats_eth_phy_names,
+	},
 };
 
 struct strset_req_info {
-- 
cgit v1.2.3-71-gd317


From ca2244547ec7505d1cf61d43f5e76e3ffd99cf77 Mon Sep 17 00:00:00 2001
From: Jakub Kicinski <kuba@kernel.org>
Date: Fri, 16 Apr 2021 12:27:40 -0700
Subject: ethtool: add interface to read standard MAC stats

Most of the MAC statistics are included in
struct rtnl_link_stats64, but some fields
are aggregated. Besides it's good to expose
these clearly hardware stats separately.

Signed-off-by: Jakub Kicinski <kuba@kernel.org>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 include/linux/ethtool.h              | 31 +++++++++++++
 include/uapi/linux/ethtool.h         |  2 +
 include/uapi/linux/ethtool_netlink.h | 53 +++++++++++++++++++++
 net/ethtool/netlink.h                |  1 +
 net/ethtool/stats.c                  | 90 ++++++++++++++++++++++++++++++++++++
 net/ethtool/strset.c                 |  5 ++
 6 files changed, 182 insertions(+)

(limited to 'include/uapi/linux')

diff --git a/include/linux/ethtool.h b/include/linux/ethtool.h
index 2d5455eedbf4..3c689a13e679 100644
--- a/include/linux/ethtool.h
+++ b/include/linux/ethtool.h
@@ -250,6 +250,34 @@ static inline void ethtool_stats_init(u64 *stats, unsigned int n)
 		stats[n] = ETHTOOL_STAT_NOT_SET;
 }
 
+/* Basic IEEE 802.3 MAC statistics (30.3.1.1.*), not otherwise exposed
+ * via a more targeted API.
+ */
+struct ethtool_eth_mac_stats {
+	u64 FramesTransmittedOK;
+	u64 SingleCollisionFrames;
+	u64 MultipleCollisionFrames;
+	u64 FramesReceivedOK;
+	u64 FrameCheckSequenceErrors;
+	u64 AlignmentErrors;
+	u64 OctetsTransmittedOK;
+	u64 FramesWithDeferredXmissions;
+	u64 LateCollisions;
+	u64 FramesAbortedDueToXSColls;
+	u64 FramesLostDueToIntMACXmitError;
+	u64 CarrierSenseErrors;
+	u64 OctetsReceivedOK;
+	u64 FramesLostDueToIntMACRcvError;
+	u64 MulticastFramesXmittedOK;
+	u64 BroadcastFramesXmittedOK;
+	u64 FramesWithExcessiveDeferral;
+	u64 MulticastFramesReceivedOK;
+	u64 BroadcastFramesReceivedOK;
+	u64 InRangeLengthErrors;
+	u64 OutOfRangeLengthField;
+	u64 FrameTooLongErrors;
+};
+
 /* Basic IEEE 802.3 PHY statistics (30.3.2.1.*), not otherwise exposed
  * via a more targeted API.
  */
@@ -495,6 +523,7 @@ struct ethtool_module_eeprom {
  *	specified page. Returns a negative error code or the amount of bytes
  *	read.
  * @get_eth_phy_stats: Query some of the IEEE 802.3 PHY statistics.
+ * @get_eth_mac_stats: Query some of the IEEE 802.3 MAC statistics.
  *
  * All operations are optional (i.e. the function pointer may be set
  * to %NULL) and callers must take this into account.  Callers must
@@ -607,6 +636,8 @@ struct ethtool_ops {
 					     struct netlink_ext_ack *extack);
 	void	(*get_eth_phy_stats)(struct net_device *dev,
 				     struct ethtool_eth_phy_stats *phy_stats);
+	void	(*get_eth_mac_stats)(struct net_device *dev,
+				     struct ethtool_eth_mac_stats *mac_stats);
 };
 
 int ethtool_check_ops(const struct ethtool_ops *ops);
diff --git a/include/uapi/linux/ethtool.h b/include/uapi/linux/ethtool.h
index 190ae6e03918..c227376d811a 100644
--- a/include/uapi/linux/ethtool.h
+++ b/include/uapi/linux/ethtool.h
@@ -671,6 +671,7 @@ enum ethtool_link_ext_substate_cable_issue {
  * @ETH_SS_UDP_TUNNEL_TYPES: UDP tunnel types
  * @ETH_SS_STATS_STD: standardized stats
  * @ETH_SS_STATS_ETH_PHY: names of IEEE 802.3 PHY statistics
+ * @ETH_SS_STATS_ETH_MAC: names of IEEE 802.3 MAC statistics
  *
  * @ETH_SS_COUNT: number of defined string sets
  */
@@ -693,6 +694,7 @@ enum ethtool_stringset {
 	ETH_SS_UDP_TUNNEL_TYPES,
 	ETH_SS_STATS_STD,
 	ETH_SS_STATS_ETH_PHY,
+	ETH_SS_STATS_ETH_MAC,
 
 	/* add new constants above here */
 	ETH_SS_COUNT
diff --git a/include/uapi/linux/ethtool_netlink.h b/include/uapi/linux/ethtool_netlink.h
index a54cfe625f34..f0fbe8f4eb1b 100644
--- a/include/uapi/linux/ethtool_netlink.h
+++ b/include/uapi/linux/ethtool_netlink.h
@@ -698,6 +698,7 @@ enum {
 
 enum {
 	ETHTOOL_STATS_ETH_PHY,
+	ETHTOOL_STATS_ETH_MAC,
 
 	/* add new constants above here */
 	__ETHTOOL_STATS_CNT
@@ -726,6 +727,58 @@ enum {
 	ETHTOOL_A_STATS_ETH_PHY_MAX = (__ETHTOOL_A_STATS_ETH_PHY_CNT - 1)
 };
 
+enum {
+	/* 30.3.1.1.2 aFramesTransmittedOK */
+	ETHTOOL_A_STATS_ETH_MAC_2_TX_PKT,
+	/* 30.3.1.1.3 aSingleCollisionFrames */
+	ETHTOOL_A_STATS_ETH_MAC_3_SINGLE_COL,
+	/* 30.3.1.1.4 aMultipleCollisionFrames */
+	ETHTOOL_A_STATS_ETH_MAC_4_MULTI_COL,
+	/* 30.3.1.1.5 aFramesReceivedOK */
+	ETHTOOL_A_STATS_ETH_MAC_5_RX_PKT,
+	/* 30.3.1.1.6 aFrameCheckSequenceErrors */
+	ETHTOOL_A_STATS_ETH_MAC_6_FCS_ERR,
+	/* 30.3.1.1.7 aAlignmentErrors */
+	ETHTOOL_A_STATS_ETH_MAC_7_ALIGN_ERR,
+	/* 30.3.1.1.8 aOctetsTransmittedOK */
+	ETHTOOL_A_STATS_ETH_MAC_8_TX_BYTES,
+	/* 30.3.1.1.9 aFramesWithDeferredXmissions */
+	ETHTOOL_A_STATS_ETH_MAC_9_TX_DEFER,
+	/* 30.3.1.1.10 aLateCollisions */
+	ETHTOOL_A_STATS_ETH_MAC_10_LATE_COL,
+	/* 30.3.1.1.11 aFramesAbortedDueToXSColls */
+	ETHTOOL_A_STATS_ETH_MAC_11_XS_COL,
+	/* 30.3.1.1.12 aFramesLostDueToIntMACXmitError */
+	ETHTOOL_A_STATS_ETH_MAC_12_TX_INT_ERR,
+	/* 30.3.1.1.13 aCarrierSenseErrors */
+	ETHTOOL_A_STATS_ETH_MAC_13_CS_ERR,
+	/* 30.3.1.1.14 aOctetsReceivedOK */
+	ETHTOOL_A_STATS_ETH_MAC_14_RX_BYTES,
+	/* 30.3.1.1.15 aFramesLostDueToIntMACRcvError */
+	ETHTOOL_A_STATS_ETH_MAC_15_RX_INT_ERR,
+
+	/* 30.3.1.1.18 aMulticastFramesXmittedOK */
+	ETHTOOL_A_STATS_ETH_MAC_18_TX_MCAST,
+	/* 30.3.1.1.19 aBroadcastFramesXmittedOK */
+	ETHTOOL_A_STATS_ETH_MAC_19_TX_BCAST,
+	/* 30.3.1.1.20 aFramesWithExcessiveDeferral */
+	ETHTOOL_A_STATS_ETH_MAC_20_XS_DEFER,
+	/* 30.3.1.1.21 aMulticastFramesReceivedOK */
+	ETHTOOL_A_STATS_ETH_MAC_21_RX_MCAST,
+	/* 30.3.1.1.22 aBroadcastFramesReceivedOK */
+	ETHTOOL_A_STATS_ETH_MAC_22_RX_BCAST,
+	/* 30.3.1.1.23 aInRangeLengthErrors */
+	ETHTOOL_A_STATS_ETH_MAC_23_IR_LEN_ERR,
+	/* 30.3.1.1.24 aOutOfRangeLengthField */
+	ETHTOOL_A_STATS_ETH_MAC_24_OOR_LEN,
+	/* 30.3.1.1.25 aFrameTooLongErrors */
+	ETHTOOL_A_STATS_ETH_MAC_25_TOO_LONG_ERR,
+
+	/* add new constants above here */
+	__ETHTOOL_A_STATS_ETH_MAC_CNT,
+	ETHTOOL_A_STATS_ETH_MAC_MAX = (__ETHTOOL_A_STATS_ETH_MAC_CNT - 1)
+};
+
 /* generic netlink info */
 #define ETHTOOL_GENL_NAME "ethtool"
 #define ETHTOOL_GENL_VERSION 1
diff --git a/net/ethtool/netlink.h b/net/ethtool/netlink.h
index 9d88983b6597..c70bac5329af 100644
--- a/net/ethtool/netlink.h
+++ b/net/ethtool/netlink.h
@@ -403,5 +403,6 @@ int ethnl_set_fec(struct sk_buff *skb, struct genl_info *info);
 
 extern const char stats_std_names[__ETHTOOL_STATS_CNT][ETH_GSTRING_LEN];
 extern const char stats_eth_phy_names[__ETHTOOL_A_STATS_ETH_PHY_CNT][ETH_GSTRING_LEN];
+extern const char stats_eth_mac_names[__ETHTOOL_A_STATS_ETH_MAC_CNT][ETH_GSTRING_LEN];
 
 #endif /* _NET_ETHTOOL_NETLINK_H */
diff --git a/net/ethtool/stats.c b/net/ethtool/stats.c
index fd8f47178c06..e80175872226 100644
--- a/net/ethtool/stats.c
+++ b/net/ethtool/stats.c
@@ -15,6 +15,7 @@ struct stats_req_info {
 struct stats_reply_data {
 	struct ethnl_reply_data		base;
 	struct ethtool_eth_phy_stats	phy_stats;
+	struct ethtool_eth_mac_stats	mac_stats;
 };
 
 #define STATS_REPDATA(__reply_base) \
@@ -22,12 +23,38 @@ struct stats_reply_data {
 
 const char stats_std_names[__ETHTOOL_STATS_CNT][ETH_GSTRING_LEN] = {
 	[ETHTOOL_STATS_ETH_PHY]			= "eth-phy",
+	[ETHTOOL_STATS_ETH_MAC]			= "eth-mac",
 };
 
 const char stats_eth_phy_names[__ETHTOOL_A_STATS_ETH_PHY_CNT][ETH_GSTRING_LEN] = {
 	[ETHTOOL_A_STATS_ETH_PHY_5_SYM_ERR]	= "SymbolErrorDuringCarrier",
 };
 
+const char stats_eth_mac_names[__ETHTOOL_A_STATS_ETH_MAC_CNT][ETH_GSTRING_LEN] = {
+	[ETHTOOL_A_STATS_ETH_MAC_2_TX_PKT]	= "FramesTransmittedOK",
+	[ETHTOOL_A_STATS_ETH_MAC_3_SINGLE_COL]	= "SingleCollisionFrames",
+	[ETHTOOL_A_STATS_ETH_MAC_4_MULTI_COL]	= "MultipleCollisionFrames",
+	[ETHTOOL_A_STATS_ETH_MAC_5_RX_PKT]	= "FramesReceivedOK",
+	[ETHTOOL_A_STATS_ETH_MAC_6_FCS_ERR]	= "FrameCheckSequenceErrors",
+	[ETHTOOL_A_STATS_ETH_MAC_7_ALIGN_ERR]	= "AlignmentErrors",
+	[ETHTOOL_A_STATS_ETH_MAC_8_TX_BYTES]	= "OctetsTransmittedOK",
+	[ETHTOOL_A_STATS_ETH_MAC_9_TX_DEFER]	= "FramesWithDeferredXmissions",
+	[ETHTOOL_A_STATS_ETH_MAC_10_LATE_COL]	= "LateCollisions",
+	[ETHTOOL_A_STATS_ETH_MAC_11_XS_COL]	= "FramesAbortedDueToXSColls",
+	[ETHTOOL_A_STATS_ETH_MAC_12_TX_INT_ERR]	= "FramesLostDueToIntMACXmitError",
+	[ETHTOOL_A_STATS_ETH_MAC_13_CS_ERR]	= "CarrierSenseErrors",
+	[ETHTOOL_A_STATS_ETH_MAC_14_RX_BYTES]	= "OctetsReceivedOK",
+	[ETHTOOL_A_STATS_ETH_MAC_15_RX_INT_ERR]	= "FramesLostDueToIntMACRcvError",
+	[ETHTOOL_A_STATS_ETH_MAC_18_TX_MCAST]	= "MulticastFramesXmittedOK",
+	[ETHTOOL_A_STATS_ETH_MAC_19_TX_BCAST]	= "BroadcastFramesXmittedOK",
+	[ETHTOOL_A_STATS_ETH_MAC_20_XS_DEFER]	= "FramesWithExcessiveDeferral",
+	[ETHTOOL_A_STATS_ETH_MAC_21_RX_MCAST]	= "MulticastFramesReceivedOK",
+	[ETHTOOL_A_STATS_ETH_MAC_22_RX_BCAST]	= "BroadcastFramesReceivedOK",
+	[ETHTOOL_A_STATS_ETH_MAC_23_IR_LEN_ERR]	= "InRangeLengthErrors",
+	[ETHTOOL_A_STATS_ETH_MAC_24_OOR_LEN]	= "OutOfRangeLengthField",
+	[ETHTOOL_A_STATS_ETH_MAC_25_TOO_LONG_ERR]	= "FrameTooLongErrors",
+};
+
 const struct nla_policy ethnl_stats_get_policy[ETHTOOL_A_STATS_GROUPS + 1] = {
 	[ETHTOOL_A_STATS_HEADER]	=
 		NLA_POLICY_NESTED(ethnl_header_policy),
@@ -70,10 +97,14 @@ static int stats_prepare_data(const struct ethnl_req_info *req_base,
 		return ret;
 
 	memset(&data->phy_stats, 0xff, sizeof(data->phy_stats));
+	memset(&data->mac_stats, 0xff, sizeof(data->mac_stats));
 
 	if (test_bit(ETHTOOL_STATS_ETH_PHY, req_info->stat_mask) &&
 	    dev->ethtool_ops->get_eth_phy_stats)
 		dev->ethtool_ops->get_eth_phy_stats(dev, &data->phy_stats);
+	if (test_bit(ETHTOOL_STATS_ETH_MAC, req_info->stat_mask) &&
+	    dev->ethtool_ops->get_eth_mac_stats)
+		dev->ethtool_ops->get_eth_mac_stats(dev, &data->mac_stats);
 
 	ethnl_ops_complete(dev);
 	return 0;
@@ -90,6 +121,10 @@ static int stats_reply_size(const struct ethnl_req_info *req_base,
 		n_stats += sizeof(struct ethtool_eth_phy_stats) / sizeof(u64);
 		n_grps++;
 	}
+	if (test_bit(ETHTOOL_STATS_ETH_MAC, req_info->stat_mask)) {
+		n_stats += sizeof(struct ethtool_eth_mac_stats) / sizeof(u64);
+		n_grps++;
+	}
 
 	len += n_grps * (nla_total_size(0) + /* _A_STATS_GRP */
 			 nla_total_size(4) + /* _A_STATS_GRP_ID */
@@ -143,6 +178,57 @@ static int stats_put_phy_stats(struct sk_buff *skb,
 	return 0;
 }
 
+static int stats_put_mac_stats(struct sk_buff *skb,
+			       const struct stats_reply_data *data)
+{
+	if (stat_put(skb, ETHTOOL_A_STATS_ETH_MAC_2_TX_PKT,
+		     data->mac_stats.FramesTransmittedOK) ||
+	    stat_put(skb, ETHTOOL_A_STATS_ETH_MAC_3_SINGLE_COL,
+		     data->mac_stats.SingleCollisionFrames) ||
+	    stat_put(skb, ETHTOOL_A_STATS_ETH_MAC_4_MULTI_COL,
+		     data->mac_stats.MultipleCollisionFrames) ||
+	    stat_put(skb, ETHTOOL_A_STATS_ETH_MAC_5_RX_PKT,
+		     data->mac_stats.FramesReceivedOK) ||
+	    stat_put(skb, ETHTOOL_A_STATS_ETH_MAC_6_FCS_ERR,
+		     data->mac_stats.FrameCheckSequenceErrors) ||
+	    stat_put(skb, ETHTOOL_A_STATS_ETH_MAC_7_ALIGN_ERR,
+		     data->mac_stats.AlignmentErrors) ||
+	    stat_put(skb, ETHTOOL_A_STATS_ETH_MAC_8_TX_BYTES,
+		     data->mac_stats.OctetsTransmittedOK) ||
+	    stat_put(skb, ETHTOOL_A_STATS_ETH_MAC_9_TX_DEFER,
+		     data->mac_stats.FramesWithDeferredXmissions) ||
+	    stat_put(skb, ETHTOOL_A_STATS_ETH_MAC_10_LATE_COL,
+		     data->mac_stats.LateCollisions) ||
+	    stat_put(skb, ETHTOOL_A_STATS_ETH_MAC_11_XS_COL,
+		     data->mac_stats.FramesAbortedDueToXSColls) ||
+	    stat_put(skb, ETHTOOL_A_STATS_ETH_MAC_12_TX_INT_ERR,
+		     data->mac_stats.FramesLostDueToIntMACXmitError) ||
+	    stat_put(skb, ETHTOOL_A_STATS_ETH_MAC_13_CS_ERR,
+		     data->mac_stats.CarrierSenseErrors) ||
+	    stat_put(skb, ETHTOOL_A_STATS_ETH_MAC_14_RX_BYTES,
+		     data->mac_stats.OctetsReceivedOK) ||
+	    stat_put(skb, ETHTOOL_A_STATS_ETH_MAC_15_RX_INT_ERR,
+		     data->mac_stats.FramesLostDueToIntMACRcvError) ||
+	    stat_put(skb, ETHTOOL_A_STATS_ETH_MAC_18_TX_MCAST,
+		     data->mac_stats.MulticastFramesXmittedOK) ||
+	    stat_put(skb, ETHTOOL_A_STATS_ETH_MAC_19_TX_BCAST,
+		     data->mac_stats.BroadcastFramesXmittedOK) ||
+	    stat_put(skb, ETHTOOL_A_STATS_ETH_MAC_20_XS_DEFER,
+		     data->mac_stats.FramesWithExcessiveDeferral) ||
+	    stat_put(skb, ETHTOOL_A_STATS_ETH_MAC_21_RX_MCAST,
+		     data->mac_stats.MulticastFramesReceivedOK) ||
+	    stat_put(skb, ETHTOOL_A_STATS_ETH_MAC_22_RX_BCAST,
+		     data->mac_stats.BroadcastFramesReceivedOK) ||
+	    stat_put(skb, ETHTOOL_A_STATS_ETH_MAC_23_IR_LEN_ERR,
+		     data->mac_stats.InRangeLengthErrors) ||
+	    stat_put(skb, ETHTOOL_A_STATS_ETH_MAC_24_OOR_LEN,
+		     data->mac_stats.OutOfRangeLengthField) ||
+	    stat_put(skb, ETHTOOL_A_STATS_ETH_MAC_25_TOO_LONG_ERR,
+		     data->mac_stats.FrameTooLongErrors))
+		return -EMSGSIZE;
+	return 0;
+}
+
 static int stats_put_stats(struct sk_buff *skb,
 			   const struct stats_reply_data *data,
 			   u32 id, u32 ss_id,
@@ -182,6 +268,10 @@ static int stats_fill_reply(struct sk_buff *skb,
 		ret = stats_put_stats(skb, data, ETHTOOL_STATS_ETH_PHY,
 				      ETH_SS_STATS_ETH_PHY,
 				      stats_put_phy_stats);
+	if (!ret && test_bit(ETHTOOL_STATS_ETH_MAC, req_info->stat_mask))
+		ret = stats_put_stats(skb, data, ETHTOOL_STATS_ETH_MAC,
+				      ETH_SS_STATS_ETH_MAC,
+				      stats_put_mac_stats);
 
 	return ret;
 }
diff --git a/net/ethtool/strset.c b/net/ethtool/strset.c
index 5f3c73587ff4..a8aac7bcfcc9 100644
--- a/net/ethtool/strset.c
+++ b/net/ethtool/strset.c
@@ -90,6 +90,11 @@ static const struct strset_info info_template[] = {
 		.count		= __ETHTOOL_A_STATS_ETH_PHY_CNT,
 		.strings	= stats_eth_phy_names,
 	},
+	[ETH_SS_STATS_ETH_MAC] = {
+		.per_dev	= false,
+		.count		= __ETHTOOL_A_STATS_ETH_MAC_CNT,
+		.strings	= stats_eth_mac_names,
+	},
 };
 
 struct strset_req_info {
-- 
cgit v1.2.3-71-gd317


From bfad2b979ddcc330c08bb071eb3c3f7b3411a681 Mon Sep 17 00:00:00 2001
From: Jakub Kicinski <kuba@kernel.org>
Date: Fri, 16 Apr 2021 12:27:41 -0700
Subject: ethtool: add interface to read standard MAC Ctrl stats

Number of devices maintains the standard-based MAC control
counters for control frames. Add a API for those.

Signed-off-by: Jakub Kicinski <kuba@kernel.org>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 include/linux/ethtool.h              | 12 ++++++++++++
 include/uapi/linux/ethtool.h         |  2 ++
 include/uapi/linux/ethtool_netlink.h | 14 ++++++++++++++
 net/ethtool/netlink.h                |  1 +
 net/ethtool/stats.c                  | 33 +++++++++++++++++++++++++++++++++
 net/ethtool/strset.c                 |  5 +++++
 6 files changed, 67 insertions(+)

(limited to 'include/uapi/linux')

diff --git a/include/linux/ethtool.h b/include/linux/ethtool.h
index 3c689a13e679..1ca6b836f9fe 100644
--- a/include/linux/ethtool.h
+++ b/include/linux/ethtool.h
@@ -285,6 +285,15 @@ struct ethtool_eth_phy_stats {
 	u64 SymbolErrorDuringCarrier;
 };
 
+/* Basic IEEE 802.3 MAC Ctrl statistics (30.3.3.*), not otherwise exposed
+ * via a more targeted API.
+ */
+struct ethtool_eth_ctrl_stats {
+	u64 MACControlFramesTransmitted;
+	u64 MACControlFramesReceived;
+	u64 UnsupportedOpcodesReceived;
+};
+
 /**
  * struct ethtool_pause_stats - statistics for IEEE 802.3x pause frames
  * @tx_pause_frames: transmitted pause frame count. Reported to user space
@@ -524,6 +533,7 @@ struct ethtool_module_eeprom {
  *	read.
  * @get_eth_phy_stats: Query some of the IEEE 802.3 PHY statistics.
  * @get_eth_mac_stats: Query some of the IEEE 802.3 MAC statistics.
+ * @get_eth_ctrl_stats: Query some of the IEEE 802.3 MAC Ctrl statistics.
  *
  * All operations are optional (i.e. the function pointer may be set
  * to %NULL) and callers must take this into account.  Callers must
@@ -638,6 +648,8 @@ struct ethtool_ops {
 				     struct ethtool_eth_phy_stats *phy_stats);
 	void	(*get_eth_mac_stats)(struct net_device *dev,
 				     struct ethtool_eth_mac_stats *mac_stats);
+	void	(*get_eth_ctrl_stats)(struct net_device *dev,
+				      struct ethtool_eth_ctrl_stats *ctrl_stats);
 };
 
 int ethtool_check_ops(const struct ethtool_ops *ops);
diff --git a/include/uapi/linux/ethtool.h b/include/uapi/linux/ethtool.h
index c227376d811a..9cb8df89d4f2 100644
--- a/include/uapi/linux/ethtool.h
+++ b/include/uapi/linux/ethtool.h
@@ -672,6 +672,7 @@ enum ethtool_link_ext_substate_cable_issue {
  * @ETH_SS_STATS_STD: standardized stats
  * @ETH_SS_STATS_ETH_PHY: names of IEEE 802.3 PHY statistics
  * @ETH_SS_STATS_ETH_MAC: names of IEEE 802.3 MAC statistics
+ * @ETH_SS_STATS_ETH_CTRL: names of IEEE 802.3 MAC Control statistics
  *
  * @ETH_SS_COUNT: number of defined string sets
  */
@@ -695,6 +696,7 @@ enum ethtool_stringset {
 	ETH_SS_STATS_STD,
 	ETH_SS_STATS_ETH_PHY,
 	ETH_SS_STATS_ETH_MAC,
+	ETH_SS_STATS_ETH_CTRL,
 
 	/* add new constants above here */
 	ETH_SS_COUNT
diff --git a/include/uapi/linux/ethtool_netlink.h b/include/uapi/linux/ethtool_netlink.h
index f0fbe8f4eb1b..2ea5f049df6a 100644
--- a/include/uapi/linux/ethtool_netlink.h
+++ b/include/uapi/linux/ethtool_netlink.h
@@ -699,6 +699,7 @@ enum {
 enum {
 	ETHTOOL_STATS_ETH_PHY,
 	ETHTOOL_STATS_ETH_MAC,
+	ETHTOOL_STATS_ETH_CTRL,
 
 	/* add new constants above here */
 	__ETHTOOL_STATS_CNT
@@ -779,6 +780,19 @@ enum {
 	ETHTOOL_A_STATS_ETH_MAC_MAX = (__ETHTOOL_A_STATS_ETH_MAC_CNT - 1)
 };
 
+enum {
+	/* 30.3.3.3 aMACControlFramesTransmitted */
+	ETHTOOL_A_STATS_ETH_CTRL_3_TX,
+	/* 30.3.3.4 aMACControlFramesReceived */
+	ETHTOOL_A_STATS_ETH_CTRL_4_RX,
+	/* 30.3.3.5 aUnsupportedOpcodesReceived */
+	ETHTOOL_A_STATS_ETH_CTRL_5_RX_UNSUP,
+
+	/* add new constants above here */
+	__ETHTOOL_A_STATS_ETH_CTRL_CNT,
+	ETHTOOL_A_STATS_ETH_CTRL_MAX = (__ETHTOOL_A_STATS_ETH_CTRL_CNT - 1)
+};
+
 /* generic netlink info */
 #define ETHTOOL_GENL_NAME "ethtool"
 #define ETHTOOL_GENL_VERSION 1
diff --git a/net/ethtool/netlink.h b/net/ethtool/netlink.h
index c70bac5329af..febfa61e52e2 100644
--- a/net/ethtool/netlink.h
+++ b/net/ethtool/netlink.h
@@ -404,5 +404,6 @@ int ethnl_set_fec(struct sk_buff *skb, struct genl_info *info);
 extern const char stats_std_names[__ETHTOOL_STATS_CNT][ETH_GSTRING_LEN];
 extern const char stats_eth_phy_names[__ETHTOOL_A_STATS_ETH_PHY_CNT][ETH_GSTRING_LEN];
 extern const char stats_eth_mac_names[__ETHTOOL_A_STATS_ETH_MAC_CNT][ETH_GSTRING_LEN];
+extern const char stats_eth_ctrl_names[__ETHTOOL_A_STATS_ETH_CTRL_CNT][ETH_GSTRING_LEN];
 
 #endif /* _NET_ETHTOOL_NETLINK_H */
diff --git a/net/ethtool/stats.c b/net/ethtool/stats.c
index e80175872226..f4fded66731c 100644
--- a/net/ethtool/stats.c
+++ b/net/ethtool/stats.c
@@ -16,6 +16,7 @@ struct stats_reply_data {
 	struct ethnl_reply_data		base;
 	struct ethtool_eth_phy_stats	phy_stats;
 	struct ethtool_eth_mac_stats	mac_stats;
+	struct ethtool_eth_ctrl_stats	ctrl_stats;
 };
 
 #define STATS_REPDATA(__reply_base) \
@@ -24,6 +25,7 @@ struct stats_reply_data {
 const char stats_std_names[__ETHTOOL_STATS_CNT][ETH_GSTRING_LEN] = {
 	[ETHTOOL_STATS_ETH_PHY]			= "eth-phy",
 	[ETHTOOL_STATS_ETH_MAC]			= "eth-mac",
+	[ETHTOOL_STATS_ETH_CTRL]		= "eth-ctrl",
 };
 
 const char stats_eth_phy_names[__ETHTOOL_A_STATS_ETH_PHY_CNT][ETH_GSTRING_LEN] = {
@@ -55,6 +57,12 @@ const char stats_eth_mac_names[__ETHTOOL_A_STATS_ETH_MAC_CNT][ETH_GSTRING_LEN] =
 	[ETHTOOL_A_STATS_ETH_MAC_25_TOO_LONG_ERR]	= "FrameTooLongErrors",
 };
 
+const char stats_eth_ctrl_names[__ETHTOOL_A_STATS_ETH_CTRL_CNT][ETH_GSTRING_LEN] = {
+	[ETHTOOL_A_STATS_ETH_CTRL_3_TX]		= "MACControlFramesTransmitted",
+	[ETHTOOL_A_STATS_ETH_CTRL_4_RX]		= "MACControlFramesReceived",
+	[ETHTOOL_A_STATS_ETH_CTRL_5_RX_UNSUP]	= "UnsupportedOpcodesReceived",
+};
+
 const struct nla_policy ethnl_stats_get_policy[ETHTOOL_A_STATS_GROUPS + 1] = {
 	[ETHTOOL_A_STATS_HEADER]	=
 		NLA_POLICY_NESTED(ethnl_header_policy),
@@ -98,6 +106,7 @@ static int stats_prepare_data(const struct ethnl_req_info *req_base,
 
 	memset(&data->phy_stats, 0xff, sizeof(data->phy_stats));
 	memset(&data->mac_stats, 0xff, sizeof(data->mac_stats));
+	memset(&data->ctrl_stats, 0xff, sizeof(data->mac_stats));
 
 	if (test_bit(ETHTOOL_STATS_ETH_PHY, req_info->stat_mask) &&
 	    dev->ethtool_ops->get_eth_phy_stats)
@@ -105,6 +114,9 @@ static int stats_prepare_data(const struct ethnl_req_info *req_base,
 	if (test_bit(ETHTOOL_STATS_ETH_MAC, req_info->stat_mask) &&
 	    dev->ethtool_ops->get_eth_mac_stats)
 		dev->ethtool_ops->get_eth_mac_stats(dev, &data->mac_stats);
+	if (test_bit(ETHTOOL_STATS_ETH_CTRL, req_info->stat_mask) &&
+	    dev->ethtool_ops->get_eth_ctrl_stats)
+		dev->ethtool_ops->get_eth_ctrl_stats(dev, &data->ctrl_stats);
 
 	ethnl_ops_complete(dev);
 	return 0;
@@ -125,6 +137,10 @@ static int stats_reply_size(const struct ethnl_req_info *req_base,
 		n_stats += sizeof(struct ethtool_eth_mac_stats) / sizeof(u64);
 		n_grps++;
 	}
+	if (test_bit(ETHTOOL_STATS_ETH_CTRL, req_info->stat_mask)) {
+		n_stats += sizeof(struct ethtool_eth_ctrl_stats) / sizeof(u64);
+		n_grps++;
+	}
 
 	len += n_grps * (nla_total_size(0) + /* _A_STATS_GRP */
 			 nla_total_size(4) + /* _A_STATS_GRP_ID */
@@ -229,6 +245,19 @@ static int stats_put_mac_stats(struct sk_buff *skb,
 	return 0;
 }
 
+static int stats_put_ctrl_stats(struct sk_buff *skb,
+				const struct stats_reply_data *data)
+{
+	if (stat_put(skb, ETHTOOL_A_STATS_ETH_CTRL_3_TX,
+		     data->ctrl_stats.MACControlFramesTransmitted) ||
+	    stat_put(skb, ETHTOOL_A_STATS_ETH_CTRL_4_RX,
+		     data->ctrl_stats.MACControlFramesReceived) ||
+	    stat_put(skb, ETHTOOL_A_STATS_ETH_CTRL_5_RX_UNSUP,
+		     data->ctrl_stats.UnsupportedOpcodesReceived))
+		return -EMSGSIZE;
+	return 0;
+}
+
 static int stats_put_stats(struct sk_buff *skb,
 			   const struct stats_reply_data *data,
 			   u32 id, u32 ss_id,
@@ -272,6 +301,10 @@ static int stats_fill_reply(struct sk_buff *skb,
 		ret = stats_put_stats(skb, data, ETHTOOL_STATS_ETH_MAC,
 				      ETH_SS_STATS_ETH_MAC,
 				      stats_put_mac_stats);
+	if (!ret && test_bit(ETHTOOL_STATS_ETH_CTRL, req_info->stat_mask))
+		ret = stats_put_stats(skb, data, ETHTOOL_STATS_ETH_CTRL,
+				      ETH_SS_STATS_ETH_CTRL,
+				      stats_put_ctrl_stats);
 
 	return ret;
 }
diff --git a/net/ethtool/strset.c b/net/ethtool/strset.c
index a8aac7bcfcc9..a33c603a7a02 100644
--- a/net/ethtool/strset.c
+++ b/net/ethtool/strset.c
@@ -95,6 +95,11 @@ static const struct strset_info info_template[] = {
 		.count		= __ETHTOOL_A_STATS_ETH_MAC_CNT,
 		.strings	= stats_eth_mac_names,
 	},
+	[ETH_SS_STATS_ETH_CTRL] = {
+		.per_dev	= false,
+		.count		= __ETHTOOL_A_STATS_ETH_CTRL_CNT,
+		.strings	= stats_eth_ctrl_names,
+	},
 };
 
 struct strset_req_info {
-- 
cgit v1.2.3-71-gd317


From a8b06e9d40d8b18c41c8ce060e8dc004fa59e708 Mon Sep 17 00:00:00 2001
From: Jakub Kicinski <kuba@kernel.org>
Date: Fri, 16 Apr 2021 12:27:42 -0700
Subject: ethtool: add interface to read RMON stats

Most devices maintain RMON (RFC 2819) stats - particularly
the "histogram" of packets received by size. Unlike other
RFCs which duplicate IEEE stats, the short/oversized frame
counters in RMON don't seem to match IEEE stats 1-to-1 either,
so expose those, too. Do not expose basic packet, CRC errors
etc - those are already otherwise covered.

Because standard defines packet ranges only up to 1518, and
everything above that should theoretically be "oversized"
- devices often create their own ranges.

Going beyond what the RFC defines - expose the "histogram"
in the Tx direction (assume for now that the ranges will
be the same).

Signed-off-by: Jakub Kicinski <kuba@kernel.org>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 include/linux/ethtool.h              | 43 ++++++++++++++++++
 include/uapi/linux/ethtool.h         |  2 +
 include/uapi/linux/ethtool_netlink.h | 23 ++++++++++
 net/ethtool/netlink.h                |  1 +
 net/ethtool/stats.c                  | 87 ++++++++++++++++++++++++++++++++++++
 net/ethtool/strset.c                 |  5 +++
 6 files changed, 161 insertions(+)

(limited to 'include/uapi/linux')

diff --git a/include/linux/ethtool.h b/include/linux/ethtool.h
index 1ca6b836f9fe..e030f7510cd3 100644
--- a/include/linux/ethtool.h
+++ b/include/linux/ethtool.h
@@ -346,6 +346,44 @@ struct ethtool_fec_stats {
 	} corrected_blocks, uncorrectable_blocks, corrected_bits;
 };
 
+/**
+ * struct ethtool_rmon_hist_range - byte range for histogram statistics
+ * @low: low bound of the bucket (inclusive)
+ * @high: high bound of the bucket (inclusive)
+ */
+struct ethtool_rmon_hist_range {
+	u16 low;
+	u16 high;
+};
+
+#define ETHTOOL_RMON_HIST_MAX	10
+
+/**
+ * struct ethtool_rmon_stats - selected RMON (RFC 2819) statistics
+ * @undersize_pkts: Equivalent to `etherStatsUndersizePkts` from the RFC.
+ * @oversize_pkts: Equivalent to `etherStatsOversizePkts` from the RFC.
+ * @fragments: Equivalent to `etherStatsFragments` from the RFC.
+ * @jabbers: Equivalent to `etherStatsJabbers` from the RFC.
+ * @hist: Packet counter for packet length buckets (e.g.
+ *	`etherStatsPkts128to255Octets` from the RFC).
+ * @hist_tx: Tx counters in similar form to @hist, not defined in the RFC.
+ *
+ * Selection of RMON (RFC 2819) statistics which are not exposed via different
+ * APIs, primarily the packet-length-based counters.
+ * Unfortunately different designs choose different buckets beyond
+ * the 1024B mark (jumbo frame teritory), so the definition of the bucket
+ * ranges is left to the driver.
+ */
+struct ethtool_rmon_stats {
+	u64 undersize_pkts;
+	u64 oversize_pkts;
+	u64 fragments;
+	u64 jabbers;
+
+	u64 hist[ETHTOOL_RMON_HIST_MAX];
+	u64 hist_tx[ETHTOOL_RMON_HIST_MAX];
+};
+
 #define ETH_MODULE_EEPROM_PAGE_LEN	128
 #define ETH_MODULE_MAX_I2C_ADDRESS	0x7f
 
@@ -534,6 +572,8 @@ struct ethtool_module_eeprom {
  * @get_eth_phy_stats: Query some of the IEEE 802.3 PHY statistics.
  * @get_eth_mac_stats: Query some of the IEEE 802.3 MAC statistics.
  * @get_eth_ctrl_stats: Query some of the IEEE 802.3 MAC Ctrl statistics.
+ * @get_rmon_stats: Query some of the RMON (RFC 2819) statistics.
+ *	Set %ranges to a pointer to zero-terminated array of byte ranges.
  *
  * All operations are optional (i.e. the function pointer may be set
  * to %NULL) and callers must take this into account.  Callers must
@@ -650,6 +690,9 @@ struct ethtool_ops {
 				     struct ethtool_eth_mac_stats *mac_stats);
 	void	(*get_eth_ctrl_stats)(struct net_device *dev,
 				      struct ethtool_eth_ctrl_stats *ctrl_stats);
+	void	(*get_rmon_stats)(struct net_device *dev,
+				  struct ethtool_rmon_stats *rmon_stats,
+				  const struct ethtool_rmon_hist_range **ranges);
 };
 
 int ethtool_check_ops(const struct ethtool_ops *ops);
diff --git a/include/uapi/linux/ethtool.h b/include/uapi/linux/ethtool.h
index 9cb8df89d4f2..cfef6b08169a 100644
--- a/include/uapi/linux/ethtool.h
+++ b/include/uapi/linux/ethtool.h
@@ -673,6 +673,7 @@ enum ethtool_link_ext_substate_cable_issue {
  * @ETH_SS_STATS_ETH_PHY: names of IEEE 802.3 PHY statistics
  * @ETH_SS_STATS_ETH_MAC: names of IEEE 802.3 MAC statistics
  * @ETH_SS_STATS_ETH_CTRL: names of IEEE 802.3 MAC Control statistics
+ * @ETH_SS_STATS_RMON: names of RMON statistics
  *
  * @ETH_SS_COUNT: number of defined string sets
  */
@@ -697,6 +698,7 @@ enum ethtool_stringset {
 	ETH_SS_STATS_ETH_PHY,
 	ETH_SS_STATS_ETH_MAC,
 	ETH_SS_STATS_ETH_CTRL,
+	ETH_SS_STATS_RMON,
 
 	/* add new constants above here */
 	ETH_SS_COUNT
diff --git a/include/uapi/linux/ethtool_netlink.h b/include/uapi/linux/ethtool_netlink.h
index 2ea5f049df6a..825cfda1c5d5 100644
--- a/include/uapi/linux/ethtool_netlink.h
+++ b/include/uapi/linux/ethtool_netlink.h
@@ -700,6 +700,7 @@ enum {
 	ETHTOOL_STATS_ETH_PHY,
 	ETHTOOL_STATS_ETH_MAC,
 	ETHTOOL_STATS_ETH_CTRL,
+	ETHTOOL_STATS_RMON,
 
 	/* add new constants above here */
 	__ETHTOOL_STATS_CNT
@@ -714,6 +715,13 @@ enum {
 
 	ETHTOOL_A_STATS_GRP_STAT,		/* nest */
 
+	ETHTOOL_A_STATS_GRP_HIST_RX,		/* nest */
+	ETHTOOL_A_STATS_GRP_HIST_TX,		/* nest */
+
+	ETHTOOL_A_STATS_GRP_HIST_BKT_LOW,	/* u32 */
+	ETHTOOL_A_STATS_GRP_HIST_BKT_HI,	/* u32 */
+	ETHTOOL_A_STATS_GRP_HIST_VAL,		/* u64 */
+
 	/* add new constants above here */
 	__ETHTOOL_A_STATS_GRP_CNT,
 	ETHTOOL_A_STATS_GRP_MAX = (__ETHTOOL_A_STATS_CNT - 1)
@@ -793,6 +801,21 @@ enum {
 	ETHTOOL_A_STATS_ETH_CTRL_MAX = (__ETHTOOL_A_STATS_ETH_CTRL_CNT - 1)
 };
 
+enum {
+	/* etherStatsUndersizePkts */
+	ETHTOOL_A_STATS_RMON_UNDERSIZE,
+	/* etherStatsOversizePkts */
+	ETHTOOL_A_STATS_RMON_OVERSIZE,
+	/* etherStatsFragments */
+	ETHTOOL_A_STATS_RMON_FRAG,
+	/* etherStatsJabbers */
+	ETHTOOL_A_STATS_RMON_JABBER,
+
+	/* add new constants above here */
+	__ETHTOOL_A_STATS_RMON_CNT,
+	ETHTOOL_A_STATS_RMON_MAX = (__ETHTOOL_A_STATS_RMON_CNT - 1)
+};
+
 /* generic netlink info */
 #define ETHTOOL_GENL_NAME "ethtool"
 #define ETHTOOL_GENL_VERSION 1
diff --git a/net/ethtool/netlink.h b/net/ethtool/netlink.h
index febfa61e52e2..bed3afdf3656 100644
--- a/net/ethtool/netlink.h
+++ b/net/ethtool/netlink.h
@@ -405,5 +405,6 @@ extern const char stats_std_names[__ETHTOOL_STATS_CNT][ETH_GSTRING_LEN];
 extern const char stats_eth_phy_names[__ETHTOOL_A_STATS_ETH_PHY_CNT][ETH_GSTRING_LEN];
 extern const char stats_eth_mac_names[__ETHTOOL_A_STATS_ETH_MAC_CNT][ETH_GSTRING_LEN];
 extern const char stats_eth_ctrl_names[__ETHTOOL_A_STATS_ETH_CTRL_CNT][ETH_GSTRING_LEN];
+extern const char stats_rmon_names[__ETHTOOL_A_STATS_RMON_CNT][ETH_GSTRING_LEN];
 
 #endif /* _NET_ETHTOOL_NETLINK_H */
diff --git a/net/ethtool/stats.c b/net/ethtool/stats.c
index f4fded66731c..acb2b080c358 100644
--- a/net/ethtool/stats.c
+++ b/net/ethtool/stats.c
@@ -17,6 +17,8 @@ struct stats_reply_data {
 	struct ethtool_eth_phy_stats	phy_stats;
 	struct ethtool_eth_mac_stats	mac_stats;
 	struct ethtool_eth_ctrl_stats	ctrl_stats;
+	struct ethtool_rmon_stats	rmon_stats;
+	const struct ethtool_rmon_hist_range	*rmon_ranges;
 };
 
 #define STATS_REPDATA(__reply_base) \
@@ -26,6 +28,7 @@ const char stats_std_names[__ETHTOOL_STATS_CNT][ETH_GSTRING_LEN] = {
 	[ETHTOOL_STATS_ETH_PHY]			= "eth-phy",
 	[ETHTOOL_STATS_ETH_MAC]			= "eth-mac",
 	[ETHTOOL_STATS_ETH_CTRL]		= "eth-ctrl",
+	[ETHTOOL_STATS_RMON]			= "rmon",
 };
 
 const char stats_eth_phy_names[__ETHTOOL_A_STATS_ETH_PHY_CNT][ETH_GSTRING_LEN] = {
@@ -63,6 +66,13 @@ const char stats_eth_ctrl_names[__ETHTOOL_A_STATS_ETH_CTRL_CNT][ETH_GSTRING_LEN]
 	[ETHTOOL_A_STATS_ETH_CTRL_5_RX_UNSUP]	= "UnsupportedOpcodesReceived",
 };
 
+const char stats_rmon_names[__ETHTOOL_A_STATS_RMON_CNT][ETH_GSTRING_LEN] = {
+	[ETHTOOL_A_STATS_RMON_UNDERSIZE]	= "etherStatsUndersizePkts",
+	[ETHTOOL_A_STATS_RMON_OVERSIZE]		= "etherStatsOversizePkts",
+	[ETHTOOL_A_STATS_RMON_FRAG]		= "etherStatsFragments",
+	[ETHTOOL_A_STATS_RMON_JABBER]		= "etherStatsJabbers",
+};
+
 const struct nla_policy ethnl_stats_get_policy[ETHTOOL_A_STATS_GROUPS + 1] = {
 	[ETHTOOL_A_STATS_HEADER]	=
 		NLA_POLICY_NESTED(ethnl_header_policy),
@@ -107,6 +117,7 @@ static int stats_prepare_data(const struct ethnl_req_info *req_base,
 	memset(&data->phy_stats, 0xff, sizeof(data->phy_stats));
 	memset(&data->mac_stats, 0xff, sizeof(data->mac_stats));
 	memset(&data->ctrl_stats, 0xff, sizeof(data->mac_stats));
+	memset(&data->rmon_stats, 0xff, sizeof(data->rmon_stats));
 
 	if (test_bit(ETHTOOL_STATS_ETH_PHY, req_info->stat_mask) &&
 	    dev->ethtool_ops->get_eth_phy_stats)
@@ -117,6 +128,10 @@ static int stats_prepare_data(const struct ethnl_req_info *req_base,
 	if (test_bit(ETHTOOL_STATS_ETH_CTRL, req_info->stat_mask) &&
 	    dev->ethtool_ops->get_eth_ctrl_stats)
 		dev->ethtool_ops->get_eth_ctrl_stats(dev, &data->ctrl_stats);
+	if (test_bit(ETHTOOL_STATS_RMON, req_info->stat_mask) &&
+	    dev->ethtool_ops->get_rmon_stats)
+		dev->ethtool_ops->get_rmon_stats(dev, &data->rmon_stats,
+						 &data->rmon_ranges);
 
 	ethnl_ops_complete(dev);
 	return 0;
@@ -141,6 +156,16 @@ static int stats_reply_size(const struct ethnl_req_info *req_base,
 		n_stats += sizeof(struct ethtool_eth_ctrl_stats) / sizeof(u64);
 		n_grps++;
 	}
+	if (test_bit(ETHTOOL_STATS_RMON, req_info->stat_mask)) {
+		n_stats += sizeof(struct ethtool_rmon_stats) / sizeof(u64);
+		n_grps++;
+		/* Above includes the space for _A_STATS_GRP_HIST_VALs */
+
+		len += (nla_total_size(0) +	/* _A_STATS_GRP_HIST */
+			nla_total_size(4) +	/* _A_STATS_GRP_HIST_BKT_LOW */
+			nla_total_size(4)) *	/* _A_STATS_GRP_HIST_BKT_HI */
+			ETHTOOL_RMON_HIST_MAX * 2;
+	}
 
 	len += n_grps * (nla_total_size(0) + /* _A_STATS_GRP */
 			 nla_total_size(4) + /* _A_STATS_GRP_ID */
@@ -258,6 +283,65 @@ static int stats_put_ctrl_stats(struct sk_buff *skb,
 	return 0;
 }
 
+static int stats_put_rmon_hist(struct sk_buff *skb, u32 attr, const u64 *hist,
+			       const struct ethtool_rmon_hist_range *ranges)
+{
+	struct nlattr *nest;
+	int i;
+
+	if (!ranges)
+		return 0;
+
+	for (i = 0; i <	ETHTOOL_RMON_HIST_MAX; i++) {
+		if (!ranges[i].low && !ranges[i].high)
+			break;
+		if (hist[i] == ETHTOOL_STAT_NOT_SET)
+			continue;
+
+		nest = nla_nest_start(skb, attr);
+		if (!nest)
+			return -EMSGSIZE;
+
+		if (nla_put_u32(skb, ETHTOOL_A_STATS_GRP_HIST_BKT_LOW,
+				ranges[i].low) ||
+		    nla_put_u32(skb, ETHTOOL_A_STATS_GRP_HIST_BKT_HI,
+				ranges[i].high) ||
+		    nla_put_u64_64bit(skb, ETHTOOL_A_STATS_GRP_HIST_VAL,
+				      hist[i], ETHTOOL_A_STATS_GRP_PAD))
+			goto err_cancel_hist;
+
+		nla_nest_end(skb, nest);
+	}
+
+	return 0;
+
+err_cancel_hist:
+	nla_nest_cancel(skb, nest);
+	return -EMSGSIZE;
+}
+
+static int stats_put_rmon_stats(struct sk_buff *skb,
+				const struct stats_reply_data *data)
+{
+	if (stats_put_rmon_hist(skb, ETHTOOL_A_STATS_GRP_HIST_RX,
+				data->rmon_stats.hist, data->rmon_ranges) ||
+	    stats_put_rmon_hist(skb, ETHTOOL_A_STATS_GRP_HIST_TX,
+				data->rmon_stats.hist_tx, data->rmon_ranges))
+		return -EMSGSIZE;
+
+	if (stat_put(skb, ETHTOOL_A_STATS_RMON_UNDERSIZE,
+		     data->rmon_stats.undersize_pkts) ||
+	    stat_put(skb, ETHTOOL_A_STATS_RMON_OVERSIZE,
+		     data->rmon_stats.oversize_pkts) ||
+	    stat_put(skb, ETHTOOL_A_STATS_RMON_FRAG,
+		     data->rmon_stats.fragments) ||
+	    stat_put(skb, ETHTOOL_A_STATS_RMON_JABBER,
+		     data->rmon_stats.jabbers))
+		return -EMSGSIZE;
+
+	return 0;
+}
+
 static int stats_put_stats(struct sk_buff *skb,
 			   const struct stats_reply_data *data,
 			   u32 id, u32 ss_id,
@@ -305,6 +389,9 @@ static int stats_fill_reply(struct sk_buff *skb,
 		ret = stats_put_stats(skb, data, ETHTOOL_STATS_ETH_CTRL,
 				      ETH_SS_STATS_ETH_CTRL,
 				      stats_put_ctrl_stats);
+	if (!ret && test_bit(ETHTOOL_STATS_RMON, req_info->stat_mask))
+		ret = stats_put_stats(skb, data, ETHTOOL_STATS_RMON,
+				      ETH_SS_STATS_RMON, stats_put_rmon_stats);
 
 	return ret;
 }
diff --git a/net/ethtool/strset.c b/net/ethtool/strset.c
index a33c603a7a02..b3029fff715d 100644
--- a/net/ethtool/strset.c
+++ b/net/ethtool/strset.c
@@ -100,6 +100,11 @@ static const struct strset_info info_template[] = {
 		.count		= __ETHTOOL_A_STATS_ETH_CTRL_CNT,
 		.strings	= stats_eth_ctrl_names,
 	},
+	[ETH_SS_STATS_RMON] = {
+		.per_dev	= false,
+		.count		= __ETHTOOL_A_STATS_RMON_CNT,
+		.strings	= stats_rmon_names,
+	},
 };
 
 struct strset_req_info {
-- 
cgit v1.2.3-71-gd317


From 8b13c36493d8cb56fc3b386507873c5412b7108d Mon Sep 17 00:00:00 2001
From: Paolo Bonzini <pbonzini@redhat.com>
Date: Tue, 6 Apr 2021 10:24:07 -0400
Subject: KVM: introduce KVM_CAP_SET_GUEST_DEBUG2

This capability will allow the user to know which KVM_GUESTDBG_* bits
are supported.

Signed-off-by: Maxim Levitsky <mlevitsk@redhat.com>
Message-Id: <20210401135451.1004564-3-mlevitsk@redhat.com>
Signed-off-by: Paolo Bonzini <pbonzini@redhat.com>
---
 Documentation/virt/kvm/api.rst | 3 +++
 include/uapi/linux/kvm.h       | 1 +
 2 files changed, 4 insertions(+)

(limited to 'include/uapi/linux')

diff --git a/Documentation/virt/kvm/api.rst b/Documentation/virt/kvm/api.rst
index 38e327d4b479..9778b2434c03 100644
--- a/Documentation/virt/kvm/api.rst
+++ b/Documentation/virt/kvm/api.rst
@@ -3357,6 +3357,9 @@ indicating the number of supported registers.
 For ppc, the KVM_CAP_PPC_GUEST_DEBUG_SSTEP capability indicates whether
 the single-step debug event (KVM_GUESTDBG_SINGLESTEP) is supported.
 
+Also when supported, KVM_CAP_SET_GUEST_DEBUG2 capability indicates the
+supported KVM_GUESTDBG_* bits in the control field.
+
 When debug events exit the main run loop with the reason
 KVM_EXIT_DEBUG with the kvm_debug_exit_arch part of the kvm_run
 structure containing architecture specific debug information.
diff --git a/include/uapi/linux/kvm.h b/include/uapi/linux/kvm.h
index f6afee209620..727010788eff 100644
--- a/include/uapi/linux/kvm.h
+++ b/include/uapi/linux/kvm.h
@@ -1078,6 +1078,7 @@ struct kvm_ppc_resize_hpt {
 #define KVM_CAP_DIRTY_LOG_RING 192
 #define KVM_CAP_X86_BUS_LOCK_EXIT 193
 #define KVM_CAP_PPC_DAWR1 194
+#define KVM_CAP_SET_GUEST_DEBUG2 195
 
 #ifdef KVM_CAP_IRQ_ROUTING
 
-- 
cgit v1.2.3-71-gd317


From 73807523f9a6612106582ab19217f280ed128f24 Mon Sep 17 00:00:00 2001
From: Avraham Stern <avraham.stern@intel.com>
Date: Fri, 9 Apr 2021 12:40:25 +0300
Subject: nl80211/cfg80211: add a flag to negotiate for LMR feedback in NDP
 ranging

Add a flag that indicates that the ISTA shall indicate support for
LMR feedback in NDP ranging negotiation.

Signed-off-by: Avraham Stern <avraham.stern@intel.com>
Signed-off-by: Luca Coelho <luciano.coelho@intel.com>
Link: https://lore.kernel.org/r/iwlwifi.20210409123755.eff546283504.I2606161e700ac24d94d0b50c8edcdedd4c0395c2@changeid
Signed-off-by: Johannes Berg <johannes.berg@intel.com>
---
 include/net/cfg80211.h       |  5 ++++-
 include/uapi/linux/nl80211.h |  4 ++++
 net/wireless/nl80211.c       |  1 +
 net/wireless/pmsr.c          | 12 +++++++++++-
 4 files changed, 20 insertions(+), 2 deletions(-)

(limited to 'include/uapi/linux')

diff --git a/include/net/cfg80211.h b/include/net/cfg80211.h
index 73b17ea89248..5224f885a99a 100644
--- a/include/net/cfg80211.h
+++ b/include/net/cfg80211.h
@@ -3521,6 +3521,8 @@ struct cfg80211_pmsr_result {
  * @non_trigger_based: use non trigger based ranging for the measurement
  *		 If neither @trigger_based nor @non_trigger_based is set,
  *		 EDCA based ranging will be used.
+ * @lmr_feedback: negotiate for I2R LMR feedback. Only valid if either
+ *	@trigger_based or @non_trigger_based is set.
  *
  * See also nl80211 for the respective attribute documentation.
  */
@@ -3532,7 +3534,8 @@ struct cfg80211_pmsr_ftm_request_peer {
 	   request_lci:1,
 	   request_civicloc:1,
 	   trigger_based:1,
-	   non_trigger_based:1;
+	   non_trigger_based:1,
+	   lmr_feedback:1;
 	u8 num_bursts_exp;
 	u8 burst_duration;
 	u8 ftms_per_burst;
diff --git a/include/uapi/linux/nl80211.h b/include/uapi/linux/nl80211.h
index 18dfe744bcb5..00f696d177e6 100644
--- a/include/uapi/linux/nl80211.h
+++ b/include/uapi/linux/nl80211.h
@@ -6896,6 +6896,9 @@ enum nl80211_peer_measurement_ftm_capa {
  *      if neither %NL80211_PMSR_FTM_REQ_ATTR_TRIGGER_BASED nor
  *	%NL80211_PMSR_FTM_REQ_ATTR_NON_TRIGGER_BASED is set, EDCA based
  *	ranging will be used.
+ * @NL80211_PMSR_FTM_REQ_ATTR_LMR_FEEDBACK: negotiate for LMR feedback. Only
+ *	valid if either %NL80211_PMSR_FTM_REQ_ATTR_TRIGGER_BASED or
+ *	%NL80211_PMSR_FTM_REQ_ATTR_NON_TRIGGER_BASED is set.
  *
  * @NUM_NL80211_PMSR_FTM_REQ_ATTR: internal
  * @NL80211_PMSR_FTM_REQ_ATTR_MAX: highest attribute number
@@ -6914,6 +6917,7 @@ enum nl80211_peer_measurement_ftm_req {
 	NL80211_PMSR_FTM_REQ_ATTR_REQUEST_CIVICLOC,
 	NL80211_PMSR_FTM_REQ_ATTR_TRIGGER_BASED,
 	NL80211_PMSR_FTM_REQ_ATTR_NON_TRIGGER_BASED,
+	NL80211_PMSR_FTM_REQ_ATTR_LMR_FEEDBACK,
 
 	/* keep last */
 	NUM_NL80211_PMSR_FTM_REQ_ATTR,
diff --git a/net/wireless/nl80211.c b/net/wireless/nl80211.c
index adfa07c67b44..aad19348bb46 100644
--- a/net/wireless/nl80211.c
+++ b/net/wireless/nl80211.c
@@ -309,6 +309,7 @@ nl80211_pmsr_ftm_req_attr_policy[NL80211_PMSR_FTM_REQ_ATTR_MAX + 1] = {
 	[NL80211_PMSR_FTM_REQ_ATTR_REQUEST_CIVICLOC] = { .type = NLA_FLAG },
 	[NL80211_PMSR_FTM_REQ_ATTR_TRIGGER_BASED] = { .type = NLA_FLAG },
 	[NL80211_PMSR_FTM_REQ_ATTR_NON_TRIGGER_BASED] = { .type = NLA_FLAG },
+	[NL80211_PMSR_FTM_REQ_ATTR_LMR_FEEDBACK] = { .type = NLA_FLAG },
 };
 
 static const struct nla_policy
diff --git a/net/wireless/pmsr.c b/net/wireless/pmsr.c
index a95c79d18349..6bdd96408022 100644
--- a/net/wireless/pmsr.c
+++ b/net/wireless/pmsr.c
@@ -1,6 +1,6 @@
 /* SPDX-License-Identifier: GPL-2.0 */
 /*
- * Copyright (C) 2018 - 2019 Intel Corporation
+ * Copyright (C) 2018 - 2021 Intel Corporation
  */
 #ifndef __PMSR_H
 #define __PMSR_H
@@ -158,6 +158,16 @@ static int pmsr_parse_ftm(struct cfg80211_registered_device *rdev,
 		return -EINVAL;
 	}
 
+	out->ftm.lmr_feedback =
+		!!tb[NL80211_PMSR_FTM_REQ_ATTR_LMR_FEEDBACK];
+	if (!out->ftm.trigger_based && !out->ftm.non_trigger_based &&
+	    out->ftm.lmr_feedback) {
+		NL_SET_ERR_MSG_ATTR(info->extack,
+				    tb[NL80211_PMSR_FTM_REQ_ATTR_LMR_FEEDBACK],
+				    "FTM: LMR feedback set for EDCA based ranging");
+		return -EINVAL;
+	}
+
 	return 0;
 }
 
-- 
cgit v1.2.3-71-gd317


From f12ce9f607ffa5c617cd86cb7a7a0aaefe58f127 Mon Sep 17 00:00:00 2001
From: Ilan Peer <ilan.peer@intel.com>
Date: Fri, 9 Apr 2021 12:40:15 +0300
Subject: nl80211: Add new RSNXE related nl80211 extended features

Draft P802.11ax_D2.5 defines the following capabilities that
can be negotiated using RSNXE capabilities:

- Secure LTF measurement exchange protocol.
- Secure RTT measurement exchange protocol.
- Management frame protection for all management frames exchanged
  during the negotiation and range measurement procedure.

Extend the nl80211 API to allow drivers to declare support for
these new capabilities as part of extended feature.

Signed-off-by: Ilan Peer <ilan.peer@intel.com>
Signed-off-by: Luca Coelho <luciano.coelho@intel.com>
Link: https://lore.kernel.org/r/iwlwifi.20210409123755.8280e31d8091.Ifcb29f84f432290338f80c8378aa5c9e0a390c93@changeid
Signed-off-by: Johannes Berg <johannes.berg@intel.com>
---
 include/uapi/linux/nl80211.h | 13 +++++++++++++
 1 file changed, 13 insertions(+)

(limited to 'include/uapi/linux')

diff --git a/include/uapi/linux/nl80211.h b/include/uapi/linux/nl80211.h
index 00f696d177e6..f962c06e9818 100644
--- a/include/uapi/linux/nl80211.h
+++ b/include/uapi/linux/nl80211.h
@@ -5940,6 +5940,16 @@ enum nl80211_feature_flags {
  * @NL80211_EXT_FEATURE_BEACON_RATE_HE: Driver supports beacon rate
  *	configuration (AP/mesh) with HE rates.
  *
+ * @NL80211_EXT_FEATURE_SECURE_LTF: Device supports secure LTF measurement
+ *      exchange protocol.
+ *
+ * @NL80211_EXT_FEATURE_SECURE_RTT: Device supports secure RTT measurement
+ *      exchange protocol.
+ *
+ * @NL80211_EXT_FEATURE_PROT_RANGE_NEGO_AND_MEASURE: Device supports management
+ *      frame protection for all management frames exchanged during the
+ *      negotiation and range measurement procedure.
+ *
  * @NUM_NL80211_EXT_FEATURES: number of extended features.
  * @MAX_NL80211_EXT_FEATURES: highest extended feature index.
  */
@@ -6001,6 +6011,9 @@ enum nl80211_ext_feature_index {
 	NL80211_EXT_FEATURE_FILS_DISCOVERY,
 	NL80211_EXT_FEATURE_UNSOL_BCAST_PROBE_RESP,
 	NL80211_EXT_FEATURE_BEACON_RATE_HE,
+	NL80211_EXT_FEATURE_SECURE_LTF,
+	NL80211_EXT_FEATURE_SECURE_RTT,
+	NL80211_EXT_FEATURE_PROT_RANGE_NEGO_AND_MEASURE,
 
 	/* add new features before the definition below */
 	NUM_NL80211_EXT_FEATURES,
-- 
cgit v1.2.3-71-gd317


From 55bcf6ef314ae8ba81bcd74aa760247b635ed47b Mon Sep 17 00:00:00 2001
From: Kan Liang <kan.liang@linux.intel.com>
Date: Mon, 12 Apr 2021 07:31:01 -0700
Subject: perf: Extend PERF_TYPE_HARDWARE and PERF_TYPE_HW_CACHE

Current Hardware events and Hardware cache events have special perf
types, PERF_TYPE_HARDWARE and PERF_TYPE_HW_CACHE. The two types don't
pass the PMU type in the user interface. For a hybrid system, the perf
subsystem doesn't know which PMU the events belong to. The first capable
PMU will always be assigned to the events. The events never get a chance
to run on the other capable PMUs.

Extend the two types to become PMU aware types. The PMU type ID is
stored at attr.config[63:32].

Add a new PMU capability, PERF_PMU_CAP_EXTENDED_HW_TYPE, to indicate a
PMU which supports the extended PERF_TYPE_HARDWARE and
PERF_TYPE_HW_CACHE.

The PMU type is only required when searching a specific PMU. The PMU
specific codes will only be interested in the 'real' config value, which
is stored in the low 32 bit of the event->attr.config. Update the
event->attr.config in the generic code, so the PMU specific codes don't
need to calculate it separately.

If a user specifies a PMU type, but the PMU doesn't support the extended
type, error out.

If an event cannot be initialized in a PMU specified by a user, error
out immediately. Perf should not try to open it on other PMUs.

The new PMU capability is only set for the X86 hybrid PMUs for now.
Other architectures, e.g., ARM, may need it as well. The support on ARM
may be implemented later separately.

Suggested-by: Peter Zijlstra (Intel) <peterz@infradead.org>
Signed-off-by: Kan Liang <kan.liang@linux.intel.com>
Signed-off-by: Peter Zijlstra (Intel) <peterz@infradead.org>
Link: https://lkml.kernel.org/r/1618237865-33448-22-git-send-email-kan.liang@linux.intel.com
---
 arch/x86/events/core.c          |  1 +
 include/linux/perf_event.h      | 19 ++++++++++---------
 include/uapi/linux/perf_event.h | 15 +++++++++++++++
 kernel/events/core.c            | 19 ++++++++++++++++---
 4 files changed, 42 insertions(+), 12 deletions(-)

(limited to 'include/uapi/linux')

diff --git a/arch/x86/events/core.c b/arch/x86/events/core.c
index 4f6595ee6359..3fe66b7aa721 100644
--- a/arch/x86/events/core.c
+++ b/arch/x86/events/core.c
@@ -2173,6 +2173,7 @@ static int __init init_hw_perf_events(void)
 			hybrid_pmu->pmu.type = -1;
 			hybrid_pmu->pmu.attr_update = x86_pmu.attr_update;
 			hybrid_pmu->pmu.capabilities |= PERF_PMU_CAP_HETEROGENEOUS_CPUS;
+			hybrid_pmu->pmu.capabilities |= PERF_PMU_CAP_EXTENDED_HW_TYPE;
 
 			err = perf_pmu_register(&hybrid_pmu->pmu, hybrid_pmu->name,
 						(hybrid_pmu->cpu_type == hybrid_big) ? PERF_TYPE_RAW : -1);
diff --git a/include/linux/perf_event.h b/include/linux/perf_event.h
index 61b385154c4a..a763928a0e41 100644
--- a/include/linux/perf_event.h
+++ b/include/linux/perf_event.h
@@ -260,15 +260,16 @@ struct perf_event;
 /**
  * pmu::capabilities flags
  */
-#define PERF_PMU_CAP_NO_INTERRUPT		0x01
-#define PERF_PMU_CAP_NO_NMI			0x02
-#define PERF_PMU_CAP_AUX_NO_SG			0x04
-#define PERF_PMU_CAP_EXTENDED_REGS		0x08
-#define PERF_PMU_CAP_EXCLUSIVE			0x10
-#define PERF_PMU_CAP_ITRACE			0x20
-#define PERF_PMU_CAP_HETEROGENEOUS_CPUS		0x40
-#define PERF_PMU_CAP_NO_EXCLUDE			0x80
-#define PERF_PMU_CAP_AUX_OUTPUT			0x100
+#define PERF_PMU_CAP_NO_INTERRUPT		0x0001
+#define PERF_PMU_CAP_NO_NMI			0x0002
+#define PERF_PMU_CAP_AUX_NO_SG			0x0004
+#define PERF_PMU_CAP_EXTENDED_REGS		0x0008
+#define PERF_PMU_CAP_EXCLUSIVE			0x0010
+#define PERF_PMU_CAP_ITRACE			0x0020
+#define PERF_PMU_CAP_HETEROGENEOUS_CPUS		0x0040
+#define PERF_PMU_CAP_NO_EXCLUDE			0x0080
+#define PERF_PMU_CAP_AUX_OUTPUT			0x0100
+#define PERF_PMU_CAP_EXTENDED_HW_TYPE		0x0200
 
 struct perf_output_handle;
 
diff --git a/include/uapi/linux/perf_event.h b/include/uapi/linux/perf_event.h
index 0b58970bab6b..e54e639248c8 100644
--- a/include/uapi/linux/perf_event.h
+++ b/include/uapi/linux/perf_event.h
@@ -37,6 +37,21 @@ enum perf_type_id {
 	PERF_TYPE_MAX,				/* non-ABI */
 };
 
+/*
+ * attr.config layout for type PERF_TYPE_HARDWARE and PERF_TYPE_HW_CACHE
+ * PERF_TYPE_HARDWARE:			0xEEEEEEEE000000AA
+ *					AA: hardware event ID
+ *					EEEEEEEE: PMU type ID
+ * PERF_TYPE_HW_CACHE:			0xEEEEEEEE00DDCCBB
+ *					BB: hardware cache ID
+ *					CC: hardware cache op ID
+ *					DD: hardware cache op result ID
+ *					EEEEEEEE: PMU type ID
+ * If the PMU type ID is 0, the PERF_TYPE_RAW will be applied.
+ */
+#define PERF_PMU_TYPE_SHIFT		32
+#define PERF_HW_EVENT_MASK		0xffffffff
+
 /*
  * Generalized performance event event_id types, used by the
  * attr.event_id parameter of the sys_perf_event_open()
diff --git a/kernel/events/core.c b/kernel/events/core.c
index 6f0723c711a9..928b166d888e 100644
--- a/kernel/events/core.c
+++ b/kernel/events/core.c
@@ -11220,6 +11220,7 @@ static int perf_try_init_event(struct pmu *pmu, struct perf_event *event)
 
 static struct pmu *perf_init_event(struct perf_event *event)
 {
+	bool extended_type = false;
 	int idx, type, ret;
 	struct pmu *pmu;
 
@@ -11238,16 +11239,27 @@ static struct pmu *perf_init_event(struct perf_event *event)
 	 * are often aliases for PERF_TYPE_RAW.
 	 */
 	type = event->attr.type;
-	if (type == PERF_TYPE_HARDWARE || type == PERF_TYPE_HW_CACHE)
-		type = PERF_TYPE_RAW;
+	if (type == PERF_TYPE_HARDWARE || type == PERF_TYPE_HW_CACHE) {
+		type = event->attr.config >> PERF_PMU_TYPE_SHIFT;
+		if (!type) {
+			type = PERF_TYPE_RAW;
+		} else {
+			extended_type = true;
+			event->attr.config &= PERF_HW_EVENT_MASK;
+		}
+	}
 
 again:
 	rcu_read_lock();
 	pmu = idr_find(&pmu_idr, type);
 	rcu_read_unlock();
 	if (pmu) {
+		if (event->attr.type != type && type != PERF_TYPE_RAW &&
+		    !(pmu->capabilities & PERF_PMU_CAP_EXTENDED_HW_TYPE))
+			goto fail;
+
 		ret = perf_try_init_event(pmu, event);
-		if (ret == -ENOENT && event->attr.type != type) {
+		if (ret == -ENOENT && event->attr.type != type && !extended_type) {
 			type = event->attr.type;
 			goto again;
 		}
@@ -11268,6 +11280,7 @@ again:
 			goto unlock;
 		}
 	}
+fail:
 	pmu = ERR_PTR(-ENOENT);
 unlock:
 	srcu_read_unlock(&pmus_srcu, idx);
-- 
cgit v1.2.3-71-gd317


From 7b15523a989b63927c2bb08e9b5b0bbc10b58bef Mon Sep 17 00:00:00 2001
From: Florent Revest <revest@chromium.org>
Date: Mon, 19 Apr 2021 17:52:40 +0200
Subject: bpf: Add a bpf_snprintf helper

The implementation takes inspiration from the existing bpf_trace_printk
helper but there are a few differences:

To allow for a large number of format-specifiers, parameters are
provided in an array, like in bpf_seq_printf.

Because the output string takes two arguments and the array of
parameters also takes two arguments, the format string needs to fit in
one argument. Thankfully, ARG_PTR_TO_CONST_STR is guaranteed to point to
a zero-terminated read-only map so we don't need a format string length
arg.

Because the format-string is known at verification time, we also do
a first pass of format string validation in the verifier logic. This
makes debugging easier.

Signed-off-by: Florent Revest <revest@chromium.org>
Signed-off-by: Alexei Starovoitov <ast@kernel.org>
Acked-by: Andrii Nakryiko <andrii@kernel.org>
Link: https://lore.kernel.org/bpf/20210419155243.1632274-4-revest@chromium.org
---
 include/linux/bpf.h            |  1 +
 include/uapi/linux/bpf.h       | 28 +++++++++++++++++++++++
 kernel/bpf/helpers.c           | 50 ++++++++++++++++++++++++++++++++++++++++++
 kernel/bpf/verifier.c          | 41 ++++++++++++++++++++++++++++++++++
 kernel/trace/bpf_trace.c       |  2 ++
 tools/include/uapi/linux/bpf.h | 28 +++++++++++++++++++++++
 6 files changed, 150 insertions(+)

(limited to 'include/uapi/linux')

diff --git a/include/linux/bpf.h b/include/linux/bpf.h
index c160526fc8bf..f8a45f109e96 100644
--- a/include/linux/bpf.h
+++ b/include/linux/bpf.h
@@ -1953,6 +1953,7 @@ extern const struct bpf_func_proto bpf_skc_to_tcp_request_sock_proto;
 extern const struct bpf_func_proto bpf_skc_to_udp6_sock_proto;
 extern const struct bpf_func_proto bpf_copy_from_user_proto;
 extern const struct bpf_func_proto bpf_snprintf_btf_proto;
+extern const struct bpf_func_proto bpf_snprintf_proto;
 extern const struct bpf_func_proto bpf_per_cpu_ptr_proto;
 extern const struct bpf_func_proto bpf_this_cpu_ptr_proto;
 extern const struct bpf_func_proto bpf_ktime_get_coarse_ns_proto;
diff --git a/include/uapi/linux/bpf.h b/include/uapi/linux/bpf.h
index df164a44bb41..ec6d85a81744 100644
--- a/include/uapi/linux/bpf.h
+++ b/include/uapi/linux/bpf.h
@@ -4708,6 +4708,33 @@ union bpf_attr {
  *	Return
  *		The number of traversed map elements for success, **-EINVAL** for
  *		invalid **flags**.
+ *
+ * long bpf_snprintf(char *str, u32 str_size, const char *fmt, u64 *data, u32 data_len)
+ *	Description
+ *		Outputs a string into the **str** buffer of size **str_size**
+ *		based on a format string stored in a read-only map pointed by
+ *		**fmt**.
+ *
+ *		Each format specifier in **fmt** corresponds to one u64 element
+ *		in the **data** array. For strings and pointers where pointees
+ *		are accessed, only the pointer values are stored in the *data*
+ *		array. The *data_len* is the size of *data* in bytes.
+ *
+ *		Formats **%s** and **%p{i,I}{4,6}** require to read kernel
+ *		memory. Reading kernel memory may fail due to either invalid
+ *		address or valid address but requiring a major memory fault. If
+ *		reading kernel memory fails, the string for **%s** will be an
+ *		empty string, and the ip address for **%p{i,I}{4,6}** will be 0.
+ *		Not returning error to bpf program is consistent with what
+ *		**bpf_trace_printk**\ () does for now.
+ *
+ *	Return
+ *		The strictly positive length of the formatted string, including
+ *		the trailing zero character. If the return value is greater than
+ *		**str_size**, **str** contains a truncated string, guaranteed to
+ *		be zero-terminated except when **str_size** is 0.
+ *
+ *		Or **-EBUSY** if the per-CPU memory copy buffer is busy.
  */
 #define __BPF_FUNC_MAPPER(FN)		\
 	FN(unspec),			\
@@ -4875,6 +4902,7 @@ union bpf_attr {
 	FN(sock_from_file),		\
 	FN(check_mtu),			\
 	FN(for_each_map_elem),		\
+	FN(snprintf),			\
 	/* */
 
 /* integer value in 'imm' field of BPF_CALL instruction selects which helper
diff --git a/kernel/bpf/helpers.c b/kernel/bpf/helpers.c
index 9ca57eb1fc0d..85b26ca5aacd 100644
--- a/kernel/bpf/helpers.c
+++ b/kernel/bpf/helpers.c
@@ -925,6 +925,54 @@ out:
 	return err;
 }
 
+#define MAX_SNPRINTF_VARARGS		12
+
+BPF_CALL_5(bpf_snprintf, char *, str, u32, str_size, char *, fmt,
+	   const void *, data, u32, data_len)
+{
+	enum bpf_printf_mod_type mod[MAX_SNPRINTF_VARARGS];
+	u64 args[MAX_SNPRINTF_VARARGS];
+	int err, num_args;
+
+	if (data_len % 8 || data_len > MAX_SNPRINTF_VARARGS * 8 ||
+	    (data_len && !data))
+		return -EINVAL;
+	num_args = data_len / 8;
+
+	/* ARG_PTR_TO_CONST_STR guarantees that fmt is zero-terminated so we
+	 * can safely give an unbounded size.
+	 */
+	err = bpf_printf_prepare(fmt, UINT_MAX, data, args, mod, num_args);
+	if (err < 0)
+		return err;
+
+	/* Maximumly we can have MAX_SNPRINTF_VARARGS parameters, just give
+	 * all of them to snprintf().
+	 */
+	err = snprintf(str, str_size, fmt, BPF_CAST_FMT_ARG(0, args, mod),
+		BPF_CAST_FMT_ARG(1, args, mod), BPF_CAST_FMT_ARG(2, args, mod),
+		BPF_CAST_FMT_ARG(3, args, mod), BPF_CAST_FMT_ARG(4, args, mod),
+		BPF_CAST_FMT_ARG(5, args, mod), BPF_CAST_FMT_ARG(6, args, mod),
+		BPF_CAST_FMT_ARG(7, args, mod), BPF_CAST_FMT_ARG(8, args, mod),
+		BPF_CAST_FMT_ARG(9, args, mod), BPF_CAST_FMT_ARG(10, args, mod),
+		BPF_CAST_FMT_ARG(11, args, mod));
+
+	bpf_printf_cleanup();
+
+	return err + 1;
+}
+
+const struct bpf_func_proto bpf_snprintf_proto = {
+	.func		= bpf_snprintf,
+	.gpl_only	= true,
+	.ret_type	= RET_INTEGER,
+	.arg1_type	= ARG_PTR_TO_MEM_OR_NULL,
+	.arg2_type	= ARG_CONST_SIZE_OR_ZERO,
+	.arg3_type	= ARG_PTR_TO_CONST_STR,
+	.arg4_type	= ARG_PTR_TO_MEM_OR_NULL,
+	.arg5_type	= ARG_CONST_SIZE_OR_ZERO,
+};
+
 const struct bpf_func_proto bpf_get_current_task_proto __weak;
 const struct bpf_func_proto bpf_probe_read_user_proto __weak;
 const struct bpf_func_proto bpf_probe_read_user_str_proto __weak;
@@ -1013,6 +1061,8 @@ bpf_base_func_proto(enum bpf_func_id func_id)
 		return &bpf_probe_read_kernel_str_proto;
 	case BPF_FUNC_snprintf_btf:
 		return &bpf_snprintf_btf_proto;
+	case BPF_FUNC_snprintf:
+		return &bpf_snprintf_proto;
 	default:
 		return NULL;
 	}
diff --git a/kernel/bpf/verifier.c b/kernel/bpf/verifier.c
index 5f46dd6f3383..994ef36c5f60 100644
--- a/kernel/bpf/verifier.c
+++ b/kernel/bpf/verifier.c
@@ -5918,6 +5918,41 @@ static int check_reference_leak(struct bpf_verifier_env *env)
 	return state->acquired_refs ? -EINVAL : 0;
 }
 
+static int check_bpf_snprintf_call(struct bpf_verifier_env *env,
+				   struct bpf_reg_state *regs)
+{
+	struct bpf_reg_state *fmt_reg = &regs[BPF_REG_3];
+	struct bpf_reg_state *data_len_reg = &regs[BPF_REG_5];
+	struct bpf_map *fmt_map = fmt_reg->map_ptr;
+	int err, fmt_map_off, num_args;
+	u64 fmt_addr;
+	char *fmt;
+
+	/* data must be an array of u64 */
+	if (data_len_reg->var_off.value % 8)
+		return -EINVAL;
+	num_args = data_len_reg->var_off.value / 8;
+
+	/* fmt being ARG_PTR_TO_CONST_STR guarantees that var_off is const
+	 * and map_direct_value_addr is set.
+	 */
+	fmt_map_off = fmt_reg->off + fmt_reg->var_off.value;
+	err = fmt_map->ops->map_direct_value_addr(fmt_map, &fmt_addr,
+						  fmt_map_off);
+	if (err)
+		return err;
+	fmt = (char *)(long)fmt_addr + fmt_map_off;
+
+	/* We are also guaranteed that fmt+fmt_map_off is NULL terminated, we
+	 * can focus on validating the format specifiers.
+	 */
+	err = bpf_printf_prepare(fmt, UINT_MAX, NULL, NULL, NULL, num_args);
+	if (err < 0)
+		verbose(env, "Invalid format string\n");
+
+	return err;
+}
+
 static int check_helper_call(struct bpf_verifier_env *env, struct bpf_insn *insn,
 			     int *insn_idx_p)
 {
@@ -6032,6 +6067,12 @@ static int check_helper_call(struct bpf_verifier_env *env, struct bpf_insn *insn
 			return -EINVAL;
 	}
 
+	if (func_id == BPF_FUNC_snprintf) {
+		err = check_bpf_snprintf_call(env, regs);
+		if (err < 0)
+			return err;
+	}
+
 	/* reset caller saved regs */
 	for (i = 0; i < CALLER_SAVED_REGS; i++) {
 		mark_reg_not_init(env, regs, caller_saved[i]);
diff --git a/kernel/trace/bpf_trace.c b/kernel/trace/bpf_trace.c
index a13f8644b357..2a8bcdc927c7 100644
--- a/kernel/trace/bpf_trace.c
+++ b/kernel/trace/bpf_trace.c
@@ -1076,6 +1076,8 @@ bpf_tracing_func_proto(enum bpf_func_id func_id, const struct bpf_prog *prog)
 		return &bpf_task_storage_delete_proto;
 	case BPF_FUNC_for_each_map_elem:
 		return &bpf_for_each_map_elem_proto;
+	case BPF_FUNC_snprintf:
+		return &bpf_snprintf_proto;
 	default:
 		return NULL;
 	}
diff --git a/tools/include/uapi/linux/bpf.h b/tools/include/uapi/linux/bpf.h
index df164a44bb41..ec6d85a81744 100644
--- a/tools/include/uapi/linux/bpf.h
+++ b/tools/include/uapi/linux/bpf.h
@@ -4708,6 +4708,33 @@ union bpf_attr {
  *	Return
  *		The number of traversed map elements for success, **-EINVAL** for
  *		invalid **flags**.
+ *
+ * long bpf_snprintf(char *str, u32 str_size, const char *fmt, u64 *data, u32 data_len)
+ *	Description
+ *		Outputs a string into the **str** buffer of size **str_size**
+ *		based on a format string stored in a read-only map pointed by
+ *		**fmt**.
+ *
+ *		Each format specifier in **fmt** corresponds to one u64 element
+ *		in the **data** array. For strings and pointers where pointees
+ *		are accessed, only the pointer values are stored in the *data*
+ *		array. The *data_len* is the size of *data* in bytes.
+ *
+ *		Formats **%s** and **%p{i,I}{4,6}** require to read kernel
+ *		memory. Reading kernel memory may fail due to either invalid
+ *		address or valid address but requiring a major memory fault. If
+ *		reading kernel memory fails, the string for **%s** will be an
+ *		empty string, and the ip address for **%p{i,I}{4,6}** will be 0.
+ *		Not returning error to bpf program is consistent with what
+ *		**bpf_trace_printk**\ () does for now.
+ *
+ *	Return
+ *		The strictly positive length of the formatted string, including
+ *		the trailing zero character. If the return value is greater than
+ *		**str_size**, **str** contains a truncated string, guaranteed to
+ *		be zero-terminated except when **str_size** is 0.
+ *
+ *		Or **-EBUSY** if the per-CPU memory copy buffer is busy.
  */
 #define __BPF_FUNC_MAPPER(FN)		\
 	FN(unspec),			\
@@ -4875,6 +4902,7 @@ union bpf_attr {
 	FN(sock_from_file),		\
 	FN(check_mtu),			\
 	FN(for_each_map_elem),		\
+	FN(snprintf),			\
 	/* */
 
 /* integer value in 'imm' field of BPF_CALL instruction selects which helper
-- 
cgit v1.2.3-71-gd317


From fe7e948837f312d87853b3fce743795d1ae3715a Mon Sep 17 00:00:00 2001
From: Sean Christopherson <sean.j.christopherson@intel.com>
Date: Mon, 12 Apr 2021 16:21:43 +1200
Subject: KVM: x86: Add capability to grant VM access to privileged SGX
 attribute

Add a capability, KVM_CAP_SGX_ATTRIBUTE, that can be used by userspace
to grant a VM access to a priveleged attribute, with args[0] holding a
file handle to a valid SGX attribute file.

The SGX subsystem restricts access to a subset of enclave attributes to
provide additional security for an uncompromised kernel, e.g. to prevent
malware from using the PROVISIONKEY to ensure its nodes are running
inside a geniune SGX enclave and/or to obtain a stable fingerprint.

To prevent userspace from circumventing such restrictions by running an
enclave in a VM, KVM restricts guest access to privileged attributes by
default.

Cc: Andy Lutomirski <luto@amacapital.net>
Signed-off-by: Sean Christopherson <sean.j.christopherson@intel.com>
Signed-off-by: Kai Huang <kai.huang@intel.com>
Message-Id: <0b099d65e933e068e3ea934b0523bab070cb8cea.1618196135.git.kai.huang@intel.com>
Signed-off-by: Paolo Bonzini <pbonzini@redhat.com>
---
 Documentation/virt/kvm/api.rst | 23 +++++++++++++++++++++++
 arch/x86/kvm/cpuid.c           |  2 +-
 arch/x86/kvm/x86.c             | 21 +++++++++++++++++++++
 include/uapi/linux/kvm.h       |  1 +
 4 files changed, 46 insertions(+), 1 deletion(-)

(limited to 'include/uapi/linux')

diff --git a/Documentation/virt/kvm/api.rst b/Documentation/virt/kvm/api.rst
index 1b94cced0105..414b7fe1cf7b 100644
--- a/Documentation/virt/kvm/api.rst
+++ b/Documentation/virt/kvm/api.rst
@@ -6207,6 +6207,29 @@ KVM_RUN_BUS_LOCK flag is used to distinguish between them.
 This capability can be used to check / enable 2nd DAWR feature provided
 by POWER10 processor.
 
+7.25 KVM_CAP_SGX_ATTRIBUTE
+----------------------
+
+:Architectures: x86
+:Target: VM
+:Parameters: args[0] is a file handle of a SGX attribute file in securityfs
+:Returns: 0 on success, -EINVAL if the file handle is invalid or if a requested
+          attribute is not supported by KVM.
+
+KVM_CAP_SGX_ATTRIBUTE enables a userspace VMM to grant a VM access to one or
+more priveleged enclave attributes.  args[0] must hold a file handle to a valid
+SGX attribute file corresponding to an attribute that is supported/restricted
+by KVM (currently only PROVISIONKEY).
+
+The SGX subsystem restricts access to a subset of enclave attributes to provide
+additional security for an uncompromised kernel, e.g. use of the PROVISIONKEY
+is restricted to deter malware from using the PROVISIONKEY to obtain a stable
+system fingerprint.  To prevent userspace from circumventing such restrictions
+by running an enclave in a VM, KVM prevents access to privileged attributes by
+default.
+
+See Documentation/x86/sgx/2.Kernel-internals.rst for more details.
+
 8. Other capabilities.
 ======================
 
diff --git a/arch/x86/kvm/cpuid.c b/arch/x86/kvm/cpuid.c
index 29365c11bf27..2ae061586677 100644
--- a/arch/x86/kvm/cpuid.c
+++ b/arch/x86/kvm/cpuid.c
@@ -849,7 +849,7 @@ static inline int __do_cpuid_func(struct kvm_cpuid_array *array, u32 function)
 		 * expected to derive it from supported XCR0.
 		 */
 		entry->eax &= SGX_ATTR_DEBUG | SGX_ATTR_MODE64BIT |
-			      /* PROVISIONKEY | */ SGX_ATTR_EINITTOKENKEY |
+			      SGX_ATTR_PROVISIONKEY | SGX_ATTR_EINITTOKENKEY |
 			      SGX_ATTR_KSS;
 		entry->ebx &= 0;
 		break;
diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c
index d4063bec8ce3..d4cdab15238b 100644
--- a/arch/x86/kvm/x86.c
+++ b/arch/x86/kvm/x86.c
@@ -75,6 +75,7 @@
 #include <asm/tlbflush.h>
 #include <asm/intel_pt.h>
 #include <asm/emulate_prefix.h>
+#include <asm/sgx.h>
 #include <clocksource/hyperv_timer.h>
 
 #define CREATE_TRACE_POINTS
@@ -3804,6 +3805,9 @@ int kvm_vm_ioctl_check_extension(struct kvm *kvm, long ext)
 	case KVM_CAP_X86_USER_SPACE_MSR:
 	case KVM_CAP_X86_MSR_FILTER:
 	case KVM_CAP_ENFORCE_PV_FEATURE_CPUID:
+#ifdef CONFIG_X86_SGX_KVM
+	case KVM_CAP_SGX_ATTRIBUTE:
+#endif
 		r = 1;
 		break;
 	case KVM_CAP_SET_GUEST_DEBUG2:
@@ -5392,6 +5396,23 @@ split_irqchip_unlock:
 			kvm->arch.bus_lock_detection_enabled = true;
 		r = 0;
 		break;
+#ifdef CONFIG_X86_SGX_KVM
+	case KVM_CAP_SGX_ATTRIBUTE: {
+		unsigned long allowed_attributes = 0;
+
+		r = sgx_set_attribute(&allowed_attributes, cap->args[0]);
+		if (r)
+			break;
+
+		/* KVM only supports the PROVISIONKEY privileged attribute. */
+		if ((allowed_attributes & SGX_ATTR_PROVISIONKEY) &&
+		    !(allowed_attributes & ~SGX_ATTR_PROVISIONKEY))
+			kvm->arch.sgx_provisioning_allowed = true;
+		else
+			r = -EINVAL;
+		break;
+	}
+#endif
 	default:
 		r = -EINVAL;
 		break;
diff --git a/include/uapi/linux/kvm.h b/include/uapi/linux/kvm.h
index 727010788eff..162b7e044c8b 100644
--- a/include/uapi/linux/kvm.h
+++ b/include/uapi/linux/kvm.h
@@ -1079,6 +1079,7 @@ struct kvm_ppc_resize_hpt {
 #define KVM_CAP_X86_BUS_LOCK_EXIT 193
 #define KVM_CAP_PPC_DAWR1 194
 #define KVM_CAP_SET_GUEST_DEBUG2 195
+#define KVM_CAP_SGX_ATTRIBUTE 196
 
 #ifdef KVM_CAP_IRQ_ROUTING
 
-- 
cgit v1.2.3-71-gd317


From ab50200ab04d105017b1bed8787f44b8725cb39a Mon Sep 17 00:00:00 2001
From: Denis Efremov <efremov@linux.com>
Date: Fri, 16 Apr 2021 11:34:45 +0300
Subject: floppy: cleanups: remove trailing whitespaces

Cleanup trailing whitespaces as checkpatch.pl suggests.

Signed-off-by: Denis Efremov <efremov@linux.com>
Link: https://lore.kernel.org/r/20210416083449.72700-2-efremov@linux.com
Signed-off-by: Jens Axboe <axboe@kernel.dk>
---
 include/uapi/linux/fd.h | 46 +++++++++++++++++++++++-----------------------
 1 file changed, 23 insertions(+), 23 deletions(-)

(limited to 'include/uapi/linux')

diff --git a/include/uapi/linux/fd.h b/include/uapi/linux/fd.h
index 8b80c63b971c..7022e3413dbc 100644
--- a/include/uapi/linux/fd.h
+++ b/include/uapi/linux/fd.h
@@ -49,11 +49,11 @@ struct floppy_struct {
 #define FDCLRPRM _IO(2, 0x41)
 /* clear user-defined parameters */
 
-#define FDSETPRM _IOW(2, 0x42, struct floppy_struct) 
+#define FDSETPRM _IOW(2, 0x42, struct floppy_struct)
 #define FDSETMEDIAPRM FDSETPRM
 /* set user-defined parameters for current media */
 
-#define FDDEFPRM _IOW(2, 0x43, struct floppy_struct) 
+#define FDDEFPRM _IOW(2, 0x43, struct floppy_struct)
 #define FDGETPRM _IOR(2, 0x04, struct floppy_struct)
 #define FDDEFMEDIAPRM FDDEFPRM
 #define FDGETMEDIAPRM FDGETPRM
@@ -65,7 +65,7 @@ struct floppy_struct {
 /* issue/don't issue kernel messages on media type change */
 
 
-/* 
+/*
  * Formatting (obsolete)
  */
 #define FD_FILL_BYTE 0xF6 /* format fill byte. */
@@ -126,13 +126,13 @@ typedef char floppy_drive_name[16];
  */
 struct floppy_drive_params {
 	signed char cmos;		/* CMOS type */
-	
-	/* Spec2 is (HLD<<1 | ND), where HLD is head load time (1=2ms, 2=4 ms 
+
+	/* Spec2 is (HLD<<1 | ND), where HLD is head load time (1=2ms, 2=4 ms
 	 * etc) and ND is set means no DMA. Hardcoded to 6 (HLD=6ms, use DMA).
 	 */
 	unsigned long max_dtr;		/* Step rate, usec */
 	unsigned long hlt;     		/* Head load/settle time, msec */
-	unsigned long hut;     		/* Head unload time (remnant of 
+	unsigned long hut;     		/* Head unload time (remnant of
 					 * 8" drives) */
 	unsigned long srt;     		/* Step rate, usec */
 
@@ -145,12 +145,12 @@ struct floppy_drive_params {
 	unsigned char rps;		/* rotations per second */
 	unsigned char tracks;		/* maximum number of tracks */
 	unsigned long timeout;		/* timeout for interrupt requests */
-	
-	unsigned char interleave_sect;	/* if there are more sectors, use 
+
+	unsigned char interleave_sect;	/* if there are more sectors, use
 					 * interleave */
-	
+
 	struct floppy_max_errors max_errors;
-	
+
 	char flags;			/* various flags, including ftd_msg */
 /*
  * Announce successful media type detection and media information loss after
@@ -162,7 +162,7 @@ struct floppy_drive_params {
 #define FD_BROKEN_DCL 0x20
 #define FD_DEBUG 0x02
 #define FD_SILENT_DCL_CLEAR 0x4
-#define FD_INVERTED_DCL 0x80 /* must be 0x80, because of hardware 
+#define FD_INVERTED_DCL 0x80 /* must be 0x80, because of hardware
 				considerations */
 
 	char read_track;		/* use readtrack during probing? */
@@ -176,8 +176,8 @@ struct floppy_drive_params {
 #define FD_AUTODETECT_SIZE 8
 
 	short autodetect[FD_AUTODETECT_SIZE]; /* autodetected formats */
-	
-	int checkfreq; /* how often should the drive be checked for disk 
+
+	int checkfreq; /* how often should the drive be checked for disk
 			* changes */
 	int native_format; /* native format of this drive */
 };
@@ -225,13 +225,13 @@ struct floppy_drive_struct {
  * decremented after each probe.
  */
 	int keep_data;
-	
+
 	/* Prevent "aliased" accesses. */
 	int fd_ref;
 	int fd_device;
-	unsigned long last_checked; /* when was the drive last checked for a disk 
+	unsigned long last_checked; /* when was the drive last checked for a disk
 			   * change? */
-	
+
 	char *dmabuf;
 	int bufblocks;
 };
@@ -255,7 +255,7 @@ enum reset_mode {
 /*
  * FDC state
  */
-struct floppy_fdc_state {	
+struct floppy_fdc_state {
 	int spec1;		/* spec1 value last used */
 	int spec2;		/* spec2 value last used */
 	int dtr;
@@ -302,16 +302,16 @@ struct floppy_write_errors {
 	 * to the user process are not counted.
 	 */
 
-	unsigned int write_errors;  /* number of physical write errors 
+	unsigned int write_errors;  /* number of physical write errors
 				     * encountered */
-	
+
 	/* position of first and last write errors */
 	unsigned long first_error_sector;
 	int           first_error_generation;
 	unsigned long last_error_sector;
 	int           last_error_generation;
-	
-	unsigned int badness; /* highest retry count for a read or write 
+
+	unsigned int badness; /* highest retry count for a read or write
 			       * operation */
 };
 
@@ -335,7 +335,7 @@ struct floppy_raw_cmd {
 #define FD_RAW_DISK_CHANGE 4 /* out: disk change flag was set */
 #define FD_RAW_INTR 8    /* wait for an interrupt */
 #define FD_RAW_SPIN 0x10 /* spin up the disk for this command */
-#define FD_RAW_NO_MOTOR_AFTER 0x20 /* switch the motor off after command 
+#define FD_RAW_NO_MOTOR_AFTER 0x20 /* switch the motor off after command
 				    * completion */
 #define FD_RAW_NEED_DISK 0x40  /* this command needs a disk to be present */
 #define FD_RAW_NEED_SEEK 0x80  /* this command uses an implied seek (soft) */
@@ -353,7 +353,7 @@ struct floppy_raw_cmd {
 
 	void __user *data;
 	char *kernel_data; /* location of data buffer in the kernel */
-	struct floppy_raw_cmd *next; /* used for chaining of raw cmd's 
+	struct floppy_raw_cmd *next; /* used for chaining of raw cmd's
 				      * within the kernel */
 	long length; /* in: length of dma transfer. out: remaining bytes */
 	long phys_length; /* physical length, if different from dma length */
-- 
cgit v1.2.3-71-gd317


From db2e718a47984b9d71ed890eb2ea36ecf150de18 Mon Sep 17 00:00:00 2001
From: "Serge E. Hallyn" <serge@hallyn.com>
Date: Tue, 20 Apr 2021 08:43:34 -0500
Subject: capabilities: require CAP_SETFCAP to map uid 0

cap_setfcap is required to create file capabilities.

Since commit 8db6c34f1dbc ("Introduce v3 namespaced file capabilities"),
a process running as uid 0 but without cap_setfcap is able to work
around this as follows: unshare a new user namespace which maps parent
uid 0 into the child namespace.

While this task will not have new capabilities against the parent
namespace, there is a loophole due to the way namespaced file
capabilities are represented as xattrs.  File capabilities valid in
userns 1 are distinguished from file capabilities valid in userns 2 by
the kuid which underlies uid 0.  Therefore the restricted root process
can unshare a new self-mapping namespace, add a namespaced file
capability onto a file, then use that file capability in the parent
namespace.

To prevent that, do not allow mapping parent uid 0 if the process which
opened the uid_map file does not have CAP_SETFCAP, which is the
capability for setting file capabilities.

As a further wrinkle: a task can unshare its user namespace, then open
its uid_map file itself, and map (only) its own uid.  In this case we do
not have the credential from before unshare, which was potentially more
restricted.  So, when creating a user namespace, we record whether the
creator had CAP_SETFCAP.  Then we can use that during map_write().

With this patch:

1. Unprivileged user can still unshare -Ur

   ubuntu@caps:~$ unshare -Ur
   root@caps:~# logout

2. Root user can still unshare -Ur

   ubuntu@caps:~$ sudo bash
   root@caps:/home/ubuntu# unshare -Ur
   root@caps:/home/ubuntu# logout

3. Root user without CAP_SETFCAP cannot unshare -Ur:

   root@caps:/home/ubuntu# /sbin/capsh --drop=cap_setfcap --
   root@caps:/home/ubuntu# /sbin/setcap cap_setfcap=p /sbin/setcap
   unable to set CAP_SETFCAP effective capability: Operation not permitted
   root@caps:/home/ubuntu# unshare -Ur
   unshare: write failed /proc/self/uid_map: Operation not permitted

Note: an alternative solution would be to allow uid 0 mappings by
processes without CAP_SETFCAP, but to prevent such a namespace from
writing any file capabilities.  This approach can be seen at [1].

Background history: commit 95ebabde382 ("capabilities: Don't allow
writing ambiguous v3 file capabilities") tried to fix the issue by
preventing v3 fscaps to be written to disk when the root uid would map
to the same uid in nested user namespaces.  This led to regressions for
various workloads.  For example, see [2].  Ultimately this is a valid
use-case we have to support meaning we had to revert this change in
3b0c2d3eaa83 ("Revert 95ebabde382c ("capabilities: Don't allow writing
ambiguous v3 file capabilities")").

Link: https://git.kernel.org/pub/scm/linux/kernel/git/sergeh/linux.git/log/?h=2021-04-15/setfcap-nsfscaps-v4 [1]
Link: https://github.com/containers/buildah/issues/3071 [2]
Signed-off-by: Serge Hallyn <serge@hallyn.com>
Reviewed-by: Andrew G. Morgan <morgan@kernel.org>
Tested-by: Christian Brauner <christian.brauner@ubuntu.com>
Reviewed-by: Christian Brauner <christian.brauner@ubuntu.com>
Tested-by: Giuseppe Scrivano <gscrivan@redhat.com>
Cc: Eric Biederman <ebiederm@xmission.com>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 include/linux/user_namespace.h  |  3 ++
 include/uapi/linux/capability.h |  3 +-
 kernel/user_namespace.c         | 65 +++++++++++++++++++++++++++++++++++++++--
 3 files changed, 67 insertions(+), 4 deletions(-)

(limited to 'include/uapi/linux')

diff --git a/include/linux/user_namespace.h b/include/linux/user_namespace.h
index 64cf8ebdc4ec..f6c5f784be5a 100644
--- a/include/linux/user_namespace.h
+++ b/include/linux/user_namespace.h
@@ -63,6 +63,9 @@ struct user_namespace {
 	kgid_t			group;
 	struct ns_common	ns;
 	unsigned long		flags;
+	/* parent_could_setfcap: true if the creator if this ns had CAP_SETFCAP
+	 * in its effective capability set at the child ns creation time. */
+	bool			parent_could_setfcap;
 
 #ifdef CONFIG_KEYS
 	/* List of joinable keyrings in this namespace.  Modification access of
diff --git a/include/uapi/linux/capability.h b/include/uapi/linux/capability.h
index c6ca33034147..2ddb4226cd23 100644
--- a/include/uapi/linux/capability.h
+++ b/include/uapi/linux/capability.h
@@ -335,7 +335,8 @@ struct vfs_ns_cap_data {
 
 #define CAP_AUDIT_CONTROL    30
 
-/* Set or remove capabilities on files */
+/* Set or remove capabilities on files.
+   Map uid=0 into a child user namespace. */
 
 #define CAP_SETFCAP	     31
 
diff --git a/kernel/user_namespace.c b/kernel/user_namespace.c
index af612945a4d0..9a4b980d695b 100644
--- a/kernel/user_namespace.c
+++ b/kernel/user_namespace.c
@@ -106,6 +106,7 @@ int create_user_ns(struct cred *new)
 	if (!ns)
 		goto fail_dec;
 
+	ns->parent_could_setfcap = cap_raised(new->cap_effective, CAP_SETFCAP);
 	ret = ns_alloc_inum(&ns->ns);
 	if (ret)
 		goto fail_free;
@@ -841,6 +842,60 @@ static int sort_idmaps(struct uid_gid_map *map)
 	return 0;
 }
 
+/**
+ * verify_root_map() - check the uid 0 mapping
+ * @file: idmapping file
+ * @map_ns: user namespace of the target process
+ * @new_map: requested idmap
+ *
+ * If a process requests mapping parent uid 0 into the new ns, verify that the
+ * process writing the map had the CAP_SETFCAP capability as the target process
+ * will be able to write fscaps that are valid in ancestor user namespaces.
+ *
+ * Return: true if the mapping is allowed, false if not.
+ */
+static bool verify_root_map(const struct file *file,
+			    struct user_namespace *map_ns,
+			    struct uid_gid_map *new_map)
+{
+	int idx;
+	const struct user_namespace *file_ns = file->f_cred->user_ns;
+	struct uid_gid_extent *extent0 = NULL;
+
+	for (idx = 0; idx < new_map->nr_extents; idx++) {
+		if (new_map->nr_extents <= UID_GID_MAP_MAX_BASE_EXTENTS)
+			extent0 = &new_map->extent[idx];
+		else
+			extent0 = &new_map->forward[idx];
+		if (extent0->lower_first == 0)
+			break;
+
+		extent0 = NULL;
+	}
+
+	if (!extent0)
+		return true;
+
+	if (map_ns == file_ns) {
+		/* The process unshared its ns and is writing to its own
+		 * /proc/self/uid_map.  User already has full capabilites in
+		 * the new namespace.  Verify that the parent had CAP_SETFCAP
+		 * when it unshared.
+		 * */
+		if (!file_ns->parent_could_setfcap)
+			return false;
+	} else {
+		/* Process p1 is writing to uid_map of p2, who is in a child
+		 * user namespace to p1's.  Verify that the opener of the map
+		 * file has CAP_SETFCAP against the parent of the new map
+		 * namespace */
+		if (!file_ns_capable(file, map_ns->parent, CAP_SETFCAP))
+			return false;
+	}
+
+	return true;
+}
+
 static ssize_t map_write(struct file *file, const char __user *buf,
 			 size_t count, loff_t *ppos,
 			 int cap_setid,
@@ -848,7 +903,7 @@ static ssize_t map_write(struct file *file, const char __user *buf,
 			 struct uid_gid_map *parent_map)
 {
 	struct seq_file *seq = file->private_data;
-	struct user_namespace *ns = seq->private;
+	struct user_namespace *map_ns = seq->private;
 	struct uid_gid_map new_map;
 	unsigned idx;
 	struct uid_gid_extent extent;
@@ -895,7 +950,7 @@ static ssize_t map_write(struct file *file, const char __user *buf,
 	/*
 	 * Adjusting namespace settings requires capabilities on the target.
 	 */
-	if (cap_valid(cap_setid) && !file_ns_capable(file, ns, CAP_SYS_ADMIN))
+	if (cap_valid(cap_setid) && !file_ns_capable(file, map_ns, CAP_SYS_ADMIN))
 		goto out;
 
 	/* Parse the user data */
@@ -965,7 +1020,7 @@ static ssize_t map_write(struct file *file, const char __user *buf,
 
 	ret = -EPERM;
 	/* Validate the user is allowed to use user id's mapped to. */
-	if (!new_idmap_permitted(file, ns, cap_setid, &new_map))
+	if (!new_idmap_permitted(file, map_ns, cap_setid, &new_map))
 		goto out;
 
 	ret = -EPERM;
@@ -1086,6 +1141,10 @@ static bool new_idmap_permitted(const struct file *file,
 				struct uid_gid_map *new_map)
 {
 	const struct cred *cred = file->f_cred;
+
+	if (cap_setid == CAP_SETUID && !verify_root_map(file, ns, new_map))
+		return false;
+
 	/* Don't allow mappings that would allow anything that wouldn't
 	 * be allowed without the establishment of unprivileged mappings.
 	 */
-- 
cgit v1.2.3-71-gd317


From 54526d1fd59338fd6a381dbd806b7ccbae3aa4aa Mon Sep 17 00:00:00 2001
From: Nathan Tempelman <natet@google.com>
Date: Thu, 8 Apr 2021 22:32:14 +0000
Subject: KVM: x86: Support KVM VMs sharing SEV context

Add a capability for userspace to mirror SEV encryption context from
one vm to another. On our side, this is intended to support a
Migration Helper vCPU, but it can also be used generically to support
other in-guest workloads scheduled by the host. The intention is for
the primary guest and the mirror to have nearly identical memslots.

The primary benefits of this are that:
1) The VMs do not share KVM contexts (think APIC/MSRs/etc), so they
can't accidentally clobber each other.
2) The VMs can have different memory-views, which is necessary for post-copy
migration (the migration vCPUs on the target need to read and write to
pages, when the primary guest would VMEXIT).

This does not change the threat model for AMD SEV. Any memory involved
is still owned by the primary guest and its initial state is still
attested to through the normal SEV_LAUNCH_* flows. If userspace wanted
to circumvent SEV, they could achieve the same effect by simply attaching
a vCPU to the primary VM.
This patch deliberately leaves userspace in charge of the memslots for the
mirror, as it already has the power to mess with them in the primary guest.

This patch does not support SEV-ES (much less SNP), as it does not
handle handing off attested VMSAs to the mirror.

For additional context, we need a Migration Helper because SEV PSP
migration is far too slow for our live migration on its own. Using
an in-guest migrator lets us speed this up significantly.

Signed-off-by: Nathan Tempelman <natet@google.com>
Message-Id: <20210408223214.2582277-1-natet@google.com>
Signed-off-by: Paolo Bonzini <pbonzini@redhat.com>
---
 Documentation/virt/kvm/api.rst  | 18 ++++++++-
 arch/x86/include/asm/kvm_host.h |  1 +
 arch/x86/kvm/svm/sev.c          | 90 +++++++++++++++++++++++++++++++++++++++++
 arch/x86/kvm/svm/svm.c          |  2 +
 arch/x86/kvm/svm/svm.h          |  2 +
 arch/x86/kvm/x86.c              |  7 +++-
 include/linux/kvm_host.h        |  1 +
 include/uapi/linux/kvm.h        |  1 +
 virt/kvm/kvm_main.c             |  6 +++
 9 files changed, 126 insertions(+), 2 deletions(-)

(limited to 'include/uapi/linux')

diff --git a/Documentation/virt/kvm/api.rst b/Documentation/virt/kvm/api.rst
index 414b7fe1cf7b..fd4a84911355 100644
--- a/Documentation/virt/kvm/api.rst
+++ b/Documentation/virt/kvm/api.rst
@@ -6207,6 +6207,22 @@ KVM_RUN_BUS_LOCK flag is used to distinguish between them.
 This capability can be used to check / enable 2nd DAWR feature provided
 by POWER10 processor.
 
+7.24 KVM_CAP_VM_COPY_ENC_CONTEXT_FROM
+-------------------------------------
+
+Architectures: x86 SEV enabled
+Type: vm
+Parameters: args[0] is the fd of the source vm
+Returns: 0 on success; ENOTTY on error
+
+This capability enables userspace to copy encryption context from the vm
+indicated by the fd to the vm this is called on.
+
+This is intended to support in-guest workloads scheduled by the host. This
+allows the in-guest workload to maintain its own NPTs and keeps the two vms
+from accidentally clobbering each other with interrupts and the like (separate
+APIC/MSRs/etc).
+
 7.25 KVM_CAP_SGX_ATTRIBUTE
 ----------------------
 
@@ -6749,4 +6765,4 @@ in the kernel based fast path. If they can not be handled by the kernel,
 they will get passed on to user space. So user space still has to have
 an implementation for these despite the in kernel acceleration.
 
-This capability is always enabled.
\ No newline at end of file
+This capability is always enabled.
diff --git a/arch/x86/include/asm/kvm_host.h b/arch/x86/include/asm/kvm_host.h
index c5c96386cc76..6e195f7df4f0 100644
--- a/arch/x86/include/asm/kvm_host.h
+++ b/arch/x86/include/asm/kvm_host.h
@@ -1349,6 +1349,7 @@ struct kvm_x86_ops {
 	int (*mem_enc_op)(struct kvm *kvm, void __user *argp);
 	int (*mem_enc_reg_region)(struct kvm *kvm, struct kvm_enc_region *argp);
 	int (*mem_enc_unreg_region)(struct kvm *kvm, struct kvm_enc_region *argp);
+	int (*vm_copy_enc_context_from)(struct kvm *kvm, unsigned int source_fd);
 
 	int (*get_msr_feature)(struct kvm_msr_entry *entry);
 
diff --git a/arch/x86/kvm/svm/sev.c b/arch/x86/kvm/svm/sev.c
index bb4bf5ffb104..4bf79dbc3eeb 100644
--- a/arch/x86/kvm/svm/sev.c
+++ b/arch/x86/kvm/svm/sev.c
@@ -66,6 +66,11 @@ static int sev_flush_asids(void)
 	return ret;
 }
 
+static inline bool is_mirroring_enc_context(struct kvm *kvm)
+{
+	return !!to_kvm_svm(kvm)->sev_info.enc_context_owner;
+}
+
 /* Must be called with the sev_bitmap_lock held */
 static bool __sev_recycle_asids(int min_asid, int max_asid)
 {
@@ -1122,6 +1127,12 @@ int svm_mem_enc_op(struct kvm *kvm, void __user *argp)
 
 	mutex_lock(&kvm->lock);
 
+	/* enc_context_owner handles all memory enc operations */
+	if (is_mirroring_enc_context(kvm)) {
+		r = -EINVAL;
+		goto out;
+	}
+
 	switch (sev_cmd.id) {
 	case KVM_SEV_ES_INIT:
 		if (!sev_es) {
@@ -1185,6 +1196,10 @@ int svm_register_enc_region(struct kvm *kvm,
 	if (!sev_guest(kvm))
 		return -ENOTTY;
 
+	/* If kvm is mirroring encryption context it isn't responsible for it */
+	if (is_mirroring_enc_context(kvm))
+		return -EINVAL;
+
 	if (range->addr > ULONG_MAX || range->size > ULONG_MAX)
 		return -EINVAL;
 
@@ -1251,6 +1266,10 @@ int svm_unregister_enc_region(struct kvm *kvm,
 	struct enc_region *region;
 	int ret;
 
+	/* If kvm is mirroring encryption context it isn't responsible for it */
+	if (is_mirroring_enc_context(kvm))
+		return -EINVAL;
+
 	mutex_lock(&kvm->lock);
 
 	if (!sev_guest(kvm)) {
@@ -1281,6 +1300,71 @@ failed:
 	return ret;
 }
 
+int svm_vm_copy_asid_from(struct kvm *kvm, unsigned int source_fd)
+{
+	struct file *source_kvm_file;
+	struct kvm *source_kvm;
+	struct kvm_sev_info *mirror_sev;
+	unsigned int asid;
+	int ret;
+
+	source_kvm_file = fget(source_fd);
+	if (!file_is_kvm(source_kvm_file)) {
+		ret = -EBADF;
+		goto e_source_put;
+	}
+
+	source_kvm = source_kvm_file->private_data;
+	mutex_lock(&source_kvm->lock);
+
+	if (!sev_guest(source_kvm)) {
+		ret = -EINVAL;
+		goto e_source_unlock;
+	}
+
+	/* Mirrors of mirrors should work, but let's not get silly */
+	if (is_mirroring_enc_context(source_kvm) || source_kvm == kvm) {
+		ret = -EINVAL;
+		goto e_source_unlock;
+	}
+
+	asid = to_kvm_svm(source_kvm)->sev_info.asid;
+
+	/*
+	 * The mirror kvm holds an enc_context_owner ref so its asid can't
+	 * disappear until we're done with it
+	 */
+	kvm_get_kvm(source_kvm);
+
+	fput(source_kvm_file);
+	mutex_unlock(&source_kvm->lock);
+	mutex_lock(&kvm->lock);
+
+	if (sev_guest(kvm)) {
+		ret = -EINVAL;
+		goto e_mirror_unlock;
+	}
+
+	/* Set enc_context_owner and copy its encryption context over */
+	mirror_sev = &to_kvm_svm(kvm)->sev_info;
+	mirror_sev->enc_context_owner = source_kvm;
+	mirror_sev->asid = asid;
+	mirror_sev->active = true;
+
+	mutex_unlock(&kvm->lock);
+	return 0;
+
+e_mirror_unlock:
+	mutex_unlock(&kvm->lock);
+	kvm_put_kvm(source_kvm);
+	return ret;
+e_source_unlock:
+	mutex_unlock(&source_kvm->lock);
+e_source_put:
+	fput(source_kvm_file);
+	return ret;
+}
+
 void sev_vm_destroy(struct kvm *kvm)
 {
 	struct kvm_sev_info *sev = &to_kvm_svm(kvm)->sev_info;
@@ -1290,6 +1374,12 @@ void sev_vm_destroy(struct kvm *kvm)
 	if (!sev_guest(kvm))
 		return;
 
+	/* If this is a mirror_kvm release the enc_context_owner and skip sev cleanup */
+	if (is_mirroring_enc_context(kvm)) {
+		kvm_put_kvm(sev->enc_context_owner);
+		return;
+	}
+
 	mutex_lock(&kvm->lock);
 
 	/*
diff --git a/arch/x86/kvm/svm/svm.c b/arch/x86/kvm/svm/svm.c
index 86e42d42637e..cd8c333ed2dc 100644
--- a/arch/x86/kvm/svm/svm.c
+++ b/arch/x86/kvm/svm/svm.c
@@ -4589,6 +4589,8 @@ static struct kvm_x86_ops svm_x86_ops __initdata = {
 	.mem_enc_reg_region = svm_register_enc_region,
 	.mem_enc_unreg_region = svm_unregister_enc_region,
 
+	.vm_copy_enc_context_from = svm_vm_copy_asid_from,
+
 	.can_emulate_instruction = svm_can_emulate_instruction,
 
 	.apic_init_signal_blocked = svm_apic_init_signal_blocked,
diff --git a/arch/x86/kvm/svm/svm.h b/arch/x86/kvm/svm/svm.h
index 7deb7a057004..454da1c1d9b7 100644
--- a/arch/x86/kvm/svm/svm.h
+++ b/arch/x86/kvm/svm/svm.h
@@ -68,6 +68,7 @@ struct kvm_sev_info {
 	unsigned long pages_locked; /* Number of pages locked */
 	struct list_head regions_list;  /* List of registered regions */
 	u64 ap_jump_table;	/* SEV-ES AP Jump Table address */
+	struct kvm *enc_context_owner; /* Owner of copied encryption context */
 };
 
 struct kvm_svm {
@@ -580,6 +581,7 @@ int svm_register_enc_region(struct kvm *kvm,
 			    struct kvm_enc_region *range);
 int svm_unregister_enc_region(struct kvm *kvm,
 			      struct kvm_enc_region *range);
+int svm_vm_copy_asid_from(struct kvm *kvm, unsigned int source_fd);
 void pre_sev_run(struct vcpu_svm *svm, int cpu);
 void __init sev_hardware_setup(void);
 void sev_hardware_teardown(void);
diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c
index d4cdab15238b..a5eeca55810f 100644
--- a/arch/x86/kvm/x86.c
+++ b/arch/x86/kvm/x86.c
@@ -3808,6 +3808,7 @@ int kvm_vm_ioctl_check_extension(struct kvm *kvm, long ext)
 #ifdef CONFIG_X86_SGX_KVM
 	case KVM_CAP_SGX_ATTRIBUTE:
 #endif
+	case KVM_CAP_VM_COPY_ENC_CONTEXT_FROM:
 		r = 1;
 		break;
 	case KVM_CAP_SET_GUEST_DEBUG2:
@@ -4714,7 +4715,6 @@ static int kvm_vcpu_ioctl_enable_cap(struct kvm_vcpu *vcpu,
 			kvm_update_pv_runtime(vcpu);
 
 		return 0;
-
 	default:
 		return -EINVAL;
 	}
@@ -5413,6 +5413,11 @@ split_irqchip_unlock:
 		break;
 	}
 #endif
+	case KVM_CAP_VM_COPY_ENC_CONTEXT_FROM:
+		r = -EINVAL;
+		if (kvm_x86_ops.vm_copy_enc_context_from)
+			r = kvm_x86_ops.vm_copy_enc_context_from(kvm, cap->args[0]);
+		return r;
 	default:
 		r = -EINVAL;
 		break;
diff --git a/include/linux/kvm_host.h b/include/linux/kvm_host.h
index 82b066db37cb..c306b4c3674e 100644
--- a/include/linux/kvm_host.h
+++ b/include/linux/kvm_host.h
@@ -654,6 +654,7 @@ void kvm_exit(void);
 
 void kvm_get_kvm(struct kvm *kvm);
 void kvm_put_kvm(struct kvm *kvm);
+bool file_is_kvm(struct file *file);
 void kvm_put_kvm_no_destroy(struct kvm *kvm);
 
 static inline struct kvm_memslots *__kvm_memslots(struct kvm *kvm, int as_id)
diff --git a/include/uapi/linux/kvm.h b/include/uapi/linux/kvm.h
index 162b7e044c8b..37f0a329da6a 100644
--- a/include/uapi/linux/kvm.h
+++ b/include/uapi/linux/kvm.h
@@ -1080,6 +1080,7 @@ struct kvm_ppc_resize_hpt {
 #define KVM_CAP_PPC_DAWR1 194
 #define KVM_CAP_SET_GUEST_DEBUG2 195
 #define KVM_CAP_SGX_ATTRIBUTE 196
+#define KVM_CAP_VM_COPY_ENC_CONTEXT_FROM 197
 
 #ifdef KVM_CAP_IRQ_ROUTING
 
diff --git a/virt/kvm/kvm_main.c b/virt/kvm/kvm_main.c
index 545a28387c33..48c1e4842908 100644
--- a/virt/kvm/kvm_main.c
+++ b/virt/kvm/kvm_main.c
@@ -4197,6 +4197,12 @@ static struct file_operations kvm_vm_fops = {
 	KVM_COMPAT(kvm_vm_compat_ioctl),
 };
 
+bool file_is_kvm(struct file *file)
+{
+	return file && file->f_op == &kvm_vm_fops;
+}
+EXPORT_SYMBOL_GPL(file_is_kvm);
+
 static int kvm_dev_ioctl_create_vm(unsigned long type)
 {
 	int r;
-- 
cgit v1.2.3-71-gd317


From 4cfdd47d6d95aca4fb8d6cfbe73392472d353f82 Mon Sep 17 00:00:00 2001
From: Brijesh Singh <brijesh.singh@amd.com>
Date: Thu, 15 Apr 2021 15:53:14 +0000
Subject: KVM: SVM: Add KVM_SEV SEND_START command

The command is used to create an outgoing SEV guest encryption context.

Cc: Thomas Gleixner <tglx@linutronix.de>
Cc: Ingo Molnar <mingo@redhat.com>
Cc: "H. Peter Anvin" <hpa@zytor.com>
Cc: Paolo Bonzini <pbonzini@redhat.com>
Cc: Joerg Roedel <joro@8bytes.org>
Cc: Borislav Petkov <bp@suse.de>
Cc: Tom Lendacky <thomas.lendacky@amd.com>
Cc: x86@kernel.org
Cc: kvm@vger.kernel.org
Cc: linux-kernel@vger.kernel.org
Reviewed-by: Steve Rutherford <srutherford@google.com>
Reviewed-by: Venu Busireddy <venu.busireddy@oracle.com>
Signed-off-by: Brijesh Singh <brijesh.singh@amd.com>
Signed-off-by: Ashish Kalra <ashish.kalra@amd.com>
Message-Id: <2f1686d0164e0f1b3d6a41d620408393e0a48376.1618498113.git.ashish.kalra@amd.com>
Signed-off-by: Paolo Bonzini <pbonzini@redhat.com>
---
 Documentation/virt/kvm/amd-memory-encryption.rst |  30 ++++++
 arch/x86/kvm/svm/sev.c                           | 128 +++++++++++++++++++++++
 include/linux/psp-sev.h                          |   8 +-
 include/uapi/linux/kvm.h                         |  12 +++
 4 files changed, 174 insertions(+), 4 deletions(-)

(limited to 'include/uapi/linux')

diff --git a/Documentation/virt/kvm/amd-memory-encryption.rst b/Documentation/virt/kvm/amd-memory-encryption.rst
index 34ce2d1fcb89..db5c3fb2bab5 100644
--- a/Documentation/virt/kvm/amd-memory-encryption.rst
+++ b/Documentation/virt/kvm/amd-memory-encryption.rst
@@ -290,6 +290,36 @@ Returns: 0 on success, -negative on error
                 __u32 len;
         };
 
+11. KVM_SEV_SEND_START
+----------------------
+
+The KVM_SEV_SEND_START command can be used by the hypervisor to create an
+outgoing guest encryption context.
+
+If session_len is zero on entry, the length of the guest session information is
+written to session_len and all other fields are not used.
+
+Parameters (in): struct kvm_sev_send_start
+
+Returns: 0 on success, -negative on error
+
+::
+        struct kvm_sev_send_start {
+                __u32 policy;                 /* guest policy */
+
+                __u64 pdh_cert_uaddr;         /* platform Diffie-Hellman certificate */
+                __u32 pdh_cert_len;
+
+                __u64 plat_certs_uaddr;        /* platform certificate chain */
+                __u32 plat_certs_len;
+
+                __u64 amd_certs_uaddr;        /* AMD certificate */
+                __u32 amd_certs_len;
+
+                __u64 session_uaddr;          /* Guest session information */
+                __u32 session_len;
+        };
+
 References
 ==========
 
diff --git a/arch/x86/kvm/svm/sev.c b/arch/x86/kvm/svm/sev.c
index 4bf79dbc3eeb..9f5312956d5e 100644
--- a/arch/x86/kvm/svm/sev.c
+++ b/arch/x86/kvm/svm/sev.c
@@ -1111,6 +1111,131 @@ e_free:
 	return ret;
 }
 
+/* Userspace wants to query session length. */
+static int
+__sev_send_start_query_session_length(struct kvm *kvm, struct kvm_sev_cmd *argp,
+				      struct kvm_sev_send_start *params)
+{
+	struct kvm_sev_info *sev = &to_kvm_svm(kvm)->sev_info;
+	struct sev_data_send_start *data;
+	int ret;
+
+	data = kzalloc(sizeof(*data), GFP_KERNEL_ACCOUNT);
+	if (data == NULL)
+		return -ENOMEM;
+
+	data->handle = sev->handle;
+	ret = sev_issue_cmd(kvm, SEV_CMD_SEND_START, data, &argp->error);
+	if (ret < 0)
+		goto out;
+
+	params->session_len = data->session_len;
+	if (copy_to_user((void __user *)(uintptr_t)argp->data, params,
+				sizeof(struct kvm_sev_send_start)))
+		ret = -EFAULT;
+
+out:
+	kfree(data);
+	return ret;
+}
+
+static int sev_send_start(struct kvm *kvm, struct kvm_sev_cmd *argp)
+{
+	struct kvm_sev_info *sev = &to_kvm_svm(kvm)->sev_info;
+	struct sev_data_send_start *data;
+	struct kvm_sev_send_start params;
+	void *amd_certs, *session_data;
+	void *pdh_cert, *plat_certs;
+	int ret;
+
+	if (!sev_guest(kvm))
+		return -ENOTTY;
+
+	if (copy_from_user(&params, (void __user *)(uintptr_t)argp->data,
+				sizeof(struct kvm_sev_send_start)))
+		return -EFAULT;
+
+	/* if session_len is zero, userspace wants to query the session length */
+	if (!params.session_len)
+		return __sev_send_start_query_session_length(kvm, argp,
+				&params);
+
+	/* some sanity checks */
+	if (!params.pdh_cert_uaddr || !params.pdh_cert_len ||
+	    !params.session_uaddr || params.session_len > SEV_FW_BLOB_MAX_SIZE)
+		return -EINVAL;
+
+	/* allocate the memory to hold the session data blob */
+	session_data = kmalloc(params.session_len, GFP_KERNEL_ACCOUNT);
+	if (!session_data)
+		return -ENOMEM;
+
+	/* copy the certificate blobs from userspace */
+	pdh_cert = psp_copy_user_blob(params.pdh_cert_uaddr,
+				params.pdh_cert_len);
+	if (IS_ERR(pdh_cert)) {
+		ret = PTR_ERR(pdh_cert);
+		goto e_free_session;
+	}
+
+	plat_certs = psp_copy_user_blob(params.plat_certs_uaddr,
+				params.plat_certs_len);
+	if (IS_ERR(plat_certs)) {
+		ret = PTR_ERR(plat_certs);
+		goto e_free_pdh;
+	}
+
+	amd_certs = psp_copy_user_blob(params.amd_certs_uaddr,
+				params.amd_certs_len);
+	if (IS_ERR(amd_certs)) {
+		ret = PTR_ERR(amd_certs);
+		goto e_free_plat_cert;
+	}
+
+	data = kzalloc(sizeof(*data), GFP_KERNEL_ACCOUNT);
+	if (data == NULL) {
+		ret = -ENOMEM;
+		goto e_free_amd_cert;
+	}
+
+	/* populate the FW SEND_START field with system physical address */
+	data->pdh_cert_address = __psp_pa(pdh_cert);
+	data->pdh_cert_len = params.pdh_cert_len;
+	data->plat_certs_address = __psp_pa(plat_certs);
+	data->plat_certs_len = params.plat_certs_len;
+	data->amd_certs_address = __psp_pa(amd_certs);
+	data->amd_certs_len = params.amd_certs_len;
+	data->session_address = __psp_pa(session_data);
+	data->session_len = params.session_len;
+	data->handle = sev->handle;
+
+	ret = sev_issue_cmd(kvm, SEV_CMD_SEND_START, data, &argp->error);
+
+	if (!ret && copy_to_user((void __user *)(uintptr_t)params.session_uaddr,
+			session_data, params.session_len)) {
+		ret = -EFAULT;
+		goto e_free;
+	}
+
+	params.policy = data->policy;
+	params.session_len = data->session_len;
+	if (copy_to_user((void __user *)(uintptr_t)argp->data, &params,
+				sizeof(struct kvm_sev_send_start)))
+		ret = -EFAULT;
+
+e_free:
+	kfree(data);
+e_free_amd_cert:
+	kfree(amd_certs);
+e_free_plat_cert:
+	kfree(plat_certs);
+e_free_pdh:
+	kfree(pdh_cert);
+e_free_session:
+	kfree(session_data);
+	return ret;
+}
+
 int svm_mem_enc_op(struct kvm *kvm, void __user *argp)
 {
 	struct kvm_sev_cmd sev_cmd;
@@ -1173,6 +1298,9 @@ int svm_mem_enc_op(struct kvm *kvm, void __user *argp)
 	case KVM_SEV_GET_ATTESTATION_REPORT:
 		r = sev_get_attestation_report(kvm, &sev_cmd);
 		break;
+	case KVM_SEV_SEND_START:
+		r = sev_send_start(kvm, &sev_cmd);
+		break;
 	default:
 		r = -EINVAL;
 		goto out;
diff --git a/include/linux/psp-sev.h b/include/linux/psp-sev.h
index b801ead1e2bb..73da511b9423 100644
--- a/include/linux/psp-sev.h
+++ b/include/linux/psp-sev.h
@@ -326,11 +326,11 @@ struct sev_data_send_start {
 	u64 pdh_cert_address;			/* In */
 	u32 pdh_cert_len;			/* In */
 	u32 reserved1;
-	u64 plat_cert_address;			/* In */
-	u32 plat_cert_len;			/* In */
+	u64 plat_certs_address;			/* In */
+	u32 plat_certs_len;			/* In */
 	u32 reserved2;
-	u64 amd_cert_address;			/* In */
-	u32 amd_cert_len;			/* In */
+	u64 amd_certs_address;			/* In */
+	u32 amd_certs_len;			/* In */
 	u32 reserved3;
 	u64 session_address;			/* In */
 	u32 session_len;			/* In/Out */
diff --git a/include/uapi/linux/kvm.h b/include/uapi/linux/kvm.h
index 37f0a329da6a..11b3e528d35d 100644
--- a/include/uapi/linux/kvm.h
+++ b/include/uapi/linux/kvm.h
@@ -1732,6 +1732,18 @@ struct kvm_sev_attestation_report {
 	__u32 len;
 };
 
+struct kvm_sev_send_start {
+	__u32 policy;
+	__u64 pdh_cert_uaddr;
+	__u32 pdh_cert_len;
+	__u64 plat_certs_uaddr;
+	__u32 plat_certs_len;
+	__u64 amd_certs_uaddr;
+	__u32 amd_certs_len;
+	__u64 session_uaddr;
+	__u32 session_len;
+};
+
 #define KVM_DEV_ASSIGN_ENABLE_IOMMU	(1 << 0)
 #define KVM_DEV_ASSIGN_PCI_2_3		(1 << 1)
 #define KVM_DEV_ASSIGN_MASK_INTX	(1 << 2)
-- 
cgit v1.2.3-71-gd317


From d3d1af85e2c75bb57da51535a6e182c7c45eceb0 Mon Sep 17 00:00:00 2001
From: Brijesh Singh <brijesh.singh@amd.com>
Date: Thu, 15 Apr 2021 15:53:55 +0000
Subject: KVM: SVM: Add KVM_SEND_UPDATE_DATA command

The command is used for encrypting the guest memory region using the encryption
context created with KVM_SEV_SEND_START.

Cc: Thomas Gleixner <tglx@linutronix.de>
Cc: Ingo Molnar <mingo@redhat.com>
Cc: "H. Peter Anvin" <hpa@zytor.com>
Cc: Paolo Bonzini <pbonzini@redhat.com>
Cc: Joerg Roedel <joro@8bytes.org>
Cc: Borislav Petkov <bp@suse.de>
Cc: Tom Lendacky <thomas.lendacky@amd.com>
Cc: x86@kernel.org
Cc: kvm@vger.kernel.org
Cc: linux-kernel@vger.kernel.org
Reviewed-by : Steve Rutherford <srutherford@google.com>
Signed-off-by: Brijesh Singh <brijesh.singh@amd.com>
Signed-off-by: Ashish Kalra <ashish.kalra@amd.com>
Message-Id: <d6a6ea740b0c668b30905ae31eac5ad7da048bb3.1618498113.git.ashish.kalra@amd.com>
Signed-off-by: Paolo Bonzini <pbonzini@redhat.com>
---
 Documentation/virt/kvm/amd-memory-encryption.rst |  28 +++++
 arch/x86/kvm/svm/sev.c                           | 125 +++++++++++++++++++++++
 include/uapi/linux/kvm.h                         |   9 ++
 3 files changed, 162 insertions(+)

(limited to 'include/uapi/linux')

diff --git a/Documentation/virt/kvm/amd-memory-encryption.rst b/Documentation/virt/kvm/amd-memory-encryption.rst
index db5c3fb2bab5..79da88a14135 100644
--- a/Documentation/virt/kvm/amd-memory-encryption.rst
+++ b/Documentation/virt/kvm/amd-memory-encryption.rst
@@ -320,6 +320,34 @@ Returns: 0 on success, -negative on error
                 __u32 session_len;
         };
 
+12. KVM_SEV_SEND_UPDATE_DATA
+----------------------------
+
+The KVM_SEV_SEND_UPDATE_DATA command can be used by the hypervisor to encrypt the
+outgoing guest memory region with the encryption context creating using
+KVM_SEV_SEND_START.
+
+If hdr_len or trans_len are zero on entry, the length of the packet header and
+transport region are written to hdr_len and trans_len respectively, and all
+other fields are not used.
+
+Parameters (in): struct kvm_sev_send_update_data
+
+Returns: 0 on success, -negative on error
+
+::
+
+        struct kvm_sev_launch_send_update_data {
+                __u64 hdr_uaddr;        /* userspace address containing the packet header */
+                __u32 hdr_len;
+
+                __u64 guest_uaddr;      /* the source memory region to be encrypted */
+                __u32 guest_len;
+
+                __u64 trans_uaddr;      /* the destination memory region  */
+                __u32 trans_len;
+        };
+
 References
 ==========
 
diff --git a/arch/x86/kvm/svm/sev.c b/arch/x86/kvm/svm/sev.c
index 9f5312956d5e..a534ec9ae172 100644
--- a/arch/x86/kvm/svm/sev.c
+++ b/arch/x86/kvm/svm/sev.c
@@ -34,6 +34,7 @@ static DECLARE_RWSEM(sev_deactivate_lock);
 static DEFINE_MUTEX(sev_bitmap_lock);
 unsigned int max_sev_asid;
 static unsigned int min_sev_asid;
+static unsigned long sev_me_mask;
 static unsigned long *sev_asid_bitmap;
 static unsigned long *sev_reclaim_asid_bitmap;
 
@@ -1236,6 +1237,126 @@ e_free_session:
 	return ret;
 }
 
+/* Userspace wants to query either header or trans length. */
+static int
+__sev_send_update_data_query_lengths(struct kvm *kvm, struct kvm_sev_cmd *argp,
+				     struct kvm_sev_send_update_data *params)
+{
+	struct kvm_sev_info *sev = &to_kvm_svm(kvm)->sev_info;
+	struct sev_data_send_update_data *data;
+	int ret;
+
+	data = kzalloc(sizeof(*data), GFP_KERNEL_ACCOUNT);
+	if (!data)
+		return -ENOMEM;
+
+	data->handle = sev->handle;
+	ret = sev_issue_cmd(kvm, SEV_CMD_SEND_UPDATE_DATA, data, &argp->error);
+	if (ret < 0)
+		goto out;
+
+	params->hdr_len = data->hdr_len;
+	params->trans_len = data->trans_len;
+
+	if (copy_to_user((void __user *)(uintptr_t)argp->data, params,
+			 sizeof(struct kvm_sev_send_update_data)))
+		ret = -EFAULT;
+
+out:
+	kfree(data);
+	return ret;
+}
+
+static int sev_send_update_data(struct kvm *kvm, struct kvm_sev_cmd *argp)
+{
+	struct kvm_sev_info *sev = &to_kvm_svm(kvm)->sev_info;
+	struct sev_data_send_update_data *data;
+	struct kvm_sev_send_update_data params;
+	void *hdr, *trans_data;
+	struct page **guest_page;
+	unsigned long n;
+	int ret, offset;
+
+	if (!sev_guest(kvm))
+		return -ENOTTY;
+
+	if (copy_from_user(&params, (void __user *)(uintptr_t)argp->data,
+			sizeof(struct kvm_sev_send_update_data)))
+		return -EFAULT;
+
+	/* userspace wants to query either header or trans length */
+	if (!params.trans_len || !params.hdr_len)
+		return __sev_send_update_data_query_lengths(kvm, argp, &params);
+
+	if (!params.trans_uaddr || !params.guest_uaddr ||
+	    !params.guest_len || !params.hdr_uaddr)
+		return -EINVAL;
+
+	/* Check if we are crossing the page boundary */
+	offset = params.guest_uaddr & (PAGE_SIZE - 1);
+	if ((params.guest_len + offset > PAGE_SIZE))
+		return -EINVAL;
+
+	/* Pin guest memory */
+	guest_page = sev_pin_memory(kvm, params.guest_uaddr & PAGE_MASK,
+				    PAGE_SIZE, &n, 0);
+	if (!guest_page)
+		return -EFAULT;
+
+	/* allocate memory for header and transport buffer */
+	ret = -ENOMEM;
+	hdr = kmalloc(params.hdr_len, GFP_KERNEL_ACCOUNT);
+	if (!hdr)
+		goto e_unpin;
+
+	trans_data = kmalloc(params.trans_len, GFP_KERNEL_ACCOUNT);
+	if (!trans_data)
+		goto e_free_hdr;
+
+	data = kzalloc(sizeof(*data), GFP_KERNEL);
+	if (!data)
+		goto e_free_trans_data;
+
+	data->hdr_address = __psp_pa(hdr);
+	data->hdr_len = params.hdr_len;
+	data->trans_address = __psp_pa(trans_data);
+	data->trans_len = params.trans_len;
+
+	/* The SEND_UPDATE_DATA command requires C-bit to be always set. */
+	data->guest_address = (page_to_pfn(guest_page[0]) << PAGE_SHIFT) +
+				offset;
+	data->guest_address |= sev_me_mask;
+	data->guest_len = params.guest_len;
+	data->handle = sev->handle;
+
+	ret = sev_issue_cmd(kvm, SEV_CMD_SEND_UPDATE_DATA, data, &argp->error);
+
+	if (ret)
+		goto e_free;
+
+	/* copy transport buffer to user space */
+	if (copy_to_user((void __user *)(uintptr_t)params.trans_uaddr,
+			 trans_data, params.trans_len)) {
+		ret = -EFAULT;
+		goto e_free;
+	}
+
+	/* Copy packet header to userspace. */
+	ret = copy_to_user((void __user *)(uintptr_t)params.hdr_uaddr, hdr,
+				params.hdr_len);
+
+e_free:
+	kfree(data);
+e_free_trans_data:
+	kfree(trans_data);
+e_free_hdr:
+	kfree(hdr);
+e_unpin:
+	sev_unpin_memory(kvm, guest_page, n);
+
+	return ret;
+}
+
 int svm_mem_enc_op(struct kvm *kvm, void __user *argp)
 {
 	struct kvm_sev_cmd sev_cmd;
@@ -1301,6 +1422,9 @@ int svm_mem_enc_op(struct kvm *kvm, void __user *argp)
 	case KVM_SEV_SEND_START:
 		r = sev_send_start(kvm, &sev_cmd);
 		break;
+	case KVM_SEV_SEND_UPDATE_DATA:
+		r = sev_send_update_data(kvm, &sev_cmd);
+		break;
 	default:
 		r = -EINVAL;
 		goto out;
@@ -1559,6 +1683,7 @@ void __init sev_hardware_setup(void)
 
 	/* Minimum ASID value that should be used for SEV guest */
 	min_sev_asid = edx;
+	sev_me_mask = 1UL << (ebx & 0x3f);
 
 	/* Initialize SEV ASID bitmaps */
 	sev_asid_bitmap = bitmap_zalloc(max_sev_asid, GFP_KERNEL);
diff --git a/include/uapi/linux/kvm.h b/include/uapi/linux/kvm.h
index 11b3e528d35d..d07be69f402f 100644
--- a/include/uapi/linux/kvm.h
+++ b/include/uapi/linux/kvm.h
@@ -1744,6 +1744,15 @@ struct kvm_sev_send_start {
 	__u32 session_len;
 };
 
+struct kvm_sev_send_update_data {
+	__u64 hdr_uaddr;
+	__u32 hdr_len;
+	__u64 guest_uaddr;
+	__u32 guest_len;
+	__u64 trans_uaddr;
+	__u32 trans_len;
+};
+
 #define KVM_DEV_ASSIGN_ENABLE_IOMMU	(1 << 0)
 #define KVM_DEV_ASSIGN_PCI_2_3		(1 << 1)
 #define KVM_DEV_ASSIGN_MASK_INTX	(1 << 2)
-- 
cgit v1.2.3-71-gd317


From 5569e2e7a650dfffd4df7635662b2f92162d6501 Mon Sep 17 00:00:00 2001
From: Steve Rutherford <srutherford@google.com>
Date: Tue, 20 Apr 2021 05:01:20 -0400
Subject: KVM: SVM: Add support for KVM_SEV_SEND_CANCEL command

After completion of SEND_START, but before SEND_FINISH, the source VMM can
issue the SEND_CANCEL command to stop a migration. This is necessary so
that a cancelled migration can restart with a new target later.

Reviewed-by: Nathan Tempelman <natet@google.com>
Reviewed-by: Brijesh Singh <brijesh.singh@amd.com>
Signed-off-by: Steve Rutherford <srutherford@google.com>
Message-Id: <20210412194408.2458827-1-srutherford@google.com>
Signed-off-by: Paolo Bonzini <pbonzini@redhat.com>
---
 Documentation/virt/kvm/amd-memory-encryption.rst |  9 +++++++++
 arch/x86/kvm/svm/sev.c                           | 23 +++++++++++++++++++++++
 drivers/crypto/ccp/sev-dev.c                     |  1 +
 include/linux/psp-sev.h                          | 10 ++++++++++
 include/uapi/linux/kvm.h                         |  2 ++
 5 files changed, 45 insertions(+)

(limited to 'include/uapi/linux')

diff --git a/Documentation/virt/kvm/amd-memory-encryption.rst b/Documentation/virt/kvm/amd-memory-encryption.rst
index 03f2518cfbeb..c36a12975763 100644
--- a/Documentation/virt/kvm/amd-memory-encryption.rst
+++ b/Documentation/virt/kvm/amd-memory-encryption.rst
@@ -356,6 +356,15 @@ issued by the hypervisor to delete the encryption context.
 
 Returns: 0 on success, -negative on error
 
+14. KVM_SEV_SEND_CANCEL
+------------------------
+
+After completion of SEND_START, but before SEND_FINISH, the source VMM can issue the
+SEND_CANCEL command to stop a migration. This is necessary so that a cancelled
+migration can restart with a new target later.
+
+Returns: 0 on success, -negative on error
+
 References
 ==========
 
diff --git a/arch/x86/kvm/svm/sev.c b/arch/x86/kvm/svm/sev.c
index f25c52a2dc14..552f47787143 100644
--- a/arch/x86/kvm/svm/sev.c
+++ b/arch/x86/kvm/svm/sev.c
@@ -1377,6 +1377,26 @@ static int sev_send_finish(struct kvm *kvm, struct kvm_sev_cmd *argp)
 	return ret;
 }
 
+static int sev_send_cancel(struct kvm *kvm, struct kvm_sev_cmd *argp)
+{
+	struct kvm_sev_info *sev = &to_kvm_svm(kvm)->sev_info;
+	struct sev_data_send_cancel *data;
+	int ret;
+
+	if (!sev_guest(kvm))
+		return -ENOTTY;
+
+	data = kzalloc(sizeof(*data), GFP_KERNEL);
+	if (!data)
+		return -ENOMEM;
+
+	data->handle = sev->handle;
+	ret = sev_issue_cmd(kvm, SEV_CMD_SEND_CANCEL, data, &argp->error);
+
+	kfree(data);
+	return ret;
+}
+
 int svm_mem_enc_op(struct kvm *kvm, void __user *argp)
 {
 	struct kvm_sev_cmd sev_cmd;
@@ -1448,6 +1468,9 @@ int svm_mem_enc_op(struct kvm *kvm, void __user *argp)
 	case KVM_SEV_SEND_FINISH:
 		r = sev_send_finish(kvm, &sev_cmd);
 		break;
+	case KVM_SEV_SEND_CANCEL:
+		r = sev_send_cancel(kvm, &sev_cmd);
+		break;
 	default:
 		r = -EINVAL;
 		goto out;
diff --git a/drivers/crypto/ccp/sev-dev.c b/drivers/crypto/ccp/sev-dev.c
index cb9b4c4e371e..4172a1afa0db 100644
--- a/drivers/crypto/ccp/sev-dev.c
+++ b/drivers/crypto/ccp/sev-dev.c
@@ -129,6 +129,7 @@ static int sev_cmd_buffer_len(int cmd)
 	case SEV_CMD_DOWNLOAD_FIRMWARE:		return sizeof(struct sev_data_download_firmware);
 	case SEV_CMD_GET_ID:			return sizeof(struct sev_data_get_id);
 	case SEV_CMD_ATTESTATION_REPORT:	return sizeof(struct sev_data_attestation_report);
+	case SEV_CMD_SEND_CANCEL:			return sizeof(struct sev_data_send_cancel);
 	default:				return 0;
 	}
 
diff --git a/include/linux/psp-sev.h b/include/linux/psp-sev.h
index 73da511b9423..d48a7192e881 100644
--- a/include/linux/psp-sev.h
+++ b/include/linux/psp-sev.h
@@ -73,6 +73,7 @@ enum sev_cmd {
 	SEV_CMD_SEND_UPDATE_DATA	= 0x041,
 	SEV_CMD_SEND_UPDATE_VMSA	= 0x042,
 	SEV_CMD_SEND_FINISH		= 0x043,
+	SEV_CMD_SEND_CANCEL		= 0x044,
 
 	/* Guest migration commands (incoming) */
 	SEV_CMD_RECEIVE_START		= 0x050,
@@ -392,6 +393,15 @@ struct sev_data_send_finish {
 	u32 handle;				/* In */
 } __packed;
 
+/**
+ * struct sev_data_send_cancel - SEND_CANCEL command parameters
+ *
+ * @handle: handle of the VM to process
+ */
+struct sev_data_send_cancel {
+	u32 handle;				/* In */
+} __packed;
+
 /**
  * struct sev_data_receive_start - RECEIVE_START command parameters
  *
diff --git a/include/uapi/linux/kvm.h b/include/uapi/linux/kvm.h
index d07be69f402f..5d1adeb13739 100644
--- a/include/uapi/linux/kvm.h
+++ b/include/uapi/linux/kvm.h
@@ -1674,6 +1674,8 @@ enum sev_cmd_id {
 	KVM_SEV_CERT_EXPORT,
 	/* Attestation report */
 	KVM_SEV_GET_ATTESTATION_REPORT,
+	/* Guest Migration Extension */
+	KVM_SEV_SEND_CANCEL,
 
 	KVM_SEV_NR_MAX,
 };
-- 
cgit v1.2.3-71-gd317


From af43cbbf954b50ca97d5e7bb56c2edc6ffd209ef Mon Sep 17 00:00:00 2001
From: Brijesh Singh <brijesh.singh@amd.com>
Date: Thu, 15 Apr 2021 15:54:50 +0000
Subject: KVM: SVM: Add support for KVM_SEV_RECEIVE_START command

The command is used to create the encryption context for an incoming
SEV guest. The encryption context can be later used by the hypervisor
to import the incoming data into the SEV guest memory space.

Cc: Thomas Gleixner <tglx@linutronix.de>
Cc: Ingo Molnar <mingo@redhat.com>
Cc: "H. Peter Anvin" <hpa@zytor.com>
Cc: Paolo Bonzini <pbonzini@redhat.com>
Cc: Joerg Roedel <joro@8bytes.org>
Cc: Borislav Petkov <bp@suse.de>
Cc: Tom Lendacky <thomas.lendacky@amd.com>
Cc: x86@kernel.org
Cc: kvm@vger.kernel.org
Cc: linux-kernel@vger.kernel.org
Reviewed-by: Steve Rutherford <srutherford@google.com>
Signed-off-by: Brijesh Singh <brijesh.singh@amd.com>
Signed-off-by: Ashish Kalra <ashish.kalra@amd.com>
Message-Id: <c7400111ed7458eee01007c4d8d57cdf2cbb0fc2.1618498113.git.ashish.kalra@amd.com>
Signed-off-by: Paolo Bonzini <pbonzini@redhat.com>
---
 Documentation/virt/kvm/amd-memory-encryption.rst | 29 +++++++++
 arch/x86/kvm/svm/sev.c                           | 81 ++++++++++++++++++++++++
 include/uapi/linux/kvm.h                         |  9 +++
 3 files changed, 119 insertions(+)

(limited to 'include/uapi/linux')

diff --git a/Documentation/virt/kvm/amd-memory-encryption.rst b/Documentation/virt/kvm/amd-memory-encryption.rst
index c36a12975763..86c9b36f4a57 100644
--- a/Documentation/virt/kvm/amd-memory-encryption.rst
+++ b/Documentation/virt/kvm/amd-memory-encryption.rst
@@ -365,6 +365,35 @@ migration can restart with a new target later.
 
 Returns: 0 on success, -negative on error
 
+15. KVM_SEV_RECEIVE_START
+------------------------
+
+The KVM_SEV_RECEIVE_START command is used for creating the memory encryption
+context for an incoming SEV guest. To create the encryption context, the user must
+provide a guest policy, the platform public Diffie-Hellman (PDH) key and session
+information.
+
+Parameters: struct  kvm_sev_receive_start (in/out)
+
+Returns: 0 on success, -negative on error
+
+::
+
+        struct kvm_sev_receive_start {
+                __u32 handle;           /* if zero then firmware creates a new handle */
+                __u32 policy;           /* guest's policy */
+
+                __u64 pdh_uaddr;        /* userspace address pointing to the PDH key */
+                __u32 pdh_len;
+
+                __u64 session_uaddr;    /* userspace address which points to the guest session information */
+                __u32 session_len;
+        };
+
+On success, the 'handle' field contains a new handle and on error, a negative value.
+
+For more details, see SEV spec Section 6.12.
+
 References
 ==========
 
diff --git a/arch/x86/kvm/svm/sev.c b/arch/x86/kvm/svm/sev.c
index 552f47787143..3e5064c69124 100644
--- a/arch/x86/kvm/svm/sev.c
+++ b/arch/x86/kvm/svm/sev.c
@@ -1397,6 +1397,84 @@ static int sev_send_cancel(struct kvm *kvm, struct kvm_sev_cmd *argp)
 	return ret;
 }
 
+static int sev_receive_start(struct kvm *kvm, struct kvm_sev_cmd *argp)
+{
+	struct kvm_sev_info *sev = &to_kvm_svm(kvm)->sev_info;
+	struct sev_data_receive_start *start;
+	struct kvm_sev_receive_start params;
+	int *error = &argp->error;
+	void *session_data;
+	void *pdh_data;
+	int ret;
+
+	if (!sev_guest(kvm))
+		return -ENOTTY;
+
+	/* Get parameter from the userspace */
+	if (copy_from_user(&params, (void __user *)(uintptr_t)argp->data,
+			sizeof(struct kvm_sev_receive_start)))
+		return -EFAULT;
+
+	/* some sanity checks */
+	if (!params.pdh_uaddr || !params.pdh_len ||
+	    !params.session_uaddr || !params.session_len)
+		return -EINVAL;
+
+	pdh_data = psp_copy_user_blob(params.pdh_uaddr, params.pdh_len);
+	if (IS_ERR(pdh_data))
+		return PTR_ERR(pdh_data);
+
+	session_data = psp_copy_user_blob(params.session_uaddr,
+			params.session_len);
+	if (IS_ERR(session_data)) {
+		ret = PTR_ERR(session_data);
+		goto e_free_pdh;
+	}
+
+	ret = -ENOMEM;
+	start = kzalloc(sizeof(*start), GFP_KERNEL);
+	if (!start)
+		goto e_free_session;
+
+	start->handle = params.handle;
+	start->policy = params.policy;
+	start->pdh_cert_address = __psp_pa(pdh_data);
+	start->pdh_cert_len = params.pdh_len;
+	start->session_address = __psp_pa(session_data);
+	start->session_len = params.session_len;
+
+	/* create memory encryption context */
+	ret = __sev_issue_cmd(argp->sev_fd, SEV_CMD_RECEIVE_START, start,
+				error);
+	if (ret)
+		goto e_free;
+
+	/* Bind ASID to this guest */
+	ret = sev_bind_asid(kvm, start->handle, error);
+	if (ret)
+		goto e_free;
+
+	params.handle = start->handle;
+	if (copy_to_user((void __user *)(uintptr_t)argp->data,
+			 &params, sizeof(struct kvm_sev_receive_start))) {
+		ret = -EFAULT;
+		sev_unbind_asid(kvm, start->handle);
+		goto e_free;
+	}
+
+	sev->handle = start->handle;
+	sev->fd = argp->sev_fd;
+
+e_free:
+	kfree(start);
+e_free_session:
+	kfree(session_data);
+e_free_pdh:
+	kfree(pdh_data);
+
+	return ret;
+}
+
 int svm_mem_enc_op(struct kvm *kvm, void __user *argp)
 {
 	struct kvm_sev_cmd sev_cmd;
@@ -1471,6 +1549,9 @@ int svm_mem_enc_op(struct kvm *kvm, void __user *argp)
 	case KVM_SEV_SEND_CANCEL:
 		r = sev_send_cancel(kvm, &sev_cmd);
 		break;
+	case KVM_SEV_RECEIVE_START:
+		r = sev_receive_start(kvm, &sev_cmd);
+		break;
 	default:
 		r = -EINVAL;
 		goto out;
diff --git a/include/uapi/linux/kvm.h b/include/uapi/linux/kvm.h
index 5d1adeb13739..248d1d266bc3 100644
--- a/include/uapi/linux/kvm.h
+++ b/include/uapi/linux/kvm.h
@@ -1755,6 +1755,15 @@ struct kvm_sev_send_update_data {
 	__u32 trans_len;
 };
 
+struct kvm_sev_receive_start {
+	__u32 handle;
+	__u32 policy;
+	__u64 pdh_uaddr;
+	__u32 pdh_len;
+	__u64 session_uaddr;
+	__u32 session_len;
+};
+
 #define KVM_DEV_ASSIGN_ENABLE_IOMMU	(1 << 0)
 #define KVM_DEV_ASSIGN_PCI_2_3		(1 << 1)
 #define KVM_DEV_ASSIGN_MASK_INTX	(1 << 2)
-- 
cgit v1.2.3-71-gd317


From 15fb7de1a7f5af0d5910ca4352b26f887543e26e Mon Sep 17 00:00:00 2001
From: Brijesh Singh <brijesh.singh@amd.com>
Date: Thu, 15 Apr 2021 15:55:17 +0000
Subject: KVM: SVM: Add KVM_SEV_RECEIVE_UPDATE_DATA command

The command is used for copying the incoming buffer into the
SEV guest memory space.

Cc: Thomas Gleixner <tglx@linutronix.de>
Cc: Ingo Molnar <mingo@redhat.com>
Cc: "H. Peter Anvin" <hpa@zytor.com>
Cc: Paolo Bonzini <pbonzini@redhat.com>
Cc: Joerg Roedel <joro@8bytes.org>
Cc: Borislav Petkov <bp@suse.de>
Cc: Tom Lendacky <thomas.lendacky@amd.com>
Cc: x86@kernel.org
Cc: kvm@vger.kernel.org
Cc: linux-kernel@vger.kernel.org
Reviewed-by: Steve Rutherford <srutherford@google.com>
Signed-off-by: Brijesh Singh <brijesh.singh@amd.com>
Signed-off-by: Ashish Kalra <ashish.kalra@amd.com>
Message-Id: <c5d0e3e719db7bb37ea85d79ed4db52e9da06257.1618498113.git.ashish.kalra@amd.com>
Signed-off-by: Paolo Bonzini <pbonzini@redhat.com>
---
 Documentation/virt/kvm/amd-memory-encryption.rst | 24 +++++++
 arch/x86/kvm/svm/sev.c                           | 79 ++++++++++++++++++++++++
 include/uapi/linux/kvm.h                         |  9 +++
 3 files changed, 112 insertions(+)

(limited to 'include/uapi/linux')

diff --git a/Documentation/virt/kvm/amd-memory-encryption.rst b/Documentation/virt/kvm/amd-memory-encryption.rst
index 86c9b36f4a57..bb94c6e3615f 100644
--- a/Documentation/virt/kvm/amd-memory-encryption.rst
+++ b/Documentation/virt/kvm/amd-memory-encryption.rst
@@ -394,6 +394,30 @@ On success, the 'handle' field contains a new handle and on error, a negative va
 
 For more details, see SEV spec Section 6.12.
 
+16. KVM_SEV_RECEIVE_UPDATE_DATA
+----------------------------
+
+The KVM_SEV_RECEIVE_UPDATE_DATA command can be used by the hypervisor to copy
+the incoming buffers into the guest memory region with encryption context
+created during the KVM_SEV_RECEIVE_START.
+
+Parameters (in): struct kvm_sev_receive_update_data
+
+Returns: 0 on success, -negative on error
+
+::
+
+        struct kvm_sev_launch_receive_update_data {
+                __u64 hdr_uaddr;        /* userspace address containing the packet header */
+                __u32 hdr_len;
+
+                __u64 guest_uaddr;      /* the destination guest memory region */
+                __u32 guest_len;
+
+                __u64 trans_uaddr;      /* the incoming buffer memory region  */
+                __u32 trans_len;
+        };
+
 References
 ==========
 
diff --git a/arch/x86/kvm/svm/sev.c b/arch/x86/kvm/svm/sev.c
index 3e5064c69124..ff6435527a67 100644
--- a/arch/x86/kvm/svm/sev.c
+++ b/arch/x86/kvm/svm/sev.c
@@ -1475,6 +1475,82 @@ e_free_pdh:
 	return ret;
 }
 
+static int sev_receive_update_data(struct kvm *kvm, struct kvm_sev_cmd *argp)
+{
+	struct kvm_sev_info *sev = &to_kvm_svm(kvm)->sev_info;
+	struct kvm_sev_receive_update_data params;
+	struct sev_data_receive_update_data *data;
+	void *hdr = NULL, *trans = NULL;
+	struct page **guest_page;
+	unsigned long n;
+	int ret, offset;
+
+	if (!sev_guest(kvm))
+		return -EINVAL;
+
+	if (copy_from_user(&params, (void __user *)(uintptr_t)argp->data,
+			sizeof(struct kvm_sev_receive_update_data)))
+		return -EFAULT;
+
+	if (!params.hdr_uaddr || !params.hdr_len ||
+	    !params.guest_uaddr || !params.guest_len ||
+	    !params.trans_uaddr || !params.trans_len)
+		return -EINVAL;
+
+	/* Check if we are crossing the page boundary */
+	offset = params.guest_uaddr & (PAGE_SIZE - 1);
+	if ((params.guest_len + offset > PAGE_SIZE))
+		return -EINVAL;
+
+	hdr = psp_copy_user_blob(params.hdr_uaddr, params.hdr_len);
+	if (IS_ERR(hdr))
+		return PTR_ERR(hdr);
+
+	trans = psp_copy_user_blob(params.trans_uaddr, params.trans_len);
+	if (IS_ERR(trans)) {
+		ret = PTR_ERR(trans);
+		goto e_free_hdr;
+	}
+
+	ret = -ENOMEM;
+	data = kzalloc(sizeof(*data), GFP_KERNEL);
+	if (!data)
+		goto e_free_trans;
+
+	data->hdr_address = __psp_pa(hdr);
+	data->hdr_len = params.hdr_len;
+	data->trans_address = __psp_pa(trans);
+	data->trans_len = params.trans_len;
+
+	/* Pin guest memory */
+	ret = -EFAULT;
+	guest_page = sev_pin_memory(kvm, params.guest_uaddr & PAGE_MASK,
+				    PAGE_SIZE, &n, 0);
+	if (!guest_page)
+		goto e_free;
+
+	/* The RECEIVE_UPDATE_DATA command requires C-bit to be always set. */
+	data->guest_address = (page_to_pfn(guest_page[0]) << PAGE_SHIFT) +
+				offset;
+	data->guest_address |= sev_me_mask;
+	data->guest_len = params.guest_len;
+	data->handle = sev->handle;
+
+	ret = sev_issue_cmd(kvm, SEV_CMD_RECEIVE_UPDATE_DATA, data,
+				&argp->error);
+
+	sev_unpin_memory(kvm, guest_page, n);
+
+e_free:
+	kfree(data);
+e_free_trans:
+	kfree(trans);
+e_free_hdr:
+	kfree(hdr);
+
+	return ret;
+}
+
 int svm_mem_enc_op(struct kvm *kvm, void __user *argp)
 {
 	struct kvm_sev_cmd sev_cmd;
@@ -1552,6 +1628,9 @@ int svm_mem_enc_op(struct kvm *kvm, void __user *argp)
 	case KVM_SEV_RECEIVE_START:
 		r = sev_receive_start(kvm, &sev_cmd);
 		break;
+	case KVM_SEV_RECEIVE_UPDATE_DATA:
+		r = sev_receive_update_data(kvm, &sev_cmd);
+		break;
 	default:
 		r = -EINVAL;
 		goto out;
diff --git a/include/uapi/linux/kvm.h b/include/uapi/linux/kvm.h
index 248d1d266bc3..d76533498543 100644
--- a/include/uapi/linux/kvm.h
+++ b/include/uapi/linux/kvm.h
@@ -1764,6 +1764,15 @@ struct kvm_sev_receive_start {
 	__u32 session_len;
 };
 
+struct kvm_sev_receive_update_data {
+	__u64 hdr_uaddr;
+	__u32 hdr_len;
+	__u64 guest_uaddr;
+	__u32 guest_len;
+	__u64 trans_uaddr;
+	__u32 trans_len;
+};
+
 #define KVM_DEV_ASSIGN_ENABLE_IOMMU	(1 << 0)
 #define KVM_DEV_ASSIGN_PCI_2_3		(1 << 1)
 #define KVM_DEV_ASSIGN_MASK_INTX	(1 << 2)
-- 
cgit v1.2.3-71-gd317


From cb2c7d1a1776057c9a1f48ed1250d85e94d4850d Mon Sep 17 00:00:00 2001
From: Mickaël Salaün <mic@linux.microsoft.com>
Date: Thu, 22 Apr 2021 17:41:17 +0200
Subject: landlock: Support filesystem access-control
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

Using Landlock objects and ruleset, it is possible to tag inodes
according to a process's domain.  To enable an unprivileged process to
express a file hierarchy, it first needs to open a directory (or a file)
and pass this file descriptor to the kernel through
landlock_add_rule(2).  When checking if a file access request is
allowed, we walk from the requested dentry to the real root, following
the different mount layers.  The access to each "tagged" inodes are
collected according to their rule layer level, and ANDed to create
access to the requested file hierarchy.  This makes possible to identify
a lot of files without tagging every inodes nor modifying the
filesystem, while still following the view and understanding the user
has from the filesystem.

Add a new ARCH_EPHEMERAL_INODES for UML because it currently does not
keep the same struct inodes for the same inodes whereas these inodes are
in use.

This commit adds a minimal set of supported filesystem access-control
which doesn't enable to restrict all file-related actions.  This is the
result of multiple discussions to minimize the code of Landlock to ease
review.  Thanks to the Landlock design, extending this access-control
without breaking user space will not be a problem.  Moreover, seccomp
filters can be used to restrict the use of syscall families which may
not be currently handled by Landlock.

Cc: Al Viro <viro@zeniv.linux.org.uk>
Cc: Anton Ivanov <anton.ivanov@cambridgegreys.com>
Cc: James Morris <jmorris@namei.org>
Cc: Jann Horn <jannh@google.com>
Cc: Jeff Dike <jdike@addtoit.com>
Cc: Kees Cook <keescook@chromium.org>
Cc: Richard Weinberger <richard@nod.at>
Cc: Serge E. Hallyn <serge@hallyn.com>
Signed-off-by: Mickaël Salaün <mic@linux.microsoft.com>
Link: https://lore.kernel.org/r/20210422154123.13086-8-mic@digikod.net
Signed-off-by: James Morris <jamorris@linux.microsoft.com>
---
 MAINTAINERS                   |   1 +
 arch/Kconfig                  |   7 +
 arch/um/Kconfig               |   1 +
 include/uapi/linux/landlock.h |  76 +++++
 security/landlock/Kconfig     |   2 +-
 security/landlock/Makefile    |   2 +-
 security/landlock/fs.c        | 692 ++++++++++++++++++++++++++++++++++++++++++
 security/landlock/fs.h        |  70 +++++
 security/landlock/limits.h    |   4 +
 security/landlock/ruleset.c   |   4 +
 security/landlock/setup.c     |   7 +
 security/landlock/setup.h     |   2 +
 12 files changed, 866 insertions(+), 2 deletions(-)
 create mode 100644 include/uapi/linux/landlock.h
 create mode 100644 security/landlock/fs.c
 create mode 100644 security/landlock/fs.h

(limited to 'include/uapi/linux')

diff --git a/MAINTAINERS b/MAINTAINERS
index 87a2738dfdec..70ec117efa8a 100644
--- a/MAINTAINERS
+++ b/MAINTAINERS
@@ -10003,6 +10003,7 @@ L:	linux-security-module@vger.kernel.org
 S:	Supported
 W:	https://landlock.io
 T:	git https://github.com/landlock-lsm/linux.git
+F:	include/uapi/linux/landlock.h
 F:	security/landlock/
 K:	landlock
 K:	LANDLOCK
diff --git a/arch/Kconfig b/arch/Kconfig
index ecfd3520b676..8160ab7e3e03 100644
--- a/arch/Kconfig
+++ b/arch/Kconfig
@@ -1013,6 +1013,13 @@ config COMPAT_32BIT_TIME
 config ARCH_NO_PREEMPT
 	bool
 
+config ARCH_EPHEMERAL_INODES
+	def_bool n
+	help
+	  An arch should select this symbol if it doesn't keep track of inode
+	  instances on its own, but instead relies on something else (e.g. the
+	  host kernel for an UML kernel).
+
 config ARCH_SUPPORTS_RT
 	bool
 
diff --git a/arch/um/Kconfig b/arch/um/Kconfig
index c3030db3325f..57cfd9a1c082 100644
--- a/arch/um/Kconfig
+++ b/arch/um/Kconfig
@@ -5,6 +5,7 @@ menu "UML-specific options"
 config UML
 	bool
 	default y
+	select ARCH_EPHEMERAL_INODES
 	select ARCH_HAS_KCOV
 	select ARCH_NO_PREEMPT
 	select HAVE_ARCH_AUDITSYSCALL
diff --git a/include/uapi/linux/landlock.h b/include/uapi/linux/landlock.h
new file mode 100644
index 000000000000..b1a81b5a8b86
--- /dev/null
+++ b/include/uapi/linux/landlock.h
@@ -0,0 +1,76 @@
+/* SPDX-License-Identifier: GPL-2.0 WITH Linux-syscall-note */
+/*
+ * Landlock - User space API
+ *
+ * Copyright © 2017-2020 Mickaël Salaün <mic@digikod.net>
+ * Copyright © 2018-2020 ANSSI
+ */
+
+#ifndef _UAPI_LINUX_LANDLOCK_H
+#define _UAPI_LINUX_LANDLOCK_H
+
+/**
+ * DOC: fs_access
+ *
+ * A set of actions on kernel objects may be defined by an attribute (e.g.
+ * &struct landlock_path_beneath_attr) including a bitmask of access.
+ *
+ * Filesystem flags
+ * ~~~~~~~~~~~~~~~~
+ *
+ * These flags enable to restrict a sandboxed process to a set of actions on
+ * files and directories.  Files or directories opened before the sandboxing
+ * are not subject to these restrictions.
+ *
+ * A file can only receive these access rights:
+ *
+ * - %LANDLOCK_ACCESS_FS_EXECUTE: Execute a file.
+ * - %LANDLOCK_ACCESS_FS_WRITE_FILE: Open a file with write access.
+ * - %LANDLOCK_ACCESS_FS_READ_FILE: Open a file with read access.
+ *
+ * A directory can receive access rights related to files or directories.  The
+ * following access right is applied to the directory itself, and the
+ * directories beneath it:
+ *
+ * - %LANDLOCK_ACCESS_FS_READ_DIR: Open a directory or list its content.
+ *
+ * However, the following access rights only apply to the content of a
+ * directory, not the directory itself:
+ *
+ * - %LANDLOCK_ACCESS_FS_REMOVE_DIR: Remove an empty directory or rename one.
+ * - %LANDLOCK_ACCESS_FS_REMOVE_FILE: Unlink (or rename) a file.
+ * - %LANDLOCK_ACCESS_FS_MAKE_CHAR: Create (or rename or link) a character
+ *   device.
+ * - %LANDLOCK_ACCESS_FS_MAKE_DIR: Create (or rename) a directory.
+ * - %LANDLOCK_ACCESS_FS_MAKE_REG: Create (or rename or link) a regular file.
+ * - %LANDLOCK_ACCESS_FS_MAKE_SOCK: Create (or rename or link) a UNIX domain
+ *   socket.
+ * - %LANDLOCK_ACCESS_FS_MAKE_FIFO: Create (or rename or link) a named pipe.
+ * - %LANDLOCK_ACCESS_FS_MAKE_BLOCK: Create (or rename or link) a block device.
+ * - %LANDLOCK_ACCESS_FS_MAKE_SYM: Create (or rename or link) a symbolic link.
+ *
+ * .. warning::
+ *
+ *   It is currently not possible to restrict some file-related actions
+ *   accessible through these syscall families: :manpage:`chdir(2)`,
+ *   :manpage:`truncate(2)`, :manpage:`stat(2)`, :manpage:`flock(2)`,
+ *   :manpage:`chmod(2)`, :manpage:`chown(2)`, :manpage:`setxattr(2)`,
+ *   :manpage:`utime(2)`, :manpage:`ioctl(2)`, :manpage:`fcntl(2)`,
+ *   :manpage:`access(2)`.
+ *   Future Landlock evolutions will enable to restrict them.
+ */
+#define LANDLOCK_ACCESS_FS_EXECUTE			(1ULL << 0)
+#define LANDLOCK_ACCESS_FS_WRITE_FILE			(1ULL << 1)
+#define LANDLOCK_ACCESS_FS_READ_FILE			(1ULL << 2)
+#define LANDLOCK_ACCESS_FS_READ_DIR			(1ULL << 3)
+#define LANDLOCK_ACCESS_FS_REMOVE_DIR			(1ULL << 4)
+#define LANDLOCK_ACCESS_FS_REMOVE_FILE			(1ULL << 5)
+#define LANDLOCK_ACCESS_FS_MAKE_CHAR			(1ULL << 6)
+#define LANDLOCK_ACCESS_FS_MAKE_DIR			(1ULL << 7)
+#define LANDLOCK_ACCESS_FS_MAKE_REG			(1ULL << 8)
+#define LANDLOCK_ACCESS_FS_MAKE_SOCK			(1ULL << 9)
+#define LANDLOCK_ACCESS_FS_MAKE_FIFO			(1ULL << 10)
+#define LANDLOCK_ACCESS_FS_MAKE_BLOCK			(1ULL << 11)
+#define LANDLOCK_ACCESS_FS_MAKE_SYM			(1ULL << 12)
+
+#endif /* _UAPI_LINUX_LANDLOCK_H */
diff --git a/security/landlock/Kconfig b/security/landlock/Kconfig
index c1e862a38410..8e33c4e8ffb8 100644
--- a/security/landlock/Kconfig
+++ b/security/landlock/Kconfig
@@ -2,7 +2,7 @@
 
 config SECURITY_LANDLOCK
 	bool "Landlock support"
-	depends on SECURITY
+	depends on SECURITY && !ARCH_EPHEMERAL_INODES
 	select SECURITY_PATH
 	help
 	  Landlock is a sandboxing mechanism that enables processes to restrict
diff --git a/security/landlock/Makefile b/security/landlock/Makefile
index f1d1eb72fa76..92e3d80ab8ed 100644
--- a/security/landlock/Makefile
+++ b/security/landlock/Makefile
@@ -1,4 +1,4 @@
 obj-$(CONFIG_SECURITY_LANDLOCK) := landlock.o
 
 landlock-y := setup.o object.o ruleset.o \
-	cred.o ptrace.o
+	cred.o ptrace.o fs.o
diff --git a/security/landlock/fs.c b/security/landlock/fs.c
new file mode 100644
index 000000000000..97b8e421f617
--- /dev/null
+++ b/security/landlock/fs.c
@@ -0,0 +1,692 @@
+// SPDX-License-Identifier: GPL-2.0-only
+/*
+ * Landlock LSM - Filesystem management and hooks
+ *
+ * Copyright © 2016-2020 Mickaël Salaün <mic@digikod.net>
+ * Copyright © 2018-2020 ANSSI
+ */
+
+#include <linux/atomic.h>
+#include <linux/bitops.h>
+#include <linux/bits.h>
+#include <linux/compiler_types.h>
+#include <linux/dcache.h>
+#include <linux/err.h>
+#include <linux/fs.h>
+#include <linux/init.h>
+#include <linux/kernel.h>
+#include <linux/limits.h>
+#include <linux/list.h>
+#include <linux/lsm_hooks.h>
+#include <linux/mount.h>
+#include <linux/namei.h>
+#include <linux/path.h>
+#include <linux/rcupdate.h>
+#include <linux/spinlock.h>
+#include <linux/stat.h>
+#include <linux/types.h>
+#include <linux/wait_bit.h>
+#include <linux/workqueue.h>
+#include <uapi/linux/landlock.h>
+
+#include "common.h"
+#include "cred.h"
+#include "fs.h"
+#include "limits.h"
+#include "object.h"
+#include "ruleset.h"
+#include "setup.h"
+
+/* Underlying object management */
+
+static void release_inode(struct landlock_object *const object)
+	__releases(object->lock)
+{
+	struct inode *const inode = object->underobj;
+	struct super_block *sb;
+
+	if (!inode) {
+		spin_unlock(&object->lock);
+		return;
+	}
+
+	/*
+	 * Protects against concurrent use by hook_sb_delete() of the reference
+	 * to the underlying inode.
+	 */
+	object->underobj = NULL;
+	/*
+	 * Makes sure that if the filesystem is concurrently unmounted,
+	 * hook_sb_delete() will wait for us to finish iput().
+	 */
+	sb = inode->i_sb;
+	atomic_long_inc(&landlock_superblock(sb)->inode_refs);
+	spin_unlock(&object->lock);
+	/*
+	 * Because object->underobj was not NULL, hook_sb_delete() and
+	 * get_inode_object() guarantee that it is safe to reset
+	 * landlock_inode(inode)->object while it is not NULL.  It is therefore
+	 * not necessary to lock inode->i_lock.
+	 */
+	rcu_assign_pointer(landlock_inode(inode)->object, NULL);
+	/*
+	 * Now, new rules can safely be tied to @inode with get_inode_object().
+	 */
+
+	iput(inode);
+	if (atomic_long_dec_and_test(&landlock_superblock(sb)->inode_refs))
+		wake_up_var(&landlock_superblock(sb)->inode_refs);
+}
+
+static const struct landlock_object_underops landlock_fs_underops = {
+	.release = release_inode
+};
+
+/* Ruleset management */
+
+static struct landlock_object *get_inode_object(struct inode *const inode)
+{
+	struct landlock_object *object, *new_object;
+	struct landlock_inode_security *inode_sec = landlock_inode(inode);
+
+	rcu_read_lock();
+retry:
+	object = rcu_dereference(inode_sec->object);
+	if (object) {
+		if (likely(refcount_inc_not_zero(&object->usage))) {
+			rcu_read_unlock();
+			return object;
+		}
+		/*
+		 * We are racing with release_inode(), the object is going
+		 * away.  Wait for release_inode(), then retry.
+		 */
+		spin_lock(&object->lock);
+		spin_unlock(&object->lock);
+		goto retry;
+	}
+	rcu_read_unlock();
+
+	/*
+	 * If there is no object tied to @inode, then create a new one (without
+	 * holding any locks).
+	 */
+	new_object = landlock_create_object(&landlock_fs_underops, inode);
+	if (IS_ERR(new_object))
+		return new_object;
+
+	/*
+	 * Protects against concurrent calls to get_inode_object() or
+	 * hook_sb_delete().
+	 */
+	spin_lock(&inode->i_lock);
+	if (unlikely(rcu_access_pointer(inode_sec->object))) {
+		/* Someone else just created the object, bail out and retry. */
+		spin_unlock(&inode->i_lock);
+		kfree(new_object);
+
+		rcu_read_lock();
+		goto retry;
+	}
+
+	/*
+	 * @inode will be released by hook_sb_delete() on its superblock
+	 * shutdown, or by release_inode() when no more ruleset references the
+	 * related object.
+	 */
+	ihold(inode);
+	rcu_assign_pointer(inode_sec->object, new_object);
+	spin_unlock(&inode->i_lock);
+	return new_object;
+}
+
+/* All access rights that can be tied to files. */
+#define ACCESS_FILE ( \
+	LANDLOCK_ACCESS_FS_EXECUTE | \
+	LANDLOCK_ACCESS_FS_WRITE_FILE | \
+	LANDLOCK_ACCESS_FS_READ_FILE)
+
+/*
+ * @path: Should have been checked by get_path_from_fd().
+ */
+int landlock_append_fs_rule(struct landlock_ruleset *const ruleset,
+		const struct path *const path, u32 access_rights)
+{
+	int err;
+	struct landlock_object *object;
+
+	/* Files only get access rights that make sense. */
+	if (!d_is_dir(path->dentry) && (access_rights | ACCESS_FILE) !=
+			ACCESS_FILE)
+		return -EINVAL;
+	if (WARN_ON_ONCE(ruleset->num_layers != 1))
+		return -EINVAL;
+
+	/* Transforms relative access rights to absolute ones. */
+	access_rights |= LANDLOCK_MASK_ACCESS_FS & ~ruleset->fs_access_masks[0];
+	object = get_inode_object(d_backing_inode(path->dentry));
+	if (IS_ERR(object))
+		return PTR_ERR(object);
+	mutex_lock(&ruleset->lock);
+	err = landlock_insert_rule(ruleset, object, access_rights);
+	mutex_unlock(&ruleset->lock);
+	/*
+	 * No need to check for an error because landlock_insert_rule()
+	 * increments the refcount for the new object if needed.
+	 */
+	landlock_put_object(object);
+	return err;
+}
+
+/* Access-control management */
+
+static inline u64 unmask_layers(
+		const struct landlock_ruleset *const domain,
+		const struct path *const path, const u32 access_request,
+		u64 layer_mask)
+{
+	const struct landlock_rule *rule;
+	const struct inode *inode;
+	size_t i;
+
+	if (d_is_negative(path->dentry))
+		/* Ignore nonexistent leafs. */
+		return layer_mask;
+	inode = d_backing_inode(path->dentry);
+	rcu_read_lock();
+	rule = landlock_find_rule(domain,
+			rcu_dereference(landlock_inode(inode)->object));
+	rcu_read_unlock();
+	if (!rule)
+		return layer_mask;
+
+	/*
+	 * An access is granted if, for each policy layer, at least one rule
+	 * encountered on the pathwalk grants the requested accesses,
+	 * regardless of their position in the layer stack.  We must then check
+	 * the remaining layers for each inode, from the first added layer to
+	 * the last one.
+	 */
+	for (i = 0; i < rule->num_layers; i++) {
+		const struct landlock_layer *const layer = &rule->layers[i];
+		const u64 layer_level = BIT_ULL(layer->level - 1);
+
+		/* Checks that the layer grants access to the full request. */
+		if ((layer->access & access_request) == access_request) {
+			layer_mask &= ~layer_level;
+
+			if (layer_mask == 0)
+				return layer_mask;
+		}
+	}
+	return layer_mask;
+}
+
+static int check_access_path(const struct landlock_ruleset *const domain,
+		const struct path *const path, u32 access_request)
+{
+	bool allowed = false;
+	struct path walker_path;
+	u64 layer_mask;
+	size_t i;
+
+	/* Make sure all layers can be checked. */
+	BUILD_BUG_ON(BITS_PER_TYPE(layer_mask) < LANDLOCK_MAX_NUM_LAYERS);
+
+	if (!access_request)
+		return 0;
+	if (WARN_ON_ONCE(!domain || !path))
+		return 0;
+	/*
+	 * Allows access to pseudo filesystems that will never be mountable
+	 * (e.g. sockfs, pipefs), but can still be reachable through
+	 * /proc/<pid>/fd/<file-descriptor> .
+	 */
+	if ((path->dentry->d_sb->s_flags & SB_NOUSER) ||
+			(d_is_positive(path->dentry) &&
+			 unlikely(IS_PRIVATE(d_backing_inode(path->dentry)))))
+		return 0;
+	if (WARN_ON_ONCE(domain->num_layers < 1))
+		return -EACCES;
+
+	/* Saves all layers handling a subset of requested accesses. */
+	layer_mask = 0;
+	for (i = 0; i < domain->num_layers; i++) {
+		if (domain->fs_access_masks[i] & access_request)
+			layer_mask |= BIT_ULL(i);
+	}
+	/* An access request not handled by the domain is allowed. */
+	if (layer_mask == 0)
+		return 0;
+
+	walker_path = *path;
+	path_get(&walker_path);
+	/*
+	 * We need to walk through all the hierarchy to not miss any relevant
+	 * restriction.
+	 */
+	while (true) {
+		struct dentry *parent_dentry;
+
+		layer_mask = unmask_layers(domain, &walker_path,
+				access_request, layer_mask);
+		if (layer_mask == 0) {
+			/* Stops when a rule from each layer grants access. */
+			allowed = true;
+			break;
+		}
+
+jump_up:
+		if (walker_path.dentry == walker_path.mnt->mnt_root) {
+			if (follow_up(&walker_path)) {
+				/* Ignores hidden mount points. */
+				goto jump_up;
+			} else {
+				/*
+				 * Stops at the real root.  Denies access
+				 * because not all layers have granted access.
+				 */
+				allowed = false;
+				break;
+			}
+		}
+		if (unlikely(IS_ROOT(walker_path.dentry))) {
+			/*
+			 * Stops at disconnected root directories.  Only allows
+			 * access to internal filesystems (e.g. nsfs, which is
+			 * reachable through /proc/<pid>/ns/<namespace>).
+			 */
+			allowed = !!(walker_path.mnt->mnt_flags & MNT_INTERNAL);
+			break;
+		}
+		parent_dentry = dget_parent(walker_path.dentry);
+		dput(walker_path.dentry);
+		walker_path.dentry = parent_dentry;
+	}
+	path_put(&walker_path);
+	return allowed ? 0 : -EACCES;
+}
+
+static inline int current_check_access_path(const struct path *const path,
+		const u32 access_request)
+{
+	const struct landlock_ruleset *const dom =
+		landlock_get_current_domain();
+
+	if (!dom)
+		return 0;
+	return check_access_path(dom, path, access_request);
+}
+
+/* Inode hooks */
+
+static void hook_inode_free_security(struct inode *const inode)
+{
+	/*
+	 * All inodes must already have been untied from their object by
+	 * release_inode() or hook_sb_delete().
+	 */
+	WARN_ON_ONCE(landlock_inode(inode)->object);
+}
+
+/* Super-block hooks */
+
+/*
+ * Release the inodes used in a security policy.
+ *
+ * Cf. fsnotify_unmount_inodes() and invalidate_inodes()
+ */
+static void hook_sb_delete(struct super_block *const sb)
+{
+	struct inode *inode, *prev_inode = NULL;
+
+	if (!landlock_initialized)
+		return;
+
+	spin_lock(&sb->s_inode_list_lock);
+	list_for_each_entry(inode, &sb->s_inodes, i_sb_list) {
+		struct landlock_object *object;
+
+		/* Only handles referenced inodes. */
+		if (!atomic_read(&inode->i_count))
+			continue;
+
+		/*
+		 * Protects against concurrent modification of inode (e.g.
+		 * from get_inode_object()).
+		 */
+		spin_lock(&inode->i_lock);
+		/*
+		 * Checks I_FREEING and I_WILL_FREE  to protect against a race
+		 * condition when release_inode() just called iput(), which
+		 * could lead to a NULL dereference of inode->security or a
+		 * second call to iput() for the same Landlock object.  Also
+		 * checks I_NEW because such inode cannot be tied to an object.
+		 */
+		if (inode->i_state & (I_FREEING | I_WILL_FREE | I_NEW)) {
+			spin_unlock(&inode->i_lock);
+			continue;
+		}
+
+		rcu_read_lock();
+		object = rcu_dereference(landlock_inode(inode)->object);
+		if (!object) {
+			rcu_read_unlock();
+			spin_unlock(&inode->i_lock);
+			continue;
+		}
+		/* Keeps a reference to this inode until the next loop walk. */
+		__iget(inode);
+		spin_unlock(&inode->i_lock);
+
+		/*
+		 * If there is no concurrent release_inode() ongoing, then we
+		 * are in charge of calling iput() on this inode, otherwise we
+		 * will just wait for it to finish.
+		 */
+		spin_lock(&object->lock);
+		if (object->underobj == inode) {
+			object->underobj = NULL;
+			spin_unlock(&object->lock);
+			rcu_read_unlock();
+
+			/*
+			 * Because object->underobj was not NULL,
+			 * release_inode() and get_inode_object() guarantee
+			 * that it is safe to reset
+			 * landlock_inode(inode)->object while it is not NULL.
+			 * It is therefore not necessary to lock inode->i_lock.
+			 */
+			rcu_assign_pointer(landlock_inode(inode)->object, NULL);
+			/*
+			 * At this point, we own the ihold() reference that was
+			 * originally set up by get_inode_object() and the
+			 * __iget() reference that we just set in this loop
+			 * walk.  Therefore the following call to iput() will
+			 * not sleep nor drop the inode because there is now at
+			 * least two references to it.
+			 */
+			iput(inode);
+		} else {
+			spin_unlock(&object->lock);
+			rcu_read_unlock();
+		}
+
+		if (prev_inode) {
+			/*
+			 * At this point, we still own the __iget() reference
+			 * that we just set in this loop walk.  Therefore we
+			 * can drop the list lock and know that the inode won't
+			 * disappear from under us until the next loop walk.
+			 */
+			spin_unlock(&sb->s_inode_list_lock);
+			/*
+			 * We can now actually put the inode reference from the
+			 * previous loop walk, which is not needed anymore.
+			 */
+			iput(prev_inode);
+			cond_resched();
+			spin_lock(&sb->s_inode_list_lock);
+		}
+		prev_inode = inode;
+	}
+	spin_unlock(&sb->s_inode_list_lock);
+
+	/* Puts the inode reference from the last loop walk, if any. */
+	if (prev_inode)
+		iput(prev_inode);
+	/* Waits for pending iput() in release_inode(). */
+	wait_var_event(&landlock_superblock(sb)->inode_refs, !atomic_long_read(
+				&landlock_superblock(sb)->inode_refs));
+}
+
+/*
+ * Because a Landlock security policy is defined according to the filesystem
+ * topology (i.e. the mount namespace), changing it may grant access to files
+ * not previously allowed.
+ *
+ * To make it simple, deny any filesystem topology modification by landlocked
+ * processes.  Non-landlocked processes may still change the namespace of a
+ * landlocked process, but this kind of threat must be handled by a system-wide
+ * access-control security policy.
+ *
+ * This could be lifted in the future if Landlock can safely handle mount
+ * namespace updates requested by a landlocked process.  Indeed, we could
+ * update the current domain (which is currently read-only) by taking into
+ * account the accesses of the source and the destination of a new mount point.
+ * However, it would also require to make all the child domains dynamically
+ * inherit these new constraints.  Anyway, for backward compatibility reasons,
+ * a dedicated user space option would be required (e.g. as a ruleset flag).
+ */
+static int hook_sb_mount(const char *const dev_name,
+		const struct path *const path, const char *const type,
+		const unsigned long flags, void *const data)
+{
+	if (!landlock_get_current_domain())
+		return 0;
+	return -EPERM;
+}
+
+static int hook_move_mount(const struct path *const from_path,
+		const struct path *const to_path)
+{
+	if (!landlock_get_current_domain())
+		return 0;
+	return -EPERM;
+}
+
+/*
+ * Removing a mount point may reveal a previously hidden file hierarchy, which
+ * may then grant access to files, which may have previously been forbidden.
+ */
+static int hook_sb_umount(struct vfsmount *const mnt, const int flags)
+{
+	if (!landlock_get_current_domain())
+		return 0;
+	return -EPERM;
+}
+
+static int hook_sb_remount(struct super_block *const sb, void *const mnt_opts)
+{
+	if (!landlock_get_current_domain())
+		return 0;
+	return -EPERM;
+}
+
+/*
+ * pivot_root(2), like mount(2), changes the current mount namespace.  It must
+ * then be forbidden for a landlocked process.
+ *
+ * However, chroot(2) may be allowed because it only changes the relative root
+ * directory of the current process.  Moreover, it can be used to restrict the
+ * view of the filesystem.
+ */
+static int hook_sb_pivotroot(const struct path *const old_path,
+		const struct path *const new_path)
+{
+	if (!landlock_get_current_domain())
+		return 0;
+	return -EPERM;
+}
+
+/* Path hooks */
+
+static inline u32 get_mode_access(const umode_t mode)
+{
+	switch (mode & S_IFMT) {
+	case S_IFLNK:
+		return LANDLOCK_ACCESS_FS_MAKE_SYM;
+	case 0:
+		/* A zero mode translates to S_IFREG. */
+	case S_IFREG:
+		return LANDLOCK_ACCESS_FS_MAKE_REG;
+	case S_IFDIR:
+		return LANDLOCK_ACCESS_FS_MAKE_DIR;
+	case S_IFCHR:
+		return LANDLOCK_ACCESS_FS_MAKE_CHAR;
+	case S_IFBLK:
+		return LANDLOCK_ACCESS_FS_MAKE_BLOCK;
+	case S_IFIFO:
+		return LANDLOCK_ACCESS_FS_MAKE_FIFO;
+	case S_IFSOCK:
+		return LANDLOCK_ACCESS_FS_MAKE_SOCK;
+	default:
+		WARN_ON_ONCE(1);
+		return 0;
+	}
+}
+
+/*
+ * Creating multiple links or renaming may lead to privilege escalations if not
+ * handled properly.  Indeed, we must be sure that the source doesn't gain more
+ * privileges by being accessible from the destination.  This is getting more
+ * complex when dealing with multiple layers.  The whole picture can be seen as
+ * a multilayer partial ordering problem.  A future version of Landlock will
+ * deal with that.
+ */
+static int hook_path_link(struct dentry *const old_dentry,
+		const struct path *const new_dir,
+		struct dentry *const new_dentry)
+{
+	const struct landlock_ruleset *const dom =
+		landlock_get_current_domain();
+
+	if (!dom)
+		return 0;
+	/* The mount points are the same for old and new paths, cf. EXDEV. */
+	if (old_dentry->d_parent != new_dir->dentry)
+		/* Gracefully forbids reparenting. */
+		return -EXDEV;
+	if (unlikely(d_is_negative(old_dentry)))
+		return -ENOENT;
+	return check_access_path(dom, new_dir,
+			get_mode_access(d_backing_inode(old_dentry)->i_mode));
+}
+
+static inline u32 maybe_remove(const struct dentry *const dentry)
+{
+	if (d_is_negative(dentry))
+		return 0;
+	return d_is_dir(dentry) ? LANDLOCK_ACCESS_FS_REMOVE_DIR :
+		LANDLOCK_ACCESS_FS_REMOVE_FILE;
+}
+
+static int hook_path_rename(const struct path *const old_dir,
+		struct dentry *const old_dentry,
+		const struct path *const new_dir,
+		struct dentry *const new_dentry)
+{
+	const struct landlock_ruleset *const dom =
+		landlock_get_current_domain();
+
+	if (!dom)
+		return 0;
+	/* The mount points are the same for old and new paths, cf. EXDEV. */
+	if (old_dir->dentry != new_dir->dentry)
+		/* Gracefully forbids reparenting. */
+		return -EXDEV;
+	if (unlikely(d_is_negative(old_dentry)))
+		return -ENOENT;
+	/* RENAME_EXCHANGE is handled because directories are the same. */
+	return check_access_path(dom, old_dir, maybe_remove(old_dentry) |
+			maybe_remove(new_dentry) |
+			get_mode_access(d_backing_inode(old_dentry)->i_mode));
+}
+
+static int hook_path_mkdir(const struct path *const dir,
+		struct dentry *const dentry, const umode_t mode)
+{
+	return current_check_access_path(dir, LANDLOCK_ACCESS_FS_MAKE_DIR);
+}
+
+static int hook_path_mknod(const struct path *const dir,
+		struct dentry *const dentry, const umode_t mode,
+		const unsigned int dev)
+{
+	const struct landlock_ruleset *const dom =
+		landlock_get_current_domain();
+
+	if (!dom)
+		return 0;
+	return check_access_path(dom, dir, get_mode_access(mode));
+}
+
+static int hook_path_symlink(const struct path *const dir,
+		struct dentry *const dentry, const char *const old_name)
+{
+	return current_check_access_path(dir, LANDLOCK_ACCESS_FS_MAKE_SYM);
+}
+
+static int hook_path_unlink(const struct path *const dir,
+		struct dentry *const dentry)
+{
+	return current_check_access_path(dir, LANDLOCK_ACCESS_FS_REMOVE_FILE);
+}
+
+static int hook_path_rmdir(const struct path *const dir,
+		struct dentry *const dentry)
+{
+	return current_check_access_path(dir, LANDLOCK_ACCESS_FS_REMOVE_DIR);
+}
+
+/* File hooks */
+
+static inline u32 get_file_access(const struct file *const file)
+{
+	u32 access = 0;
+
+	if (file->f_mode & FMODE_READ) {
+		/* A directory can only be opened in read mode. */
+		if (S_ISDIR(file_inode(file)->i_mode))
+			return LANDLOCK_ACCESS_FS_READ_DIR;
+		access = LANDLOCK_ACCESS_FS_READ_FILE;
+	}
+	if (file->f_mode & FMODE_WRITE)
+		access |= LANDLOCK_ACCESS_FS_WRITE_FILE;
+	/* __FMODE_EXEC is indeed part of f_flags, not f_mode. */
+	if (file->f_flags & __FMODE_EXEC)
+		access |= LANDLOCK_ACCESS_FS_EXECUTE;
+	return access;
+}
+
+static int hook_file_open(struct file *const file)
+{
+	const struct landlock_ruleset *const dom =
+		landlock_get_current_domain();
+
+	if (!dom)
+		return 0;
+	/*
+	 * Because a file may be opened with O_PATH, get_file_access() may
+	 * return 0.  This case will be handled with a future Landlock
+	 * evolution.
+	 */
+	return check_access_path(dom, &file->f_path, get_file_access(file));
+}
+
+static struct security_hook_list landlock_hooks[] __lsm_ro_after_init = {
+	LSM_HOOK_INIT(inode_free_security, hook_inode_free_security),
+
+	LSM_HOOK_INIT(sb_delete, hook_sb_delete),
+	LSM_HOOK_INIT(sb_mount, hook_sb_mount),
+	LSM_HOOK_INIT(move_mount, hook_move_mount),
+	LSM_HOOK_INIT(sb_umount, hook_sb_umount),
+	LSM_HOOK_INIT(sb_remount, hook_sb_remount),
+	LSM_HOOK_INIT(sb_pivotroot, hook_sb_pivotroot),
+
+	LSM_HOOK_INIT(path_link, hook_path_link),
+	LSM_HOOK_INIT(path_rename, hook_path_rename),
+	LSM_HOOK_INIT(path_mkdir, hook_path_mkdir),
+	LSM_HOOK_INIT(path_mknod, hook_path_mknod),
+	LSM_HOOK_INIT(path_symlink, hook_path_symlink),
+	LSM_HOOK_INIT(path_unlink, hook_path_unlink),
+	LSM_HOOK_INIT(path_rmdir, hook_path_rmdir),
+
+	LSM_HOOK_INIT(file_open, hook_file_open),
+};
+
+__init void landlock_add_fs_hooks(void)
+{
+	security_add_hooks(landlock_hooks, ARRAY_SIZE(landlock_hooks),
+			LANDLOCK_NAME);
+}
diff --git a/security/landlock/fs.h b/security/landlock/fs.h
new file mode 100644
index 000000000000..187284b421c9
--- /dev/null
+++ b/security/landlock/fs.h
@@ -0,0 +1,70 @@
+/* SPDX-License-Identifier: GPL-2.0-only */
+/*
+ * Landlock LSM - Filesystem management and hooks
+ *
+ * Copyright © 2017-2020 Mickaël Salaün <mic@digikod.net>
+ * Copyright © 2018-2020 ANSSI
+ */
+
+#ifndef _SECURITY_LANDLOCK_FS_H
+#define _SECURITY_LANDLOCK_FS_H
+
+#include <linux/fs.h>
+#include <linux/init.h>
+#include <linux/rcupdate.h>
+
+#include "ruleset.h"
+#include "setup.h"
+
+/**
+ * struct landlock_inode_security - Inode security blob
+ *
+ * Enable to reference a &struct landlock_object tied to an inode (i.e.
+ * underlying object).
+ */
+struct landlock_inode_security {
+	/**
+	 * @object: Weak pointer to an allocated object.  All assignments of a
+	 * new object are protected by the underlying inode->i_lock.  However,
+	 * atomically disassociating @object from the inode is only protected
+	 * by @object->lock, from the time @object's usage refcount drops to
+	 * zero to the time this pointer is nulled out (cf. release_inode() and
+	 * hook_sb_delete()).  Indeed, such disassociation doesn't require
+	 * inode->i_lock thanks to the careful rcu_access_pointer() check
+	 * performed by get_inode_object().
+	 */
+	struct landlock_object __rcu *object;
+};
+
+/**
+ * struct landlock_superblock_security - Superblock security blob
+ *
+ * Enable hook_sb_delete() to wait for concurrent calls to release_inode().
+ */
+struct landlock_superblock_security {
+	/**
+	 * @inode_refs: Number of pending inodes (from this superblock) that
+	 * are being released by release_inode().
+	 * Cf. struct super_block->s_fsnotify_inode_refs .
+	 */
+	atomic_long_t inode_refs;
+};
+
+static inline struct landlock_inode_security *landlock_inode(
+		const struct inode *const inode)
+{
+	return inode->i_security + landlock_blob_sizes.lbs_inode;
+}
+
+static inline struct landlock_superblock_security *landlock_superblock(
+		const struct super_block *const superblock)
+{
+	return superblock->s_security + landlock_blob_sizes.lbs_superblock;
+}
+
+__init void landlock_add_fs_hooks(void);
+
+int landlock_append_fs_rule(struct landlock_ruleset *const ruleset,
+		const struct path *const path, u32 access_hierarchy);
+
+#endif /* _SECURITY_LANDLOCK_FS_H */
diff --git a/security/landlock/limits.h b/security/landlock/limits.h
index b734f597bb0e..2a0a1095ee27 100644
--- a/security/landlock/limits.h
+++ b/security/landlock/limits.h
@@ -10,8 +10,12 @@
 #define _SECURITY_LANDLOCK_LIMITS_H
 
 #include <linux/limits.h>
+#include <uapi/linux/landlock.h>
 
 #define LANDLOCK_MAX_NUM_LAYERS		64
 #define LANDLOCK_MAX_NUM_RULES		U32_MAX
 
+#define LANDLOCK_LAST_ACCESS_FS		LANDLOCK_ACCESS_FS_MAKE_SYM
+#define LANDLOCK_MASK_ACCESS_FS		((LANDLOCK_LAST_ACCESS_FS << 1) - 1)
+
 #endif /* _SECURITY_LANDLOCK_LIMITS_H */
diff --git a/security/landlock/ruleset.c b/security/landlock/ruleset.c
index 2e616f6d5274..ec72b9262bf3 100644
--- a/security/landlock/ruleset.c
+++ b/security/landlock/ruleset.c
@@ -116,9 +116,11 @@ static void build_check_ruleset(void)
 		.num_rules = ~0,
 		.num_layers = ~0,
 	};
+	typeof(ruleset.fs_access_masks[0]) fs_access_mask = ~0;
 
 	BUILD_BUG_ON(ruleset.num_rules < LANDLOCK_MAX_NUM_RULES);
 	BUILD_BUG_ON(ruleset.num_layers < LANDLOCK_MAX_NUM_LAYERS);
+	BUILD_BUG_ON(fs_access_mask < LANDLOCK_MASK_ACCESS_FS);
 }
 
 /**
@@ -217,9 +219,11 @@ static void build_check_layer(void)
 {
 	const struct landlock_layer layer = {
 		.level = ~0,
+		.access = ~0,
 	};
 
 	BUILD_BUG_ON(layer.level < LANDLOCK_MAX_NUM_LAYERS);
+	BUILD_BUG_ON(layer.access < LANDLOCK_MASK_ACCESS_FS);
 }
 
 /* @ruleset must be locked by the caller. */
diff --git a/security/landlock/setup.c b/security/landlock/setup.c
index a5d6ef334991..f8e8e980454c 100644
--- a/security/landlock/setup.c
+++ b/security/landlock/setup.c
@@ -11,17 +11,24 @@
 
 #include "common.h"
 #include "cred.h"
+#include "fs.h"
 #include "ptrace.h"
 #include "setup.h"
 
+bool landlock_initialized __lsm_ro_after_init = false;
+
 struct lsm_blob_sizes landlock_blob_sizes __lsm_ro_after_init = {
 	.lbs_cred = sizeof(struct landlock_cred_security),
+	.lbs_inode = sizeof(struct landlock_inode_security),
+	.lbs_superblock = sizeof(struct landlock_superblock_security),
 };
 
 static int __init landlock_init(void)
 {
 	landlock_add_cred_hooks();
 	landlock_add_ptrace_hooks();
+	landlock_add_fs_hooks();
+	landlock_initialized = true;
 	pr_info("Up and running.\n");
 	return 0;
 }
diff --git a/security/landlock/setup.h b/security/landlock/setup.h
index 9fdbf33fcc33..1daffab1ab4b 100644
--- a/security/landlock/setup.h
+++ b/security/landlock/setup.h
@@ -11,6 +11,8 @@
 
 #include <linux/lsm_hooks.h>
 
+extern bool landlock_initialized;
+
 extern struct lsm_blob_sizes landlock_blob_sizes;
 
 #endif /* _SECURITY_LANDLOCK_SETUP_H */
-- 
cgit v1.2.3-71-gd317


From 265885daf3e5082eb9f6e2a23bdbf9ba4456a21b Mon Sep 17 00:00:00 2001
From: Mickaël Salaün <mic@linux.microsoft.com>
Date: Thu, 22 Apr 2021 17:41:18 +0200
Subject: landlock: Add syscall implementations
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

These 3 system calls are designed to be used by unprivileged processes
to sandbox themselves:
* landlock_create_ruleset(2): Creates a ruleset and returns its file
  descriptor.
* landlock_add_rule(2): Adds a rule (e.g. file hierarchy access) to a
  ruleset, identified by the dedicated file descriptor.
* landlock_restrict_self(2): Enforces a ruleset on the calling thread
  and its future children (similar to seccomp).  This syscall has the
  same usage restrictions as seccomp(2): the caller must have the
  no_new_privs attribute set or have CAP_SYS_ADMIN in the current user
  namespace.

All these syscalls have a "flags" argument (not currently used) to
enable extensibility.

Here are the motivations for these new syscalls:
* A sandboxed process may not have access to file systems, including
  /dev, /sys or /proc, but it should still be able to add more
  restrictions to itself.
* Neither prctl(2) nor seccomp(2) (which was used in a previous version)
  fit well with the current definition of a Landlock security policy.

All passed structs (attributes) are checked at build time to ensure that
they don't contain holes and that they are aligned the same way for each
architecture.

See the user and kernel documentation for more details (provided by a
following commit):
* Documentation/userspace-api/landlock.rst
* Documentation/security/landlock.rst

Cc: Arnd Bergmann <arnd@arndb.de>
Cc: James Morris <jmorris@namei.org>
Cc: Jann Horn <jannh@google.com>
Cc: Kees Cook <keescook@chromium.org>
Signed-off-by: Mickaël Salaün <mic@linux.microsoft.com>
Acked-by: Serge Hallyn <serge@hallyn.com>
Link: https://lore.kernel.org/r/20210422154123.13086-9-mic@digikod.net
Signed-off-by: James Morris <jamorris@linux.microsoft.com>
---
 include/linux/syscalls.h      |   7 +
 include/uapi/linux/landlock.h |  53 +++++
 kernel/sys_ni.c               |   5 +
 security/landlock/Makefile    |   2 +-
 security/landlock/syscalls.c  | 442 ++++++++++++++++++++++++++++++++++++++++++
 5 files changed, 508 insertions(+), 1 deletion(-)
 create mode 100644 security/landlock/syscalls.c

(limited to 'include/uapi/linux')

diff --git a/include/linux/syscalls.h b/include/linux/syscalls.h
index 2839dc9a7c01..fa3971012e1c 100644
--- a/include/linux/syscalls.h
+++ b/include/linux/syscalls.h
@@ -69,6 +69,8 @@ struct io_uring_params;
 struct clone_args;
 struct open_how;
 struct mount_attr;
+struct landlock_ruleset_attr;
+enum landlock_rule_type;
 
 #include <linux/types.h>
 #include <linux/aio_abi.h>
@@ -1041,6 +1043,11 @@ asmlinkage long sys_pidfd_send_signal(int pidfd, int sig,
 				       siginfo_t __user *info,
 				       unsigned int flags);
 asmlinkage long sys_pidfd_getfd(int pidfd, int fd, unsigned int flags);
+asmlinkage long sys_landlock_create_ruleset(const struct landlock_ruleset_attr __user *attr,
+		size_t size, __u32 flags);
+asmlinkage long sys_landlock_add_rule(int ruleset_fd, enum landlock_rule_type rule_type,
+		const void __user *rule_attr, __u32 flags);
+asmlinkage long sys_landlock_restrict_self(int ruleset_fd, __u32 flags);
 
 /*
  * Architecture-specific system calls
diff --git a/include/uapi/linux/landlock.h b/include/uapi/linux/landlock.h
index b1a81b5a8b86..ba946a1e40b2 100644
--- a/include/uapi/linux/landlock.h
+++ b/include/uapi/linux/landlock.h
@@ -9,6 +9,59 @@
 #ifndef _UAPI_LINUX_LANDLOCK_H
 #define _UAPI_LINUX_LANDLOCK_H
 
+#include <linux/types.h>
+
+/**
+ * struct landlock_ruleset_attr - Ruleset definition
+ *
+ * Argument of sys_landlock_create_ruleset().  This structure can grow in
+ * future versions.
+ */
+struct landlock_ruleset_attr {
+	/**
+	 * @handled_access_fs: Bitmask of actions (cf. `Filesystem flags`_)
+	 * that is handled by this ruleset and should then be forbidden if no
+	 * rule explicitly allow them.  This is needed for backward
+	 * compatibility reasons.
+	 */
+	__u64 handled_access_fs;
+};
+
+/**
+ * enum landlock_rule_type - Landlock rule type
+ *
+ * Argument of sys_landlock_add_rule().
+ */
+enum landlock_rule_type {
+	/**
+	 * @LANDLOCK_RULE_PATH_BENEATH: Type of a &struct
+	 * landlock_path_beneath_attr .
+	 */
+	LANDLOCK_RULE_PATH_BENEATH = 1,
+};
+
+/**
+ * struct landlock_path_beneath_attr - Path hierarchy definition
+ *
+ * Argument of sys_landlock_add_rule().
+ */
+struct landlock_path_beneath_attr {
+	/**
+	 * @allowed_access: Bitmask of allowed actions for this file hierarchy
+	 * (cf. `Filesystem flags`_).
+	 */
+	__u64 allowed_access;
+	/**
+	 * @parent_fd: File descriptor, open with ``O_PATH``, which identifies
+	 * the parent directory of a file hierarchy, or just a file.
+	 */
+	__s32 parent_fd;
+	/*
+	 * This struct is packed to avoid trailing reserved members.
+	 * Cf. security/landlock/syscalls.c:build_check_abi()
+	 */
+} __attribute__((packed));
+
 /**
  * DOC: fs_access
  *
diff --git a/kernel/sys_ni.c b/kernel/sys_ni.c
index 19aa806890d5..cce430cf2ff2 100644
--- a/kernel/sys_ni.c
+++ b/kernel/sys_ni.c
@@ -266,6 +266,11 @@ COND_SYSCALL(request_key);
 COND_SYSCALL(keyctl);
 COND_SYSCALL_COMPAT(keyctl);
 
+/* security/landlock/syscalls.c */
+COND_SYSCALL(landlock_create_ruleset);
+COND_SYSCALL(landlock_add_rule);
+COND_SYSCALL(landlock_restrict_self);
+
 /* arch/example/kernel/sys_example.c */
 
 /* mm/fadvise.c */
diff --git a/security/landlock/Makefile b/security/landlock/Makefile
index 92e3d80ab8ed..7bbd2f413b3e 100644
--- a/security/landlock/Makefile
+++ b/security/landlock/Makefile
@@ -1,4 +1,4 @@
 obj-$(CONFIG_SECURITY_LANDLOCK) := landlock.o
 
-landlock-y := setup.o object.o ruleset.o \
+landlock-y := setup.o syscalls.o object.o ruleset.o \
 	cred.o ptrace.o fs.o
diff --git a/security/landlock/syscalls.c b/security/landlock/syscalls.c
new file mode 100644
index 000000000000..93620ad7593b
--- /dev/null
+++ b/security/landlock/syscalls.c
@@ -0,0 +1,442 @@
+// SPDX-License-Identifier: GPL-2.0-only
+/*
+ * Landlock LSM - System call implementations and user space interfaces
+ *
+ * Copyright © 2016-2020 Mickaël Salaün <mic@digikod.net>
+ * Copyright © 2018-2020 ANSSI
+ */
+
+#include <asm/current.h>
+#include <linux/anon_inodes.h>
+#include <linux/build_bug.h>
+#include <linux/capability.h>
+#include <linux/compiler_types.h>
+#include <linux/dcache.h>
+#include <linux/err.h>
+#include <linux/errno.h>
+#include <linux/fs.h>
+#include <linux/limits.h>
+#include <linux/mount.h>
+#include <linux/path.h>
+#include <linux/sched.h>
+#include <linux/security.h>
+#include <linux/stddef.h>
+#include <linux/syscalls.h>
+#include <linux/types.h>
+#include <linux/uaccess.h>
+#include <uapi/linux/landlock.h>
+
+#include "cred.h"
+#include "fs.h"
+#include "limits.h"
+#include "ruleset.h"
+#include "setup.h"
+
+/**
+ * copy_min_struct_from_user - Safe future-proof argument copying
+ *
+ * Extend copy_struct_from_user() to check for consistent user buffer.
+ *
+ * @dst: Kernel space pointer or NULL.
+ * @ksize: Actual size of the data pointed to by @dst.
+ * @ksize_min: Minimal required size to be copied.
+ * @src: User space pointer or NULL.
+ * @usize: (Alleged) size of the data pointed to by @src.
+ */
+static __always_inline int copy_min_struct_from_user(void *const dst,
+		const size_t ksize, const size_t ksize_min,
+		const void __user *const src, const size_t usize)
+{
+	/* Checks buffer inconsistencies. */
+	BUILD_BUG_ON(!dst);
+	if (!src)
+		return -EFAULT;
+
+	/* Checks size ranges. */
+	BUILD_BUG_ON(ksize <= 0);
+	BUILD_BUG_ON(ksize < ksize_min);
+	if (usize < ksize_min)
+		return -EINVAL;
+	if (usize > PAGE_SIZE)
+		return -E2BIG;
+
+	/* Copies user buffer and fills with zeros. */
+	return copy_struct_from_user(dst, ksize, src, usize);
+}
+
+/*
+ * This function only contains arithmetic operations with constants, leading to
+ * BUILD_BUG_ON().  The related code is evaluated and checked at build time,
+ * but it is then ignored thanks to compiler optimizations.
+ */
+static void build_check_abi(void)
+{
+	struct landlock_ruleset_attr ruleset_attr;
+	struct landlock_path_beneath_attr path_beneath_attr;
+	size_t ruleset_size, path_beneath_size;
+
+	/*
+	 * For each user space ABI structures, first checks that there is no
+	 * hole in them, then checks that all architectures have the same
+	 * struct size.
+	 */
+	ruleset_size = sizeof(ruleset_attr.handled_access_fs);
+	BUILD_BUG_ON(sizeof(ruleset_attr) != ruleset_size);
+	BUILD_BUG_ON(sizeof(ruleset_attr) != 8);
+
+	path_beneath_size = sizeof(path_beneath_attr.allowed_access);
+	path_beneath_size += sizeof(path_beneath_attr.parent_fd);
+	BUILD_BUG_ON(sizeof(path_beneath_attr) != path_beneath_size);
+	BUILD_BUG_ON(sizeof(path_beneath_attr) != 12);
+}
+
+/* Ruleset handling */
+
+static int fop_ruleset_release(struct inode *const inode,
+		struct file *const filp)
+{
+	struct landlock_ruleset *ruleset = filp->private_data;
+
+	landlock_put_ruleset(ruleset);
+	return 0;
+}
+
+static ssize_t fop_dummy_read(struct file *const filp, char __user *const buf,
+		const size_t size, loff_t *const ppos)
+{
+	/* Dummy handler to enable FMODE_CAN_READ. */
+	return -EINVAL;
+}
+
+static ssize_t fop_dummy_write(struct file *const filp,
+		const char __user *const buf, const size_t size,
+		loff_t *const ppos)
+{
+	/* Dummy handler to enable FMODE_CAN_WRITE. */
+	return -EINVAL;
+}
+
+/*
+ * A ruleset file descriptor enables to build a ruleset by adding (i.e.
+ * writing) rule after rule, without relying on the task's context.  This
+ * reentrant design is also used in a read way to enforce the ruleset on the
+ * current task.
+ */
+static const struct file_operations ruleset_fops = {
+	.release = fop_ruleset_release,
+	.read = fop_dummy_read,
+	.write = fop_dummy_write,
+};
+
+/**
+ * sys_landlock_create_ruleset - Create a new ruleset
+ *
+ * @attr: Pointer to a &struct landlock_ruleset_attr identifying the scope of
+ *        the new ruleset.
+ * @size: Size of the pointed &struct landlock_ruleset_attr (needed for
+ *        backward and forward compatibility).
+ * @flags: Must be 0.
+ *
+ * This system call enables to create a new Landlock ruleset, and returns the
+ * related file descriptor on success.
+ *
+ * Possible returned errors are:
+ *
+ * - EOPNOTSUPP: Landlock is supported by the kernel but disabled at boot time;
+ * - EINVAL: @flags is not 0, or unknown access, or too small @size;
+ * - E2BIG or EFAULT: @attr or @size inconsistencies;
+ * - ENOMSG: empty &landlock_ruleset_attr.handled_access_fs.
+ */
+SYSCALL_DEFINE3(landlock_create_ruleset,
+		const struct landlock_ruleset_attr __user *const, attr,
+		const size_t, size, const __u32, flags)
+{
+	struct landlock_ruleset_attr ruleset_attr;
+	struct landlock_ruleset *ruleset;
+	int err, ruleset_fd;
+
+	/* Build-time checks. */
+	build_check_abi();
+
+	if (!landlock_initialized)
+		return -EOPNOTSUPP;
+
+	/* No flag for now. */
+	if (flags)
+		return -EINVAL;
+
+	/* Copies raw user space buffer. */
+	err = copy_min_struct_from_user(&ruleset_attr, sizeof(ruleset_attr),
+			offsetofend(typeof(ruleset_attr), handled_access_fs),
+			attr, size);
+	if (err)
+		return err;
+
+	/* Checks content (and 32-bits cast). */
+	if ((ruleset_attr.handled_access_fs | LANDLOCK_MASK_ACCESS_FS) !=
+			LANDLOCK_MASK_ACCESS_FS)
+		return -EINVAL;
+
+	/* Checks arguments and transforms to kernel struct. */
+	ruleset = landlock_create_ruleset(ruleset_attr.handled_access_fs);
+	if (IS_ERR(ruleset))
+		return PTR_ERR(ruleset);
+
+	/* Creates anonymous FD referring to the ruleset. */
+	ruleset_fd = anon_inode_getfd("landlock-ruleset", &ruleset_fops,
+			ruleset, O_RDWR | O_CLOEXEC);
+	if (ruleset_fd < 0)
+		landlock_put_ruleset(ruleset);
+	return ruleset_fd;
+}
+
+/*
+ * Returns an owned ruleset from a FD. It is thus needed to call
+ * landlock_put_ruleset() on the return value.
+ */
+static struct landlock_ruleset *get_ruleset_from_fd(const int fd,
+		const fmode_t mode)
+{
+	struct fd ruleset_f;
+	struct landlock_ruleset *ruleset;
+
+	ruleset_f = fdget(fd);
+	if (!ruleset_f.file)
+		return ERR_PTR(-EBADF);
+
+	/* Checks FD type and access right. */
+	if (ruleset_f.file->f_op != &ruleset_fops) {
+		ruleset = ERR_PTR(-EBADFD);
+		goto out_fdput;
+	}
+	if (!(ruleset_f.file->f_mode & mode)) {
+		ruleset = ERR_PTR(-EPERM);
+		goto out_fdput;
+	}
+	ruleset = ruleset_f.file->private_data;
+	if (WARN_ON_ONCE(ruleset->num_layers != 1)) {
+		ruleset = ERR_PTR(-EINVAL);
+		goto out_fdput;
+	}
+	landlock_get_ruleset(ruleset);
+
+out_fdput:
+	fdput(ruleset_f);
+	return ruleset;
+}
+
+/* Path handling */
+
+/*
+ * @path: Must call put_path(@path) after the call if it succeeded.
+ */
+static int get_path_from_fd(const s32 fd, struct path *const path)
+{
+	struct fd f;
+	int err = 0;
+
+	BUILD_BUG_ON(!__same_type(fd,
+		((struct landlock_path_beneath_attr *)NULL)->parent_fd));
+
+	/* Handles O_PATH. */
+	f = fdget_raw(fd);
+	if (!f.file)
+		return -EBADF;
+	/*
+	 * Forbids ruleset FDs, internal filesystems (e.g. nsfs), including
+	 * pseudo filesystems that will never be mountable (e.g. sockfs,
+	 * pipefs).
+	 */
+	if ((f.file->f_op == &ruleset_fops) ||
+			(f.file->f_path.mnt->mnt_flags & MNT_INTERNAL) ||
+			(f.file->f_path.dentry->d_sb->s_flags & SB_NOUSER) ||
+			d_is_negative(f.file->f_path.dentry) ||
+			IS_PRIVATE(d_backing_inode(f.file->f_path.dentry))) {
+		err = -EBADFD;
+		goto out_fdput;
+	}
+	*path = f.file->f_path;
+	path_get(path);
+
+out_fdput:
+	fdput(f);
+	return err;
+}
+
+/**
+ * sys_landlock_add_rule - Add a new rule to a ruleset
+ *
+ * @ruleset_fd: File descriptor tied to the ruleset that should be extended
+ *		with the new rule.
+ * @rule_type: Identify the structure type pointed to by @rule_attr (only
+ *             LANDLOCK_RULE_PATH_BENEATH for now).
+ * @rule_attr: Pointer to a rule (only of type &struct
+ *             landlock_path_beneath_attr for now).
+ * @flags: Must be 0.
+ *
+ * This system call enables to define a new rule and add it to an existing
+ * ruleset.
+ *
+ * Possible returned errors are:
+ *
+ * - EOPNOTSUPP: Landlock is supported by the kernel but disabled at boot time;
+ * - EINVAL: @flags is not 0, or inconsistent access in the rule (i.e.
+ *   &landlock_path_beneath_attr.allowed_access is not a subset of the rule's
+ *   accesses);
+ * - ENOMSG: Empty accesses (e.g. &landlock_path_beneath_attr.allowed_access);
+ * - EBADF: @ruleset_fd is not a file descriptor for the current thread, or a
+ *   member of @rule_attr is not a file descriptor as expected;
+ * - EBADFD: @ruleset_fd is not a ruleset file descriptor, or a member of
+ *   @rule_attr is not the expected file descriptor type (e.g. file open
+ *   without O_PATH);
+ * - EPERM: @ruleset_fd has no write access to the underlying ruleset;
+ * - EFAULT: @rule_attr inconsistency.
+ */
+SYSCALL_DEFINE4(landlock_add_rule,
+		const int, ruleset_fd, const enum landlock_rule_type, rule_type,
+		const void __user *const, rule_attr, const __u32, flags)
+{
+	struct landlock_path_beneath_attr path_beneath_attr;
+	struct path path;
+	struct landlock_ruleset *ruleset;
+	int res, err;
+
+	if (!landlock_initialized)
+		return -EOPNOTSUPP;
+
+	/* No flag for now. */
+	if (flags)
+		return -EINVAL;
+
+	if (rule_type != LANDLOCK_RULE_PATH_BENEATH)
+		return -EINVAL;
+
+	/* Copies raw user space buffer, only one type for now. */
+	res = copy_from_user(&path_beneath_attr, rule_attr,
+			sizeof(path_beneath_attr));
+	if (res)
+		return -EFAULT;
+
+	/* Gets and checks the ruleset. */
+	ruleset = get_ruleset_from_fd(ruleset_fd, FMODE_CAN_WRITE);
+	if (IS_ERR(ruleset))
+		return PTR_ERR(ruleset);
+
+	/*
+	 * Informs about useless rule: empty allowed_access (i.e. deny rules)
+	 * are ignored in path walks.
+	 */
+	if (!path_beneath_attr.allowed_access) {
+		err = -ENOMSG;
+		goto out_put_ruleset;
+	}
+	/*
+	 * Checks that allowed_access matches the @ruleset constraints
+	 * (ruleset->fs_access_masks[0] is automatically upgraded to 64-bits).
+	 */
+	if ((path_beneath_attr.allowed_access | ruleset->fs_access_masks[0]) !=
+			ruleset->fs_access_masks[0]) {
+		err = -EINVAL;
+		goto out_put_ruleset;
+	}
+
+	/* Gets and checks the new rule. */
+	err = get_path_from_fd(path_beneath_attr.parent_fd, &path);
+	if (err)
+		goto out_put_ruleset;
+
+	/* Imports the new rule. */
+	err = landlock_append_fs_rule(ruleset, &path,
+			path_beneath_attr.allowed_access);
+	path_put(&path);
+
+out_put_ruleset:
+	landlock_put_ruleset(ruleset);
+	return err;
+}
+
+/* Enforcement */
+
+/**
+ * sys_landlock_restrict_self - Enforce a ruleset on the calling thread
+ *
+ * @ruleset_fd: File descriptor tied to the ruleset to merge with the target.
+ * @flags: Must be 0.
+ *
+ * This system call enables to enforce a Landlock ruleset on the current
+ * thread.  Enforcing a ruleset requires that the task has CAP_SYS_ADMIN in its
+ * namespace or is running with no_new_privs.  This avoids scenarios where
+ * unprivileged tasks can affect the behavior of privileged children.
+ *
+ * Possible returned errors are:
+ *
+ * - EOPNOTSUPP: Landlock is supported by the kernel but disabled at boot time;
+ * - EINVAL: @flags is not 0.
+ * - EBADF: @ruleset_fd is not a file descriptor for the current thread;
+ * - EBADFD: @ruleset_fd is not a ruleset file descriptor;
+ * - EPERM: @ruleset_fd has no read access to the underlying ruleset, or the
+ *   current thread is not running with no_new_privs, or it doesn't have
+ *   CAP_SYS_ADMIN in its namespace.
+ * - E2BIG: The maximum number of stacked rulesets is reached for the current
+ *   thread.
+ */
+SYSCALL_DEFINE2(landlock_restrict_self,
+		const int, ruleset_fd, const __u32, flags)
+{
+	struct landlock_ruleset *new_dom, *ruleset;
+	struct cred *new_cred;
+	struct landlock_cred_security *new_llcred;
+	int err;
+
+	if (!landlock_initialized)
+		return -EOPNOTSUPP;
+
+	/* No flag for now. */
+	if (flags)
+		return -EINVAL;
+
+	/*
+	 * Similar checks as for seccomp(2), except that an -EPERM may be
+	 * returned.
+	 */
+	if (!task_no_new_privs(current) &&
+			!ns_capable_noaudit(current_user_ns(), CAP_SYS_ADMIN))
+		return -EPERM;
+
+	/* Gets and checks the ruleset. */
+	ruleset = get_ruleset_from_fd(ruleset_fd, FMODE_CAN_READ);
+	if (IS_ERR(ruleset))
+		return PTR_ERR(ruleset);
+
+	/* Prepares new credentials. */
+	new_cred = prepare_creds();
+	if (!new_cred) {
+		err = -ENOMEM;
+		goto out_put_ruleset;
+	}
+	new_llcred = landlock_cred(new_cred);
+
+	/*
+	 * There is no possible race condition while copying and manipulating
+	 * the current credentials because they are dedicated per thread.
+	 */
+	new_dom = landlock_merge_ruleset(new_llcred->domain, ruleset);
+	if (IS_ERR(new_dom)) {
+		err = PTR_ERR(new_dom);
+		goto out_put_creds;
+	}
+
+	/* Replaces the old (prepared) domain. */
+	landlock_put_ruleset(new_llcred->domain);
+	new_llcred->domain = new_dom;
+
+	landlock_put_ruleset(ruleset);
+	return commit_creds(new_cred);
+
+out_put_creds:
+	abort_creds(new_cred);
+
+out_put_ruleset:
+	landlock_put_ruleset(ruleset);
+	return err;
+}
-- 
cgit v1.2.3-71-gd317


From 3532b0b4352ce79400b0aa68414f1a0fc422b920 Mon Sep 17 00:00:00 2001
From: Mickaël Salaün <mic@linux.microsoft.com>
Date: Thu, 22 Apr 2021 17:41:23 +0200
Subject: landlock: Enable user space to infer supported features
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

Add a new flag LANDLOCK_CREATE_RULESET_VERSION to
landlock_create_ruleset(2).  This enables to retreive a Landlock ABI
version that is useful to efficiently follow a best-effort security
approach.  Indeed, it would be a missed opportunity to abort the whole
sandbox building, because some features are unavailable, instead of
protecting users as much as possible with the subset of features
provided by the running kernel.

This new flag enables user space to identify the minimum set of Landlock
features supported by the running kernel without relying on a filesystem
interface (e.g. /proc/version, which might be inaccessible) nor testing
multiple syscall argument combinations (i.e. syscall bisection).  New
Landlock features will be documented and tied to a minimum version
number (greater than 1).  The current version will be incremented for
each new kernel release supporting new Landlock features.  User space
libraries can leverage this information to seamlessly restrict processes
as much as possible while being compatible with newer APIs.

This is a much more lighter approach than the previous
landlock_get_features(2): the complexity is pushed to user space
libraries.  This flag meets similar needs as securityfs versions:
selinux/policyvers, apparmor/features/*/version* and tomoyo/version.

Supporting this flag now will be convenient for backward compatibility.

Cc: Arnd Bergmann <arnd@arndb.de>
Cc: James Morris <jmorris@namei.org>
Cc: Jann Horn <jannh@google.com>
Cc: Kees Cook <keescook@chromium.org>
Cc: Serge E. Hallyn <serge@hallyn.com>
Signed-off-by: Mickaël Salaün <mic@linux.microsoft.com>
Link: https://lore.kernel.org/r/20210422154123.13086-14-mic@digikod.net
Signed-off-by: James Morris <jamorris@linux.microsoft.com>
---
 include/uapi/linux/landlock.h                |  8 +++++
 security/landlock/syscalls.c                 | 17 +++++++---
 tools/testing/selftests/landlock/base_test.c | 47 ++++++++++++++++++++++++++++
 3 files changed, 68 insertions(+), 4 deletions(-)

(limited to 'include/uapi/linux')

diff --git a/include/uapi/linux/landlock.h b/include/uapi/linux/landlock.h
index ba946a1e40b2..b3d952067f59 100644
--- a/include/uapi/linux/landlock.h
+++ b/include/uapi/linux/landlock.h
@@ -27,6 +27,14 @@ struct landlock_ruleset_attr {
 	__u64 handled_access_fs;
 };
 
+/*
+ * sys_landlock_create_ruleset() flags:
+ *
+ * - %LANDLOCK_CREATE_RULESET_VERSION: Get the highest supported Landlock ABI
+ *   version.
+ */
+#define LANDLOCK_CREATE_RULESET_VERSION			(1U << 0)
+
 /**
  * enum landlock_rule_type - Landlock rule type
  *
diff --git a/security/landlock/syscalls.c b/security/landlock/syscalls.c
index 93620ad7593b..32396962f04d 100644
--- a/security/landlock/syscalls.c
+++ b/security/landlock/syscalls.c
@@ -128,6 +128,8 @@ static const struct file_operations ruleset_fops = {
 	.write = fop_dummy_write,
 };
 
+#define LANDLOCK_ABI_VERSION	1
+
 /**
  * sys_landlock_create_ruleset - Create a new ruleset
  *
@@ -135,15 +137,19 @@ static const struct file_operations ruleset_fops = {
  *        the new ruleset.
  * @size: Size of the pointed &struct landlock_ruleset_attr (needed for
  *        backward and forward compatibility).
- * @flags: Must be 0.
+ * @flags: Supported value: %LANDLOCK_CREATE_RULESET_VERSION.
  *
  * This system call enables to create a new Landlock ruleset, and returns the
  * related file descriptor on success.
  *
+ * If @flags is %LANDLOCK_CREATE_RULESET_VERSION and @attr is NULL and @size is
+ * 0, then the returned value is the highest supported Landlock ABI version
+ * (starting at 1).
+ *
  * Possible returned errors are:
  *
  * - EOPNOTSUPP: Landlock is supported by the kernel but disabled at boot time;
- * - EINVAL: @flags is not 0, or unknown access, or too small @size;
+ * - EINVAL: unknown @flags, or unknown access, or too small @size;
  * - E2BIG or EFAULT: @attr or @size inconsistencies;
  * - ENOMSG: empty &landlock_ruleset_attr.handled_access_fs.
  */
@@ -161,9 +167,12 @@ SYSCALL_DEFINE3(landlock_create_ruleset,
 	if (!landlock_initialized)
 		return -EOPNOTSUPP;
 
-	/* No flag for now. */
-	if (flags)
+	if (flags) {
+		if ((flags == LANDLOCK_CREATE_RULESET_VERSION)
+				&& !attr && !size)
+			return LANDLOCK_ABI_VERSION;
 		return -EINVAL;
+	}
 
 	/* Copies raw user space buffer. */
 	err = copy_min_struct_from_user(&ruleset_attr, sizeof(ruleset_attr),
diff --git a/tools/testing/selftests/landlock/base_test.c b/tools/testing/selftests/landlock/base_test.c
index 262c3c8d953a..ca40abe9daa8 100644
--- a/tools/testing/selftests/landlock/base_test.c
+++ b/tools/testing/selftests/landlock/base_test.c
@@ -63,6 +63,53 @@ TEST(inconsistent_attr) {
 	free(buf);
 }
 
+TEST(abi_version) {
+	const struct landlock_ruleset_attr ruleset_attr = {
+		.handled_access_fs = LANDLOCK_ACCESS_FS_READ_FILE,
+	};
+	ASSERT_EQ(1, landlock_create_ruleset(NULL, 0,
+				LANDLOCK_CREATE_RULESET_VERSION));
+
+	ASSERT_EQ(-1, landlock_create_ruleset(&ruleset_attr, 0,
+				LANDLOCK_CREATE_RULESET_VERSION));
+	ASSERT_EQ(EINVAL, errno);
+
+	ASSERT_EQ(-1, landlock_create_ruleset(NULL, sizeof(ruleset_attr),
+				LANDLOCK_CREATE_RULESET_VERSION));
+	ASSERT_EQ(EINVAL, errno);
+
+	ASSERT_EQ(-1, landlock_create_ruleset(&ruleset_attr,
+				sizeof(ruleset_attr),
+				LANDLOCK_CREATE_RULESET_VERSION));
+	ASSERT_EQ(EINVAL, errno);
+
+	ASSERT_EQ(-1, landlock_create_ruleset(NULL, 0,
+				LANDLOCK_CREATE_RULESET_VERSION | 1 << 31));
+	ASSERT_EQ(EINVAL, errno);
+}
+
+TEST(inval_create_ruleset_flags) {
+	const int last_flag = LANDLOCK_CREATE_RULESET_VERSION;
+	const int invalid_flag = last_flag << 1;
+	const struct landlock_ruleset_attr ruleset_attr = {
+		.handled_access_fs = LANDLOCK_ACCESS_FS_READ_FILE,
+	};
+
+	ASSERT_EQ(-1, landlock_create_ruleset(NULL, 0, invalid_flag));
+	ASSERT_EQ(EINVAL, errno);
+
+	ASSERT_EQ(-1, landlock_create_ruleset(&ruleset_attr, 0, invalid_flag));
+	ASSERT_EQ(EINVAL, errno);
+
+	ASSERT_EQ(-1, landlock_create_ruleset(NULL, sizeof(ruleset_attr),
+				invalid_flag));
+	ASSERT_EQ(EINVAL, errno);
+
+	ASSERT_EQ(-1, landlock_create_ruleset(&ruleset_attr,
+				sizeof(ruleset_attr), invalid_flag));
+	ASSERT_EQ(EINVAL, errno);
+}
+
 TEST(empty_path_beneath_attr) {
 	const struct landlock_ruleset_attr ruleset_attr = {
 		.handled_access_fs = LANDLOCK_ACCESS_FS_EXECUTE,
-- 
cgit v1.2.3-71-gd317


From fdecb66281e165927059419c3b1de09ffe4f8369 Mon Sep 17 00:00:00 2001
From: Pavel Begunkov <asml.silence@gmail.com>
Date: Sun, 25 Apr 2021 14:32:20 +0100
Subject: io_uring: enumerate dynamic resources

As resources are getting more support and common parts, it'll be more
convenient to index resources and use it for indexing.

Signed-off-by: Pavel Begunkov <asml.silence@gmail.com>
Link: https://lore.kernel.org/r/f0be63e9310212d5601d36277c2946ff7a040485.1619356238.git.asml.silence@gmail.com
Signed-off-by: Jens Axboe <axboe@kernel.dk>
---
 fs/io_uring.c                 | 16 ++++++++--------
 include/uapi/linux/io_uring.h |  4 ++++
 2 files changed, 12 insertions(+), 8 deletions(-)

(limited to 'include/uapi/linux')

diff --git a/fs/io_uring.c b/fs/io_uring.c
index 86a957d1059b..36b9651588f9 100644
--- a/fs/io_uring.c
+++ b/fs/io_uring.c
@@ -1035,7 +1035,7 @@ static void io_dismantle_req(struct io_kiocb *req);
 static void io_put_task(struct task_struct *task, int nr);
 static struct io_kiocb *io_prep_linked_timeout(struct io_kiocb *req);
 static void io_queue_linked_timeout(struct io_kiocb *req);
-static int __io_register_rsrc_update(struct io_ring_ctx *ctx, unsigned opcode,
+static int __io_register_rsrc_update(struct io_ring_ctx *ctx, unsigned type,
 				     struct io_uring_rsrc_update *up,
 				     unsigned nr_args);
 static void io_clean_op(struct io_kiocb *req);
@@ -5824,7 +5824,7 @@ static int io_files_update(struct io_kiocb *req, unsigned int issue_flags)
 	up.data = req->rsrc_update.arg;
 
 	mutex_lock(&ctx->uring_lock);
-	ret = __io_register_rsrc_update(ctx, IORING_REGISTER_FILES_UPDATE,
+	ret = __io_register_rsrc_update(ctx, IORING_RSRC_FILE,
 					&up, req->rsrc_update.nr_args);
 	mutex_unlock(&ctx->uring_lock);
 
@@ -9709,7 +9709,7 @@ static int io_register_enable_rings(struct io_ring_ctx *ctx)
 	return 0;
 }
 
-static int __io_register_rsrc_update(struct io_ring_ctx *ctx, unsigned opcode,
+static int __io_register_rsrc_update(struct io_ring_ctx *ctx, unsigned type,
 				     struct io_uring_rsrc_update *up,
 				     unsigned nr_args)
 {
@@ -9722,14 +9722,14 @@ static int __io_register_rsrc_update(struct io_ring_ctx *ctx, unsigned opcode,
 	if (err)
 		return err;
 
-	switch (opcode) {
-	case IORING_REGISTER_FILES_UPDATE:
+	switch (type) {
+	case IORING_RSRC_FILE:
 		return __io_sqe_files_update(ctx, up, nr_args);
 	}
 	return -EINVAL;
 }
 
-static int io_register_rsrc_update(struct io_ring_ctx *ctx, unsigned opcode,
+static int io_register_rsrc_update(struct io_ring_ctx *ctx, unsigned type,
 				   void __user *arg, unsigned nr_args)
 {
 	struct io_uring_rsrc_update up;
@@ -9740,7 +9740,7 @@ static int io_register_rsrc_update(struct io_ring_ctx *ctx, unsigned opcode,
 		return -EFAULT;
 	if (up.resv)
 		return -EINVAL;
-	return __io_register_rsrc_update(ctx, opcode, &up, nr_args);
+	return __io_register_rsrc_update(ctx, type, &up, nr_args);
 }
 
 static bool io_register_op_must_quiesce(int op)
@@ -9829,7 +9829,7 @@ static int __io_uring_register(struct io_ring_ctx *ctx, unsigned opcode,
 		ret = io_sqe_files_unregister(ctx);
 		break;
 	case IORING_REGISTER_FILES_UPDATE:
-		ret = io_register_rsrc_update(ctx, opcode, arg, nr_args);
+		ret = io_register_rsrc_update(ctx, IORING_RSRC_FILE, arg, nr_args);
 		break;
 	case IORING_REGISTER_EVENTFD:
 	case IORING_REGISTER_EVENTFD_ASYNC:
diff --git a/include/uapi/linux/io_uring.h b/include/uapi/linux/io_uring.h
index 5beaa6bbc6db..d363e0c4fd21 100644
--- a/include/uapi/linux/io_uring.h
+++ b/include/uapi/linux/io_uring.h
@@ -316,6 +316,10 @@ struct io_uring_rsrc_update {
 	__aligned_u64 data;
 };
 
+enum {
+	IORING_RSRC_FILE		= 0,
+};
+
 /* Skip updating fd indexes set to this value in the fd table */
 #define IORING_REGISTER_FILES_SKIP	(-2)
 
-- 
cgit v1.2.3-71-gd317


From 792e35824be9af9fb4dac956229fb97bda04e25e Mon Sep 17 00:00:00 2001
From: Pavel Begunkov <asml.silence@gmail.com>
Date: Sun, 25 Apr 2021 14:32:21 +0100
Subject: io_uring: add IORING_REGISTER_RSRC

Add a new io_uring_register() opcode for rsrc registeration. Instead of
accepting a pointer to resources, fds or iovecs, it @arg is now pointing
to a struct io_uring_rsrc_register, and the second argument tells how
large that struct is to make it easily extendible by adding new fields.

All that is done mainly to be able to pass in a pointer with tags. Pass
it in and enable CQE posting for file resources. Doesn't support setting
tags on update yet.

A design choice made here is to not post CQEs on rsrc de-registration,
but only when we updated-removed it by rsrc dynamic update.

Signed-off-by: Pavel Begunkov <asml.silence@gmail.com>
Link: https://lore.kernel.org/r/c498aaec32a4bb277b2406b9069662c02cdda98c.1619356238.git.asml.silence@gmail.com
Signed-off-by: Jens Axboe <axboe@kernel.dk>
---
 fs/io_uring.c                 | 45 ++++++++++++++++++++++++++++++++++++++-----
 include/uapi/linux/io_uring.h |  8 ++++++++
 2 files changed, 48 insertions(+), 5 deletions(-)

(limited to 'include/uapi/linux')

diff --git a/fs/io_uring.c b/fs/io_uring.c
index 36b9651588f9..0088eeef82ec 100644
--- a/fs/io_uring.c
+++ b/fs/io_uring.c
@@ -7589,7 +7589,7 @@ static struct io_rsrc_node *io_rsrc_node_alloc(struct io_ring_ctx *ctx)
 }
 
 static int io_sqe_files_register(struct io_ring_ctx *ctx, void __user *arg,
-				 unsigned nr_args)
+				 unsigned nr_args, u64 __user *tags)
 {
 	__s32 __user *fds = (__s32 __user *) arg;
 	struct file *file;
@@ -7616,17 +7616,24 @@ static int io_sqe_files_register(struct io_ring_ctx *ctx, void __user *arg,
 		goto out_free;
 
 	for (i = 0; i < nr_args; i++, ctx->nr_user_files++) {
-		if (copy_from_user(&fd, &fds[i], sizeof(fd))) {
+		u64 tag = 0;
+
+		if ((tags && copy_from_user(&tag, &tags[i], sizeof(tag))) ||
+		    copy_from_user(&fd, &fds[i], sizeof(fd))) {
 			ret = -EFAULT;
 			goto out_fput;
 		}
 		/* allow sparse sets */
-		if (fd == -1)
+		if (fd == -1) {
+			ret = -EINVAL;
+			if (unlikely(tag))
+				goto out_fput;
 			continue;
+		}
 
 		file = fget(fd);
 		ret = -EBADF;
-		if (!file)
+		if (unlikely(!file))
 			goto out_fput;
 
 		/*
@@ -7640,6 +7647,7 @@ static int io_sqe_files_register(struct io_ring_ctx *ctx, void __user *arg,
 			fput(file);
 			goto out_fput;
 		}
+		ctx->file_data->tags[i] = tag;
 		io_fixed_file_set(io_fixed_file_slot(&ctx->file_table, i), file);
 	}
 
@@ -9743,6 +9751,29 @@ static int io_register_rsrc_update(struct io_ring_ctx *ctx, unsigned type,
 	return __io_register_rsrc_update(ctx, type, &up, nr_args);
 }
 
+static int io_register_rsrc(struct io_ring_ctx *ctx, void __user *arg,
+			    unsigned int size)
+{
+	struct io_uring_rsrc_register rr;
+
+	/* keep it extendible */
+	if (size != sizeof(rr))
+		return -EINVAL;
+
+	memset(&rr, 0, sizeof(rr));
+	if (copy_from_user(&rr, arg, size))
+		return -EFAULT;
+	if (!rr.nr)
+		return -EINVAL;
+
+	switch (rr.type) {
+	case IORING_RSRC_FILE:
+		return io_sqe_files_register(ctx, u64_to_user_ptr(rr.data),
+					     rr.nr, u64_to_user_ptr(rr.tags));
+	}
+	return -EINVAL;
+}
+
 static bool io_register_op_must_quiesce(int op)
 {
 	switch (op) {
@@ -9752,6 +9783,7 @@ static bool io_register_op_must_quiesce(int op)
 	case IORING_REGISTER_PROBE:
 	case IORING_REGISTER_PERSONALITY:
 	case IORING_UNREGISTER_PERSONALITY:
+	case IORING_REGISTER_RSRC:
 		return false;
 	default:
 		return true;
@@ -9820,7 +9852,7 @@ static int __io_uring_register(struct io_ring_ctx *ctx, unsigned opcode,
 		ret = io_sqe_buffers_unregister(ctx);
 		break;
 	case IORING_REGISTER_FILES:
-		ret = io_sqe_files_register(ctx, arg, nr_args);
+		ret = io_sqe_files_register(ctx, arg, nr_args, NULL);
 		break;
 	case IORING_UNREGISTER_FILES:
 		ret = -EINVAL;
@@ -9877,6 +9909,9 @@ static int __io_uring_register(struct io_ring_ctx *ctx, unsigned opcode,
 	case IORING_REGISTER_RESTRICTIONS:
 		ret = io_register_restrictions(ctx, arg, nr_args);
 		break;
+	case IORING_REGISTER_RSRC:
+		ret = io_register_rsrc(ctx, arg, nr_args);
+		break;
 	default:
 		ret = -EINVAL;
 		break;
diff --git a/include/uapi/linux/io_uring.h b/include/uapi/linux/io_uring.h
index d363e0c4fd21..ce7b2fce6713 100644
--- a/include/uapi/linux/io_uring.h
+++ b/include/uapi/linux/io_uring.h
@@ -298,6 +298,7 @@ enum {
 	IORING_UNREGISTER_PERSONALITY		= 10,
 	IORING_REGISTER_RESTRICTIONS		= 11,
 	IORING_REGISTER_ENABLE_RINGS		= 12,
+	IORING_REGISTER_RSRC			= 13,
 
 	/* this goes last */
 	IORING_REGISTER_LAST
@@ -320,6 +321,13 @@ enum {
 	IORING_RSRC_FILE		= 0,
 };
 
+struct io_uring_rsrc_register {
+	__u32 type;
+	__u32 nr;
+	__aligned_u64 data;
+	__aligned_u64 tags;
+};
+
 /* Skip updating fd indexes set to this value in the fd table */
 #define IORING_REGISTER_FILES_SKIP	(-2)
 
-- 
cgit v1.2.3-71-gd317


From c3bdad0271834214be01c1d687c262bf80da6eb0 Mon Sep 17 00:00:00 2001
From: Pavel Begunkov <asml.silence@gmail.com>
Date: Sun, 25 Apr 2021 14:32:22 +0100
Subject: io_uring: add generic rsrc update with tags

Add IORING_REGISTER_RSRC_UPDATE, which also supports passing in rsrc
tags. Implement it for registered files.

Signed-off-by: Pavel Begunkov <asml.silence@gmail.com>
Link: https://lore.kernel.org/r/d4dc66df204212f64835ffca2c4eb5e8363f2f05.1619356238.git.asml.silence@gmail.com
Signed-off-by: Jens Axboe <axboe@kernel.dk>
---
 fs/io_uring.c                 | 52 ++++++++++++++++++++++++++++++++++---------
 include/uapi/linux/io_uring.h | 22 +++++++++++++-----
 2 files changed, 57 insertions(+), 17 deletions(-)

(limited to 'include/uapi/linux')

diff --git a/fs/io_uring.c b/fs/io_uring.c
index 0088eeef82ec..a4c37a2cc690 100644
--- a/fs/io_uring.c
+++ b/fs/io_uring.c
@@ -1036,7 +1036,7 @@ static void io_put_task(struct task_struct *task, int nr);
 static struct io_kiocb *io_prep_linked_timeout(struct io_kiocb *req);
 static void io_queue_linked_timeout(struct io_kiocb *req);
 static int __io_register_rsrc_update(struct io_ring_ctx *ctx, unsigned type,
-				     struct io_uring_rsrc_update *up,
+				     struct io_uring_rsrc_update2 *up,
 				     unsigned nr_args);
 static void io_clean_op(struct io_kiocb *req);
 static struct file *io_file_get(struct io_submit_state *state,
@@ -5814,7 +5814,7 @@ static int io_rsrc_update_prep(struct io_kiocb *req,
 static int io_files_update(struct io_kiocb *req, unsigned int issue_flags)
 {
 	struct io_ring_ctx *ctx = req->ctx;
-	struct io_uring_rsrc_update up;
+	struct io_uring_rsrc_update2 up;
 	int ret;
 
 	if (issue_flags & IO_URING_F_NONBLOCK)
@@ -5822,6 +5822,8 @@ static int io_files_update(struct io_kiocb *req, unsigned int issue_flags)
 
 	up.offset = req->rsrc_update.offset;
 	up.data = req->rsrc_update.arg;
+	up.nr = 0;
+	up.tags = 0;
 
 	mutex_lock(&ctx->uring_lock);
 	ret = __io_register_rsrc_update(ctx, IORING_RSRC_FILE,
@@ -7732,9 +7734,10 @@ static int io_queue_rsrc_removal(struct io_rsrc_data *data, unsigned idx,
 }
 
 static int __io_sqe_files_update(struct io_ring_ctx *ctx,
-				 struct io_uring_rsrc_update *up,
+				 struct io_uring_rsrc_update2 *up,
 				 unsigned nr_args)
 {
+	u64 __user *tags = u64_to_user_ptr(up->tags);
 	__s32 __user *fds = u64_to_user_ptr(up->data);
 	struct io_rsrc_data *data = ctx->file_data;
 	struct io_fixed_file *file_slot;
@@ -7749,10 +7752,17 @@ static int __io_sqe_files_update(struct io_ring_ctx *ctx,
 		return -EINVAL;
 
 	for (done = 0; done < nr_args; done++) {
-		if (copy_from_user(&fd, &fds[done], sizeof(fd))) {
+		u64 tag = 0;
+
+		if ((tags && copy_from_user(&tag, &tags[done], sizeof(tag))) ||
+		    copy_from_user(&fd, &fds[done], sizeof(fd))) {
 			err = -EFAULT;
 			break;
 		}
+		if ((fd == IORING_REGISTER_FILES_SKIP || fd == -1) && tag) {
+			err = -EINVAL;
+			break;
+		}
 		if (fd == IORING_REGISTER_FILES_SKIP)
 			continue;
 
@@ -7787,6 +7797,7 @@ static int __io_sqe_files_update(struct io_ring_ctx *ctx,
 				err = -EBADF;
 				break;
 			}
+			data->tags[up->offset + done] = tag;
 			io_fixed_file_set(file_slot, file);
 			err = io_sqe_file_register(ctx, file, i);
 			if (err) {
@@ -9718,12 +9729,14 @@ static int io_register_enable_rings(struct io_ring_ctx *ctx)
 }
 
 static int __io_register_rsrc_update(struct io_ring_ctx *ctx, unsigned type,
-				     struct io_uring_rsrc_update *up,
+				     struct io_uring_rsrc_update2 *up,
 				     unsigned nr_args)
 {
 	__u32 tmp;
 	int err;
 
+	if (up->resv)
+		return -EINVAL;
 	if (check_add_overflow(up->offset, nr_args, &tmp))
 		return -EOVERFLOW;
 	err = io_rsrc_node_switch_start(ctx);
@@ -9737,18 +9750,31 @@ static int __io_register_rsrc_update(struct io_ring_ctx *ctx, unsigned type,
 	return -EINVAL;
 }
 
-static int io_register_rsrc_update(struct io_ring_ctx *ctx, unsigned type,
-				   void __user *arg, unsigned nr_args)
+static int io_register_files_update(struct io_ring_ctx *ctx, void __user *arg,
+				    unsigned nr_args)
 {
-	struct io_uring_rsrc_update up;
+	struct io_uring_rsrc_update2 up;
 
 	if (!nr_args)
 		return -EINVAL;
+	memset(&up, 0, sizeof(up));
+	if (copy_from_user(&up, arg, sizeof(struct io_uring_rsrc_update)))
+		return -EFAULT;
+	return __io_register_rsrc_update(ctx, IORING_RSRC_FILE, &up, nr_args);
+}
+
+static int io_register_rsrc_update(struct io_ring_ctx *ctx, void __user *arg,
+				   unsigned size)
+{
+	struct io_uring_rsrc_update2 up;
+
+	if (size != sizeof(up))
+		return -EINVAL;
 	if (copy_from_user(&up, arg, sizeof(up)))
 		return -EFAULT;
-	if (up.resv)
+	if (!up.nr)
 		return -EINVAL;
-	return __io_register_rsrc_update(ctx, type, &up, nr_args);
+	return __io_register_rsrc_update(ctx, up.type, &up, up.nr);
 }
 
 static int io_register_rsrc(struct io_ring_ctx *ctx, void __user *arg,
@@ -9784,6 +9810,7 @@ static bool io_register_op_must_quiesce(int op)
 	case IORING_REGISTER_PERSONALITY:
 	case IORING_UNREGISTER_PERSONALITY:
 	case IORING_REGISTER_RSRC:
+	case IORING_REGISTER_RSRC_UPDATE:
 		return false;
 	default:
 		return true;
@@ -9861,7 +9888,7 @@ static int __io_uring_register(struct io_ring_ctx *ctx, unsigned opcode,
 		ret = io_sqe_files_unregister(ctx);
 		break;
 	case IORING_REGISTER_FILES_UPDATE:
-		ret = io_register_rsrc_update(ctx, IORING_RSRC_FILE, arg, nr_args);
+		ret = io_register_files_update(ctx, arg, nr_args);
 		break;
 	case IORING_REGISTER_EVENTFD:
 	case IORING_REGISTER_EVENTFD_ASYNC:
@@ -9912,6 +9939,9 @@ static int __io_uring_register(struct io_ring_ctx *ctx, unsigned opcode,
 	case IORING_REGISTER_RSRC:
 		ret = io_register_rsrc(ctx, arg, nr_args);
 		break;
+	case IORING_REGISTER_RSRC_UPDATE:
+		ret = io_register_rsrc_update(ctx, arg, nr_args);
+		break;
 	default:
 		ret = -EINVAL;
 		break;
diff --git a/include/uapi/linux/io_uring.h b/include/uapi/linux/io_uring.h
index ce7b2fce6713..6d8360b5b9c5 100644
--- a/include/uapi/linux/io_uring.h
+++ b/include/uapi/linux/io_uring.h
@@ -299,6 +299,7 @@ enum {
 	IORING_REGISTER_RESTRICTIONS		= 11,
 	IORING_REGISTER_ENABLE_RINGS		= 12,
 	IORING_REGISTER_RSRC			= 13,
+	IORING_REGISTER_RSRC_UPDATE		= 14,
 
 	/* this goes last */
 	IORING_REGISTER_LAST
@@ -311,12 +312,6 @@ struct io_uring_files_update {
 	__aligned_u64 /* __s32 * */ fds;
 };
 
-struct io_uring_rsrc_update {
-	__u32 offset;
-	__u32 resv;
-	__aligned_u64 data;
-};
-
 enum {
 	IORING_RSRC_FILE		= 0,
 };
@@ -328,6 +323,21 @@ struct io_uring_rsrc_register {
 	__aligned_u64 tags;
 };
 
+struct io_uring_rsrc_update {
+	__u32 offset;
+	__u32 resv;
+	__aligned_u64 data;
+};
+
+struct io_uring_rsrc_update2 {
+	__u32 offset;
+	__u32 resv;
+	__aligned_u64 data;
+	__aligned_u64 tags;
+	__u32 type;
+	__u32 nr;
+};
+
 /* Skip updating fd indexes set to this value in the fd table */
 #define IORING_REGISTER_FILES_SKIP	(-2)
 
-- 
cgit v1.2.3-71-gd317


From 634d00df5e1cfc4a707b629a814bd607f726bd52 Mon Sep 17 00:00:00 2001
From: Pavel Begunkov <asml.silence@gmail.com>
Date: Sun, 25 Apr 2021 14:32:26 +0100
Subject: io_uring: add full-fledged dynamic buffers support

Hook buffers into all rsrc infrastructure, including tagging and
updates.

Suggested-by: Bijan Mottahedeh <bijan.mottahedeh@oracle.com>
Signed-off-by: Pavel Begunkov <asml.silence@gmail.com>
Link: https://lore.kernel.org/r/119ed51d68a491dae87eb55fb467a47870c86aad.1619356238.git.asml.silence@gmail.com
Signed-off-by: Jens Axboe <axboe@kernel.dk>
---
 fs/io_uring.c                 | 76 ++++++++++++++++++++++++++++++++++++++++---
 include/uapi/linux/io_uring.h |  1 +
 2 files changed, 73 insertions(+), 4 deletions(-)

(limited to 'include/uapi/linux')

diff --git a/fs/io_uring.c b/fs/io_uring.c
index 0680d241fedc..0ab09f7a44e7 100644
--- a/fs/io_uring.c
+++ b/fs/io_uring.c
@@ -8114,8 +8114,8 @@ static void io_buffer_unmap(struct io_ring_ctx *ctx, struct io_mapped_ubuf **slo
 
 static void io_rsrc_buf_put(struct io_ring_ctx *ctx, struct io_rsrc_put *prsrc)
 {
-	/* no updates yet, so not used */
-	WARN_ON_ONCE(1);
+	io_buffer_unmap(ctx, &prsrc->buf);
+	prsrc->buf = NULL;
 }
 
 static void __io_sqe_buffers_unregister(struct io_ring_ctx *ctx)
@@ -8359,7 +8359,7 @@ static int io_buffer_validate(struct iovec *iov)
 }
 
 static int io_sqe_buffers_register(struct io_ring_ctx *ctx, void __user *arg,
-				   unsigned int nr_args)
+				   unsigned int nr_args, u64 __user *tags)
 {
 	struct page *last_hpage = NULL;
 	struct io_rsrc_data *data;
@@ -8383,6 +8383,12 @@ static int io_sqe_buffers_register(struct io_ring_ctx *ctx, void __user *arg,
 	}
 
 	for (i = 0; i < nr_args; i++, ctx->nr_user_bufs++) {
+		u64 tag = 0;
+
+		if (tags && copy_from_user(&tag, &tags[i], sizeof(tag))) {
+			ret = -EFAULT;
+			break;
+		}
 		ret = io_copy_iov(ctx, &iov, arg, i);
 		if (ret)
 			break;
@@ -8394,6 +8400,7 @@ static int io_sqe_buffers_register(struct io_ring_ctx *ctx, void __user *arg,
 					     &last_hpage);
 		if (ret)
 			break;
+		data->tags[i] = tag;
 	}
 
 	WARN_ON_ONCE(ctx->buf_data);
@@ -8406,6 +8413,62 @@ static int io_sqe_buffers_register(struct io_ring_ctx *ctx, void __user *arg,
 	return ret;
 }
 
+static int __io_sqe_buffers_update(struct io_ring_ctx *ctx,
+				   struct io_uring_rsrc_update2 *up,
+				   unsigned int nr_args)
+{
+	u64 __user *tags = u64_to_user_ptr(up->tags);
+	struct iovec iov, __user *iovs = u64_to_user_ptr(up->data);
+	struct io_mapped_ubuf *imu;
+	struct page *last_hpage = NULL;
+	bool needs_switch = false;
+	__u32 done;
+	int i, err;
+
+	if (!ctx->buf_data)
+		return -ENXIO;
+	if (up->offset + nr_args > ctx->nr_user_bufs)
+		return -EINVAL;
+
+	for (done = 0; done < nr_args; done++) {
+		u64 tag = 0;
+
+		err = io_copy_iov(ctx, &iov, iovs, done);
+		if (err)
+			break;
+		if (tags && copy_from_user(&tag, &tags[done], sizeof(tag))) {
+			err = -EFAULT;
+			break;
+		}
+
+		i = array_index_nospec(up->offset + done, ctx->nr_user_bufs);
+		imu = ctx->user_bufs[i];
+		if (imu) {
+			err = io_queue_rsrc_removal(ctx->buf_data, up->offset + done,
+						    ctx->rsrc_node, imu);
+			if (err)
+				break;
+			ctx->user_bufs[i] = NULL;
+			needs_switch = true;
+		}
+
+		if (iov.iov_base || iov.iov_len) {
+			err = io_buffer_validate(&iov);
+			if (err)
+				break;
+			err = io_sqe_buffer_register(ctx, &iov, &ctx->user_bufs[i],
+						     &last_hpage);
+			if (err)
+				break;
+			ctx->buf_data->tags[up->offset + done] = tag;
+		}
+	}
+
+	if (needs_switch)
+		io_rsrc_node_switch(ctx, ctx->buf_data);
+	return done ? done : err;
+}
+
 static int io_eventfd_register(struct io_ring_ctx *ctx, void __user *arg)
 {
 	__s32 __user *fds = arg;
@@ -9807,6 +9870,8 @@ static int __io_register_rsrc_update(struct io_ring_ctx *ctx, unsigned type,
 	switch (type) {
 	case IORING_RSRC_FILE:
 		return __io_sqe_files_update(ctx, up, nr_args);
+	case IORING_RSRC_BUFFER:
+		return __io_sqe_buffers_update(ctx, up, nr_args);
 	}
 	return -EINVAL;
 }
@@ -9857,6 +9922,9 @@ static int io_register_rsrc(struct io_ring_ctx *ctx, void __user *arg,
 	case IORING_RSRC_FILE:
 		return io_sqe_files_register(ctx, u64_to_user_ptr(rr.data),
 					     rr.nr, u64_to_user_ptr(rr.tags));
+	case IORING_RSRC_BUFFER:
+		return io_sqe_buffers_register(ctx, u64_to_user_ptr(rr.data),
+					       rr.nr, u64_to_user_ptr(rr.tags));
 	}
 	return -EINVAL;
 }
@@ -9933,7 +10001,7 @@ static int __io_uring_register(struct io_ring_ctx *ctx, unsigned opcode,
 
 	switch (opcode) {
 	case IORING_REGISTER_BUFFERS:
-		ret = io_sqe_buffers_register(ctx, arg, nr_args);
+		ret = io_sqe_buffers_register(ctx, arg, nr_args, NULL);
 		break;
 	case IORING_UNREGISTER_BUFFERS:
 		ret = -EINVAL;
diff --git a/include/uapi/linux/io_uring.h b/include/uapi/linux/io_uring.h
index 6d8360b5b9c5..e1ae46683301 100644
--- a/include/uapi/linux/io_uring.h
+++ b/include/uapi/linux/io_uring.h
@@ -314,6 +314,7 @@ struct io_uring_files_update {
 
 enum {
 	IORING_RSRC_FILE		= 0,
+	IORING_RSRC_BUFFER		= 1,
 };
 
 struct io_uring_rsrc_register {
-- 
cgit v1.2.3-71-gd317


From e0bb96db96f8ca94349344a2ea7bebc6f8cefdae Mon Sep 17 00:00:00 2001
From: Pablo Neira Ayuso <pablo@netfilter.org>
Date: Wed, 21 Apr 2021 01:12:44 +0200
Subject: netfilter: nft_socket: add support for cgroupsv2

Allow to match on the cgroupsv2 id from ancestor level.

Signed-off-by: Pablo Neira Ayuso <pablo@netfilter.org>
---
 include/uapi/linux/netfilter/nf_tables.h |  4 +++
 net/netfilter/nft_socket.c               | 48 +++++++++++++++++++++++++++++++-
 2 files changed, 51 insertions(+), 1 deletion(-)

(limited to 'include/uapi/linux')

diff --git a/include/uapi/linux/netfilter/nf_tables.h b/include/uapi/linux/netfilter/nf_tables.h
index 79bab7a36b30..467365ed59a7 100644
--- a/include/uapi/linux/netfilter/nf_tables.h
+++ b/include/uapi/linux/netfilter/nf_tables.h
@@ -1014,11 +1014,13 @@ enum nft_rt_attributes {
  *
  * @NFTA_SOCKET_KEY: socket key to match
  * @NFTA_SOCKET_DREG: destination register
+ * @NFTA_SOCKET_LEVEL: cgroups2 ancestor level (only for cgroupsv2)
  */
 enum nft_socket_attributes {
 	NFTA_SOCKET_UNSPEC,
 	NFTA_SOCKET_KEY,
 	NFTA_SOCKET_DREG,
+	NFTA_SOCKET_LEVEL,
 	__NFTA_SOCKET_MAX
 };
 #define NFTA_SOCKET_MAX		(__NFTA_SOCKET_MAX - 1)
@@ -1029,11 +1031,13 @@ enum nft_socket_attributes {
  * @NFT_SOCKET_TRANSPARENT: Value of the IP(V6)_TRANSPARENT socket option
  * @NFT_SOCKET_MARK: Value of the socket mark
  * @NFT_SOCKET_WILDCARD: Whether the socket is zero-bound (e.g. 0.0.0.0 or ::0)
+ * @NFT_SOCKET_CGROUPV2: Match on cgroups version 2
  */
 enum nft_socket_keys {
 	NFT_SOCKET_TRANSPARENT,
 	NFT_SOCKET_MARK,
 	NFT_SOCKET_WILDCARD,
+	NFT_SOCKET_CGROUPV2,
 	__NFT_SOCKET_MAX
 };
 #define NFT_SOCKET_MAX	(__NFT_SOCKET_MAX - 1)
diff --git a/net/netfilter/nft_socket.c b/net/netfilter/nft_socket.c
index c9b8a2b03b71..9c169d100651 100644
--- a/net/netfilter/nft_socket.c
+++ b/net/netfilter/nft_socket.c
@@ -9,6 +9,7 @@
 
 struct nft_socket {
 	enum nft_socket_keys		key:8;
+	u8				level;
 	union {
 		u8			dreg;
 	};
@@ -33,6 +34,26 @@ static void nft_socket_wildcard(const struct nft_pktinfo *pkt,
 	}
 }
 
+#ifdef CONFIG_CGROUPS
+static noinline bool
+nft_sock_get_eval_cgroupv2(u32 *dest, const struct nft_pktinfo *pkt, u32 level)
+{
+	struct sock *sk = skb_to_full_sk(pkt->skb);
+	struct cgroup *cgrp;
+
+	if (!sk || !sk_fullsock(sk) || !net_eq(nft_net(pkt), sock_net(sk)))
+		return false;
+
+	cgrp = sock_cgroup_ptr(&sk->sk_cgrp_data);
+	if (level > cgrp->level)
+		return false;
+
+	memcpy(dest, &cgrp->ancestor_ids[level], sizeof(u64));
+
+	return true;
+}
+#endif
+
 static void nft_socket_eval(const struct nft_expr *expr,
 			    struct nft_regs *regs,
 			    const struct nft_pktinfo *pkt)
@@ -85,6 +106,14 @@ static void nft_socket_eval(const struct nft_expr *expr,
 		}
 		nft_socket_wildcard(pkt, regs, sk, dest);
 		break;
+#ifdef CONFIG_CGROUPS
+	case NFT_SOCKET_CGROUPV2:
+		if (!nft_sock_get_eval_cgroupv2(dest, pkt, priv->level)) {
+			regs->verdict.code = NFT_BREAK;
+			return;
+		}
+		break;
+#endif
 	default:
 		WARN_ON(1);
 		regs->verdict.code = NFT_BREAK;
@@ -97,6 +126,7 @@ static void nft_socket_eval(const struct nft_expr *expr,
 static const struct nla_policy nft_socket_policy[NFTA_SOCKET_MAX + 1] = {
 	[NFTA_SOCKET_KEY]		= { .type = NLA_U32 },
 	[NFTA_SOCKET_DREG]		= { .type = NLA_U32 },
+	[NFTA_SOCKET_LEVEL]		= { .type = NLA_U32 },
 };
 
 static int nft_socket_init(const struct nft_ctx *ctx,
@@ -104,7 +134,7 @@ static int nft_socket_init(const struct nft_ctx *ctx,
 			   const struct nlattr * const tb[])
 {
 	struct nft_socket *priv = nft_expr_priv(expr);
-	unsigned int len;
+	unsigned int len, level;
 
 	if (!tb[NFTA_SOCKET_DREG] || !tb[NFTA_SOCKET_KEY])
 		return -EINVAL;
@@ -129,6 +159,19 @@ static int nft_socket_init(const struct nft_ctx *ctx,
 	case NFT_SOCKET_MARK:
 		len = sizeof(u32);
 		break;
+#ifdef CONFIG_CGROUPS
+	case NFT_SOCKET_CGROUPV2:
+		if (!tb[NFTA_SOCKET_LEVEL])
+			return -EINVAL;
+
+		level = ntohl(nla_get_u32(tb[NFTA_SOCKET_LEVEL]));
+		if (level > 255)
+			return -EOPNOTSUPP;
+
+		priv->level = level;
+		len = sizeof(u64);
+		break;
+#endif
 	default:
 		return -EOPNOTSUPP;
 	}
@@ -146,6 +189,9 @@ static int nft_socket_dump(struct sk_buff *skb,
 		return -1;
 	if (nft_dump_register(skb, NFTA_SOCKET_DREG, priv->dreg))
 		return -1;
+	if (priv->key == NFT_SOCKET_CGROUPV2 &&
+	    nla_put_u32(skb, NFTA_SOCKET_LEVEL, htonl(priv->level)))
+		return -1;
 	return 0;
 }
 
-- 
cgit v1.2.3-71-gd317


From 427f0c8c194b22edcafef1b0a42995ddc5c2227d Mon Sep 17 00:00:00 2001
From: Jethro Beekman <kernel@jbeekman.nl>
Date: Sun, 25 Apr 2021 11:22:03 +0200
Subject: macvlan: Add nodst option to macvlan type source

The default behavior for source MACVLAN is to duplicate packets to
appropriate type source devices, and then do the normal destination MACVLAN
flow. This patch adds an option to skip destination MACVLAN processing if
any matching source MACVLAN device has the option set.

This allows setting up a "catch all" device for source MACVLAN: create one
or more devices with type source nodst, and one device with e.g. type vepa,
and incoming traffic will be received on exactly one device.

v2: netdev wants non-standard line length

Signed-off-by: Jethro Beekman <kernel@jbeekman.nl>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 drivers/net/macvlan.c        | 19 ++++++++++++++-----
 include/uapi/linux/if_link.h |  1 +
 2 files changed, 15 insertions(+), 5 deletions(-)

(limited to 'include/uapi/linux')

diff --git a/drivers/net/macvlan.c b/drivers/net/macvlan.c
index 9a9a5cf36a4b..7427b989607e 100644
--- a/drivers/net/macvlan.c
+++ b/drivers/net/macvlan.c
@@ -423,18 +423,24 @@ static void macvlan_forward_source_one(struct sk_buff *skb,
 	macvlan_count_rx(vlan, len, ret == NET_RX_SUCCESS, false);
 }
 
-static void macvlan_forward_source(struct sk_buff *skb,
+static bool macvlan_forward_source(struct sk_buff *skb,
 				   struct macvlan_port *port,
 				   const unsigned char *addr)
 {
 	struct macvlan_source_entry *entry;
 	u32 idx = macvlan_eth_hash(addr);
 	struct hlist_head *h = &port->vlan_source_hash[idx];
+	bool consume = false;
 
 	hlist_for_each_entry_rcu(entry, h, hlist) {
-		if (ether_addr_equal_64bits(entry->addr, addr))
+		if (ether_addr_equal_64bits(entry->addr, addr)) {
+			if (entry->vlan->flags & MACVLAN_FLAG_NODST)
+				consume = true;
 			macvlan_forward_source_one(skb, entry->vlan);
+		}
 	}
+
+	return consume;
 }
 
 /* called under rcu_read_lock() from netif_receive_skb */
@@ -463,7 +469,8 @@ static rx_handler_result_t macvlan_handle_frame(struct sk_buff **pskb)
 			return RX_HANDLER_CONSUMED;
 		*pskb = skb;
 		eth = eth_hdr(skb);
-		macvlan_forward_source(skb, port, eth->h_source);
+		if (macvlan_forward_source(skb, port, eth->h_source))
+			return RX_HANDLER_CONSUMED;
 		src = macvlan_hash_lookup(port, eth->h_source);
 		if (src && src->mode != MACVLAN_MODE_VEPA &&
 		    src->mode != MACVLAN_MODE_BRIDGE) {
@@ -482,7 +489,8 @@ static rx_handler_result_t macvlan_handle_frame(struct sk_buff **pskb)
 		return RX_HANDLER_PASS;
 	}
 
-	macvlan_forward_source(skb, port, eth->h_source);
+	if (macvlan_forward_source(skb, port, eth->h_source))
+		return RX_HANDLER_CONSUMED;
 	if (macvlan_passthru(port))
 		vlan = list_first_or_null_rcu(&port->vlans,
 					      struct macvlan_dev, list);
@@ -1286,7 +1294,8 @@ static int macvlan_validate(struct nlattr *tb[], struct nlattr *data[],
 		return 0;
 
 	if (data[IFLA_MACVLAN_FLAGS] &&
-	    nla_get_u16(data[IFLA_MACVLAN_FLAGS]) & ~MACVLAN_FLAG_NOPROMISC)
+	    nla_get_u16(data[IFLA_MACVLAN_FLAGS]) & ~(MACVLAN_FLAG_NOPROMISC |
+						      MACVLAN_FLAG_NODST))
 		return -EINVAL;
 
 	if (data[IFLA_MACVLAN_MODE]) {
diff --git a/include/uapi/linux/if_link.h b/include/uapi/linux/if_link.h
index 91c8dda6d95d..cd5b382a4138 100644
--- a/include/uapi/linux/if_link.h
+++ b/include/uapi/linux/if_link.h
@@ -614,6 +614,7 @@ enum macvlan_macaddr_mode {
 };
 
 #define MACVLAN_FLAG_NOPROMISC	1
+#define MACVLAN_FLAG_NODST	2 /* skip dst macvlan if matching src macvlan */
 
 /* VRF section */
 enum {
-- 
cgit v1.2.3-71-gd317


From d83e682e301071313e390e2f5ba2f6ca2ebc1848 Mon Sep 17 00:00:00 2001
From: Nick Kossifidis <mick@ics.forth.gr>
Date: Mon, 19 Apr 2021 03:55:35 +0300
Subject: RISC-V: Add EM_RISCV to kexec UAPI header

Add RISC-V to the list of supported kexec architectures, we need to
add the definition early-on so that later patches can use it.

EM_RISCV is 243 as per ELF psABI specification here:
https://github.com/riscv/riscv-elf-psabi-doc/blob/master/riscv-elf.md

Signed-off-by: Nick Kossifidis <mick@ics.forth.gr>
Signed-off-by: Palmer Dabbelt <palmerdabbelt@google.com>
---
 include/uapi/linux/kexec.h | 1 +
 1 file changed, 1 insertion(+)

(limited to 'include/uapi/linux')

diff --git a/include/uapi/linux/kexec.h b/include/uapi/linux/kexec.h
index 05669c87a0af..778dc191c265 100644
--- a/include/uapi/linux/kexec.h
+++ b/include/uapi/linux/kexec.h
@@ -42,6 +42,7 @@
 #define KEXEC_ARCH_MIPS_LE (10 << 16)
 #define KEXEC_ARCH_MIPS    ( 8 << 16)
 #define KEXEC_ARCH_AARCH64 (183 << 16)
+#define KEXEC_ARCH_RISCV   (243 << 16)
 
 /* The artificial cap on the number of segments passed to kexec_load. */
 #define KEXEC_SEGMENT_MAX 16
-- 
cgit v1.2.3-71-gd317


From aaa31047a6d25da0fa101da1ed544e1247949b40 Mon Sep 17 00:00:00 2001
From: Pablo Neira Ayuso <pablo@netfilter.org>
Date: Tue, 27 Apr 2021 18:05:55 +0200
Subject: netfilter: nftables: add catch-all set element support

This patch extends the set infrastructure to add a special catch-all set
element. If the lookup fails to find an element (or range) in the set,
then the catch-all element is selected. Users can specify a mapping,
expression(s) and timeout to be attached to the catch-all element.

This patch adds a catchall list to the set, this list might contain more
than one single catch-all element (e.g. in case that the catch-all
element is removed and a new one is added in the same transaction).
However, most of the time, there will be either one element or no
elements at all in this list.

The catch-all element is identified via NFT_SET_ELEM_CATCHALL flag and
such special element has no NFTA_SET_ELEM_KEY attribute. There is a new
nft_set_elem_catchall object that stores a reference to the dummy
catch-all element (catchall->elem) whose layout is the same of the set
element type to reuse the existing set element codebase.

The set size does not apply to the catch-all element, users can define a
catch-all element even if the set is full.

The check for valid set element flags hava been updates to report
EOPNOTSUPP in case userspace requests flags that are not supported when
using new userspace nftables and old kernel.

Signed-off-by: Pablo Neira Ayuso <pablo@netfilter.org>
---
 include/net/netfilter/nf_tables.h        |   5 +
 include/uapi/linux/netfilter/nf_tables.h |   2 +
 net/netfilter/nf_tables_api.c            | 480 +++++++++++++++++++++++++++----
 net/netfilter/nft_lookup.c               |  12 +-
 net/netfilter/nft_objref.c               |  11 +-
 net/netfilter/nft_set_hash.c             |   6 +
 net/netfilter/nft_set_pipapo.c           |   6 +-
 net/netfilter/nft_set_rbtree.c           |   6 +
 8 files changed, 465 insertions(+), 63 deletions(-)

(limited to 'include/uapi/linux')

diff --git a/include/net/netfilter/nf_tables.h b/include/net/netfilter/nf_tables.h
index eb708b77c4a5..27eeb613bb4e 100644
--- a/include/net/netfilter/nf_tables.h
+++ b/include/net/netfilter/nf_tables.h
@@ -497,6 +497,7 @@ struct nft_set {
 	u8				dlen;
 	u8				num_exprs;
 	struct nft_expr			*exprs[NFT_SET_EXPR_MAX];
+	struct list_head		catchall_list;
 	unsigned char			data[]
 		__attribute__((aligned(__alignof__(u64))));
 };
@@ -522,6 +523,10 @@ struct nft_set *nft_set_lookup_global(const struct net *net,
 				      const struct nlattr *nla_set_id,
 				      u8 genmask);
 
+struct nft_set_ext *nft_set_catchall_lookup(const struct net *net,
+					    const struct nft_set *set);
+void *nft_set_catchall_gc(const struct nft_set *set);
+
 static inline unsigned long nft_set_gc_interval(const struct nft_set *set)
 {
 	return set->gc_int ? msecs_to_jiffies(set->gc_int) : HZ;
diff --git a/include/uapi/linux/netfilter/nf_tables.h b/include/uapi/linux/netfilter/nf_tables.h
index 467365ed59a7..1fb4ca18ffbb 100644
--- a/include/uapi/linux/netfilter/nf_tables.h
+++ b/include/uapi/linux/netfilter/nf_tables.h
@@ -398,9 +398,11 @@ enum nft_set_attributes {
  * enum nft_set_elem_flags - nf_tables set element flags
  *
  * @NFT_SET_ELEM_INTERVAL_END: element ends the previous interval
+ * @NFT_SET_ELEM_CATCHALL: special catch-all element
  */
 enum nft_set_elem_flags {
 	NFT_SET_ELEM_INTERVAL_END	= 0x1,
+	NFT_SET_ELEM_CATCHALL		= 0x2,
 };
 
 /**
diff --git a/net/netfilter/nf_tables_api.c b/net/netfilter/nf_tables_api.c
index faf0424375e8..0b7fe0a902ff 100644
--- a/net/netfilter/nf_tables_api.c
+++ b/net/netfilter/nf_tables_api.c
@@ -4389,6 +4389,7 @@ static int nf_tables_newset(struct sk_buff *skb, const struct nfnl_info *info,
 	}
 
 	INIT_LIST_HEAD(&set->bindings);
+	INIT_LIST_HEAD(&set->catchall_list);
 	set->table = table;
 	write_pnet(&set->net, net);
 	set->ops   = ops;
@@ -4434,6 +4435,24 @@ err_set_name:
 	return err;
 }
 
+struct nft_set_elem_catchall {
+	struct list_head	list;
+	struct rcu_head		rcu;
+	void			*elem;
+};
+
+static void nft_set_catchall_destroy(const struct nft_ctx *ctx,
+				     struct nft_set *set)
+{
+	struct nft_set_elem_catchall *catchall;
+
+	list_for_each_entry_rcu(catchall, &set->catchall_list, list) {
+		list_del_rcu(&catchall->list);
+		nft_set_elem_destroy(set, catchall->elem, true);
+		kfree_rcu(catchall);
+	}
+}
+
 static void nft_set_destroy(const struct nft_ctx *ctx, struct nft_set *set)
 {
 	int i;
@@ -4445,6 +4464,7 @@ static void nft_set_destroy(const struct nft_ctx *ctx, struct nft_set *set)
 		nft_expr_destroy(ctx, set->exprs[i]);
 
 	set->ops->destroy(set);
+	nft_set_catchall_destroy(ctx, set);
 	kfree(set->name);
 	kvfree(set);
 }
@@ -4521,6 +4541,29 @@ static int nf_tables_bind_check_setelem(const struct nft_ctx *ctx,
 	return nft_setelem_data_validate(ctx, set, elem);
 }
 
+static int nft_set_catchall_bind_check(const struct nft_ctx *ctx,
+				       struct nft_set *set)
+{
+	u8 genmask = nft_genmask_next(ctx->net);
+	struct nft_set_elem_catchall *catchall;
+	struct nft_set_elem elem;
+	struct nft_set_ext *ext;
+	int ret = 0;
+
+	list_for_each_entry_rcu(catchall, &set->catchall_list, list) {
+		ext = nft_set_elem_ext(set, catchall->elem);
+		if (!nft_set_elem_active(ext, genmask))
+			continue;
+
+		elem.priv = catchall->elem;
+		ret = nft_setelem_data_validate(ctx, set, &elem);
+		if (ret < 0)
+			break;
+	}
+
+	return ret;
+}
+
 int nf_tables_bind_set(const struct nft_ctx *ctx, struct nft_set *set,
 		       struct nft_set_binding *binding)
 {
@@ -4550,6 +4593,9 @@ int nf_tables_bind_set(const struct nft_ctx *ctx, struct nft_set *set,
 		iter.fn		= nf_tables_bind_check_setelem;
 
 		set->ops->walk(ctx, set, &iter);
+		if (!iter.err)
+			iter.err = nft_set_catchall_bind_check(ctx, set);
+
 		if (iter.err < 0)
 			return iter.err;
 	}
@@ -4736,7 +4782,8 @@ static int nf_tables_fill_setelem(struct sk_buff *skb,
 	if (nest == NULL)
 		goto nla_put_failure;
 
-	if (nft_data_dump(skb, NFTA_SET_ELEM_KEY, nft_set_ext_key(ext),
+	if (nft_set_ext_exists(ext, NFT_SET_EXT_KEY) &&
+	    nft_data_dump(skb, NFTA_SET_ELEM_KEY, nft_set_ext_key(ext),
 			  NFT_DATA_VALUE, set->klen) < 0)
 		goto nla_put_failure;
 
@@ -4825,6 +4872,29 @@ struct nft_set_dump_ctx {
 	struct nft_ctx		ctx;
 };
 
+static int nft_set_catchall_dump(struct net *net, struct sk_buff *skb,
+				 const struct nft_set *set)
+{
+	struct nft_set_elem_catchall *catchall;
+	u8 genmask = nft_genmask_cur(net);
+	struct nft_set_elem elem;
+	struct nft_set_ext *ext;
+	int ret = 0;
+
+	list_for_each_entry_rcu(catchall, &set->catchall_list, list) {
+		ext = nft_set_elem_ext(set, catchall->elem);
+		if (!nft_set_elem_active(ext, genmask) ||
+		    nft_set_elem_expired(ext))
+			continue;
+
+		elem.priv = catchall->elem;
+		ret = nf_tables_fill_setelem(skb, set, &elem);
+		break;
+	}
+
+	return ret;
+}
+
 static int nf_tables_dump_set(struct sk_buff *skb, struct netlink_callback *cb)
 {
 	struct nft_set_dump_ctx *dump_ctx = cb->data;
@@ -4889,6 +4959,9 @@ static int nf_tables_dump_set(struct sk_buff *skb, struct netlink_callback *cb)
 	args.iter.err		= 0;
 	args.iter.fn		= nf_tables_dump_setelem;
 	set->ops->walk(&dump_ctx->ctx, set, &args.iter);
+
+	if (!args.iter.err && args.iter.count == cb->args[0])
+		args.iter.err = nft_set_catchall_dump(net, skb, set);
 	rcu_read_unlock();
 
 	nla_nest_end(skb, nest);
@@ -4968,8 +5041,8 @@ static int nft_setelem_parse_flags(const struct nft_set *set,
 		return 0;
 
 	*flags = ntohl(nla_get_be32(attr));
-	if (*flags & ~NFT_SET_ELEM_INTERVAL_END)
-		return -EINVAL;
+	if (*flags & ~(NFT_SET_ELEM_INTERVAL_END | NFT_SET_ELEM_CATCHALL))
+		return -EOPNOTSUPP;
 	if (!(set->flags & NFT_SET_INTERVAL) &&
 	    *flags & NFT_SET_ELEM_INTERVAL_END)
 		return -EINVAL;
@@ -5014,6 +5087,46 @@ static int nft_setelem_parse_data(struct nft_ctx *ctx, struct nft_set *set,
 	return 0;
 }
 
+static void *nft_setelem_catchall_get(const struct net *net,
+				      const struct nft_set *set)
+{
+	struct nft_set_elem_catchall *catchall;
+	u8 genmask = nft_genmask_cur(net);
+	struct nft_set_ext *ext;
+	void *priv = NULL;
+
+	list_for_each_entry_rcu(catchall, &set->catchall_list, list) {
+		ext = nft_set_elem_ext(set, catchall->elem);
+		if (!nft_set_elem_active(ext, genmask) ||
+		    nft_set_elem_expired(ext))
+			continue;
+
+		priv = catchall->elem;
+		break;
+	}
+
+	return priv;
+}
+
+static int nft_setelem_get(struct nft_ctx *ctx, struct nft_set *set,
+			   struct nft_set_elem *elem, u32 flags)
+{
+	void *priv;
+
+	if (!(flags & NFT_SET_ELEM_CATCHALL)) {
+		priv = set->ops->get(ctx->net, set, elem, flags);
+		if (IS_ERR(priv))
+			return PTR_ERR(priv);
+	} else {
+		priv = nft_setelem_catchall_get(ctx->net, set);
+		if (!priv)
+			return -ENOENT;
+	}
+	elem->priv = priv;
+
+	return 0;
+}
+
 static int nft_get_set_elem(struct nft_ctx *ctx, struct nft_set *set,
 			    const struct nlattr *attr)
 {
@@ -5021,7 +5134,6 @@ static int nft_get_set_elem(struct nft_ctx *ctx, struct nft_set *set,
 	struct nft_set_elem elem;
 	struct sk_buff *skb;
 	uint32_t flags = 0;
-	void *priv;
 	int err;
 
 	err = nla_parse_nested_deprecated(nla, NFTA_SET_ELEM_MAX, attr,
@@ -5029,17 +5141,19 @@ static int nft_get_set_elem(struct nft_ctx *ctx, struct nft_set *set,
 	if (err < 0)
 		return err;
 
-	if (!nla[NFTA_SET_ELEM_KEY])
-		return -EINVAL;
-
 	err = nft_setelem_parse_flags(set, nla[NFTA_SET_ELEM_FLAGS], &flags);
 	if (err < 0)
 		return err;
 
-	err = nft_setelem_parse_key(ctx, set, &elem.key.val,
-				    nla[NFTA_SET_ELEM_KEY]);
-	if (err < 0)
-		return err;
+	if (!nla[NFTA_SET_ELEM_KEY] && !(flags & NFT_SET_ELEM_CATCHALL))
+		return -EINVAL;
+
+	if (nla[NFTA_SET_ELEM_KEY]) {
+		err = nft_setelem_parse_key(ctx, set, &elem.key.val,
+					    nla[NFTA_SET_ELEM_KEY]);
+		if (err < 0)
+			return err;
+	}
 
 	if (nla[NFTA_SET_ELEM_KEY_END]) {
 		err = nft_setelem_parse_key(ctx, set, &elem.key_end.val,
@@ -5048,11 +5162,9 @@ static int nft_get_set_elem(struct nft_ctx *ctx, struct nft_set *set,
 			return err;
 	}
 
-	priv = set->ops->get(ctx->net, set, &elem, flags);
-	if (IS_ERR(priv))
-		return PTR_ERR(priv);
-
-	elem.priv = priv;
+	err = nft_setelem_get(ctx, set, &elem, flags);
+	if (err < 0)
+		return err;
 
 	err = -ENOMEM;
 	skb = nlmsg_new(NLMSG_GOODSIZE, GFP_ATOMIC);
@@ -5212,7 +5324,8 @@ void *nft_set_elem_init(const struct nft_set *set,
 	ext = nft_set_elem_ext(set, elem);
 	nft_set_ext_init(ext, tmpl);
 
-	memcpy(nft_set_ext_key(ext), key, set->klen);
+	if (nft_set_ext_exists(ext, NFT_SET_EXT_KEY))
+		memcpy(nft_set_ext_key(ext), key, set->klen);
 	if (nft_set_ext_exists(ext, NFT_SET_EXT_KEY_END))
 		memcpy(nft_set_ext_key_end(ext), key_end, set->klen);
 	if (nft_set_ext_exists(ext, NFT_SET_EXT_DATA))
@@ -5343,6 +5456,192 @@ err_elem_expr_setup:
 	return -ENOMEM;
 }
 
+struct nft_set_ext *nft_set_catchall_lookup(const struct net *net,
+					    const struct nft_set *set)
+{
+	struct nft_set_elem_catchall *catchall;
+	u8 genmask = nft_genmask_cur(net);
+	struct nft_set_ext *ext;
+
+	list_for_each_entry_rcu(catchall, &set->catchall_list, list) {
+		ext = nft_set_elem_ext(set, catchall->elem);
+		if (nft_set_elem_active(ext, genmask) &&
+		    !nft_set_elem_expired(ext))
+			return ext;
+	}
+
+	return NULL;
+}
+EXPORT_SYMBOL_GPL(nft_set_catchall_lookup);
+
+void *nft_set_catchall_gc(const struct nft_set *set)
+{
+	struct nft_set_elem_catchall *catchall, *next;
+	struct nft_set_ext *ext;
+	void *elem = NULL;
+
+	list_for_each_entry_safe(catchall, next, &set->catchall_list, list) {
+		ext = nft_set_elem_ext(set, catchall->elem);
+
+		if (!nft_set_elem_expired(ext) ||
+		    nft_set_elem_mark_busy(ext))
+			continue;
+
+		elem = catchall->elem;
+		list_del_rcu(&catchall->list);
+		kfree_rcu(catchall, rcu);
+		break;
+	}
+
+	return elem;
+}
+EXPORT_SYMBOL_GPL(nft_set_catchall_gc);
+
+static int nft_setelem_catchall_insert(const struct net *net,
+				       struct nft_set *set,
+				       const struct nft_set_elem *elem,
+				       struct nft_set_ext **pext)
+{
+	struct nft_set_elem_catchall *catchall;
+	u8 genmask = nft_genmask_next(net);
+	struct nft_set_ext *ext;
+
+	list_for_each_entry(catchall, &set->catchall_list, list) {
+		ext = nft_set_elem_ext(set, catchall->elem);
+		if (nft_set_elem_active(ext, genmask)) {
+			*pext = ext;
+			return -EEXIST;
+		}
+	}
+
+	catchall = kmalloc(sizeof(*catchall), GFP_KERNEL);
+	if (!catchall)
+		return -ENOMEM;
+
+	catchall->elem = elem->priv;
+	list_add_tail_rcu(&catchall->list, &set->catchall_list);
+
+	return 0;
+}
+
+static int nft_setelem_insert(const struct net *net,
+			      struct nft_set *set,
+			      const struct nft_set_elem *elem,
+			      struct nft_set_ext **ext, unsigned int flags)
+{
+	int ret;
+
+	if (flags & NFT_SET_ELEM_CATCHALL)
+		ret = nft_setelem_catchall_insert(net, set, elem, ext);
+	else
+		ret = set->ops->insert(net, set, elem, ext);
+
+	return ret;
+}
+
+static bool nft_setelem_is_catchall(const struct nft_set *set,
+				    const struct nft_set_elem *elem)
+{
+	struct nft_set_ext *ext = nft_set_elem_ext(set, elem->priv);
+
+	if (nft_set_ext_exists(ext, NFT_SET_EXT_FLAGS) &&
+	    *nft_set_ext_flags(ext) & NFT_SET_ELEM_CATCHALL)
+		return true;
+
+	return false;
+}
+
+static void nft_setelem_activate(struct net *net, struct nft_set *set,
+				 struct nft_set_elem *elem)
+{
+	struct nft_set_ext *ext = nft_set_elem_ext(set, elem->priv);
+
+	if (nft_setelem_is_catchall(set, elem)) {
+		nft_set_elem_change_active(net, set, ext);
+		nft_set_elem_clear_busy(ext);
+	} else {
+		set->ops->activate(net, set, elem);
+	}
+}
+
+static int nft_setelem_catchall_deactivate(const struct net *net,
+					   struct nft_set *set,
+					   struct nft_set_elem *elem)
+{
+	struct nft_set_elem_catchall *catchall;
+	struct nft_set_ext *ext;
+
+	list_for_each_entry(catchall, &set->catchall_list, list) {
+		ext = nft_set_elem_ext(set, catchall->elem);
+		if (!nft_is_active(net, ext) ||
+		    nft_set_elem_mark_busy(ext))
+			continue;
+
+		kfree(elem->priv);
+		elem->priv = catchall->elem;
+		nft_set_elem_change_active(net, set, ext);
+		return 0;
+	}
+
+	return -ENOENT;
+}
+
+static int __nft_setelem_deactivate(const struct net *net,
+				    struct nft_set *set,
+				    struct nft_set_elem *elem)
+{
+	void *priv;
+
+	priv = set->ops->deactivate(net, set, elem);
+	if (!priv)
+		return -ENOENT;
+
+	kfree(elem->priv);
+	elem->priv = priv;
+	set->ndeact++;
+
+	return 0;
+}
+
+static int nft_setelem_deactivate(const struct net *net,
+				  struct nft_set *set,
+				  struct nft_set_elem *elem, u32 flags)
+{
+	int ret;
+
+	if (flags & NFT_SET_ELEM_CATCHALL)
+		ret = nft_setelem_catchall_deactivate(net, set, elem);
+	else
+		ret = __nft_setelem_deactivate(net, set, elem);
+
+	return ret;
+}
+
+static void nft_setelem_catchall_remove(const struct net *net,
+					const struct nft_set *set,
+					const struct nft_set_elem *elem)
+{
+	struct nft_set_elem_catchall *catchall, *next;
+
+	list_for_each_entry_safe(catchall, next, &set->catchall_list, list) {
+		if (catchall->elem == elem->priv) {
+			list_del_rcu(&catchall->list);
+			kfree_rcu(catchall);
+			break;
+		}
+	}
+}
+
+static void nft_setelem_remove(const struct net *net,
+			       const struct nft_set *set,
+			       const struct nft_set_elem *elem)
+{
+	if (nft_setelem_is_catchall(set, elem))
+		nft_setelem_catchall_remove(net, set, elem);
+	else
+		set->ops->remove(net, set, elem);
+}
+
 static int nft_add_set_elem(struct nft_ctx *ctx, struct nft_set *set,
 			    const struct nlattr *attr, u32 nlmsg_flags)
 {
@@ -5369,14 +5668,15 @@ static int nft_add_set_elem(struct nft_ctx *ctx, struct nft_set *set,
 	if (err < 0)
 		return err;
 
-	if (nla[NFTA_SET_ELEM_KEY] == NULL)
-		return -EINVAL;
-
 	nft_set_ext_prepare(&tmpl);
 
 	err = nft_setelem_parse_flags(set, nla[NFTA_SET_ELEM_FLAGS], &flags);
 	if (err < 0)
 		return err;
+
+	if (!nla[NFTA_SET_ELEM_KEY] && !(flags & NFT_SET_ELEM_CATCHALL))
+		return -EINVAL;
+
 	if (flags != 0)
 		nft_set_ext_add(&tmpl, NFT_SET_EXT_FLAGS);
 
@@ -5481,12 +5781,14 @@ static int nft_add_set_elem(struct nft_ctx *ctx, struct nft_set *set,
 		num_exprs = set->num_exprs;
 	}
 
-	err = nft_setelem_parse_key(ctx, set, &elem.key.val,
-				    nla[NFTA_SET_ELEM_KEY]);
-	if (err < 0)
-		goto err_set_elem_expr;
+	if (nla[NFTA_SET_ELEM_KEY]) {
+		err = nft_setelem_parse_key(ctx, set, &elem.key.val,
+					    nla[NFTA_SET_ELEM_KEY]);
+		if (err < 0)
+			goto err_set_elem_expr;
 
-	nft_set_ext_add_length(&tmpl, NFT_SET_EXT_KEY, set->klen);
+		nft_set_ext_add_length(&tmpl, NFT_SET_EXT_KEY, set->klen);
+	}
 
 	if (nla[NFTA_SET_ELEM_KEY_END]) {
 		err = nft_setelem_parse_key(ctx, set, &elem.key_end.val,
@@ -5603,7 +5905,8 @@ static int nft_add_set_elem(struct nft_ctx *ctx, struct nft_set *set,
 	}
 
 	ext->genmask = nft_genmask_cur(ctx->net) | NFT_SET_ELEM_BUSY_MASK;
-	err = set->ops->insert(ctx->net, set, &elem, &ext2);
+
+	err = nft_setelem_insert(ctx->net, set, &elem, &ext2, flags);
 	if (err) {
 		if (err == -EEXIST) {
 			if (nft_set_ext_exists(ext, NFT_SET_EXT_DATA) ^
@@ -5630,7 +5933,7 @@ static int nft_add_set_elem(struct nft_ctx *ctx, struct nft_set *set,
 		goto err_element_clash;
 	}
 
-	if (set->size &&
+	if (!(flags & NFT_SET_ELEM_CATCHALL) && set->size &&
 	    !atomic_add_unless(&set->nelems, 1, set->size + set->ndeact)) {
 		err = -ENFILE;
 		goto err_set_full;
@@ -5641,7 +5944,7 @@ static int nft_add_set_elem(struct nft_ctx *ctx, struct nft_set *set,
 	return 0;
 
 err_set_full:
-	set->ops->remove(ctx->net, set, &elem);
+	nft_setelem_remove(ctx->net, set, &elem);
 err_element_clash:
 	kfree(trans);
 err_elem_expr:
@@ -5773,7 +6076,6 @@ static int nft_del_setelem(struct nft_ctx *ctx, struct nft_set *set,
 	struct nft_set_ext *ext;
 	struct nft_trans *trans;
 	u32 flags = 0;
-	void *priv;
 	int err;
 
 	err = nla_parse_nested_deprecated(nla, NFTA_SET_ELEM_MAX, attr,
@@ -5781,23 +6083,26 @@ static int nft_del_setelem(struct nft_ctx *ctx, struct nft_set *set,
 	if (err < 0)
 		return err;
 
-	if (nla[NFTA_SET_ELEM_KEY] == NULL)
+	err = nft_setelem_parse_flags(set, nla[NFTA_SET_ELEM_FLAGS], &flags);
+	if (err < 0)
+		return err;
+
+	if (!nla[NFTA_SET_ELEM_KEY] && !(flags & NFT_SET_ELEM_CATCHALL))
 		return -EINVAL;
 
 	nft_set_ext_prepare(&tmpl);
 
-	err = nft_setelem_parse_flags(set, nla[NFTA_SET_ELEM_FLAGS], &flags);
-	if (err < 0)
-		return err;
 	if (flags != 0)
 		nft_set_ext_add(&tmpl, NFT_SET_EXT_FLAGS);
 
-	err = nft_setelem_parse_key(ctx, set, &elem.key.val,
-				    nla[NFTA_SET_ELEM_KEY]);
-	if (err < 0)
-		return err;
+	if (nla[NFTA_SET_ELEM_KEY]) {
+		err = nft_setelem_parse_key(ctx, set, &elem.key.val,
+					    nla[NFTA_SET_ELEM_KEY]);
+		if (err < 0)
+			return err;
 
-	nft_set_ext_add_length(&tmpl, NFT_SET_EXT_KEY, set->klen);
+		nft_set_ext_add_length(&tmpl, NFT_SET_EXT_KEY, set->klen);
+	}
 
 	if (nla[NFTA_SET_ELEM_KEY_END]) {
 		err = nft_setelem_parse_key(ctx, set, &elem.key_end.val,
@@ -5823,13 +6128,9 @@ static int nft_del_setelem(struct nft_ctx *ctx, struct nft_set *set,
 	if (trans == NULL)
 		goto fail_trans;
 
-	priv = set->ops->deactivate(ctx->net, set, &elem);
-	if (priv == NULL) {
-		err = -ENOENT;
+	err = nft_setelem_deactivate(ctx->net, set, &elem, flags);
+	if (err < 0)
 		goto fail_ops;
-	}
-	kfree(elem.priv);
-	elem.priv = priv;
 
 	nft_setelem_data_deactivate(ctx->net, set, &elem);
 
@@ -5876,6 +6177,49 @@ err1:
 	return err;
 }
 
+static int __nft_set_catchall_flush(const struct nft_ctx *ctx,
+				    struct nft_set *set,
+				    struct nft_set_elem *elem)
+{
+	struct nft_trans *trans;
+
+	trans = nft_trans_alloc_gfp(ctx, NFT_MSG_DELSETELEM,
+				    sizeof(struct nft_trans_elem), GFP_KERNEL);
+	if (!trans)
+		return -ENOMEM;
+
+	nft_setelem_data_deactivate(ctx->net, set, elem);
+	nft_trans_elem_set(trans) = set;
+	nft_trans_elem(trans) = *elem;
+	nft_trans_commit_list_add_tail(ctx->net, trans);
+
+	return 0;
+}
+
+static int nft_set_catchall_flush(const struct nft_ctx *ctx,
+				  struct nft_set *set)
+{
+	u8 genmask = nft_genmask_next(ctx->net);
+	struct nft_set_elem_catchall *catchall;
+	struct nft_set_elem elem;
+	struct nft_set_ext *ext;
+	int ret = 0;
+
+	list_for_each_entry_rcu(catchall, &set->catchall_list, list) {
+		ext = nft_set_elem_ext(set, catchall->elem);
+		if (!nft_set_elem_active(ext, genmask) ||
+		    nft_set_elem_mark_busy(ext))
+			continue;
+
+		elem.priv = catchall->elem;
+		ret = __nft_set_catchall_flush(ctx, set, &elem);
+		if (ret < 0)
+			break;
+	}
+
+	return ret;
+}
+
 static int nft_set_flush(struct nft_ctx *ctx, struct nft_set *set, u8 genmask)
 {
 	struct nft_set_iter iter = {
@@ -5884,6 +6228,8 @@ static int nft_set_flush(struct nft_ctx *ctx, struct nft_set *set, u8 genmask)
 	};
 
 	set->ops->walk(ctx, set, &iter);
+	if (!iter.err)
+		iter.err = nft_set_catchall_flush(ctx, set);
 
 	return iter.err;
 }
@@ -5918,8 +6264,6 @@ static int nf_tables_delsetelem(struct sk_buff *skb,
 		err = nft_del_setelem(&ctx, set, attr);
 		if (err < 0)
 			break;
-
-		set->ndeact++;
 	}
 	return err;
 }
@@ -8270,7 +8614,7 @@ static int nf_tables_commit(struct net *net, struct sk_buff *skb)
 		case NFT_MSG_NEWSETELEM:
 			te = (struct nft_trans_elem *)trans->data;
 
-			te->set->ops->activate(net, te->set, &te->elem);
+			nft_setelem_activate(net, te->set, &te->elem);
 			nf_tables_setelem_notify(&trans->ctx, te->set,
 						 &te->elem,
 						 NFT_MSG_NEWSETELEM, 0);
@@ -8282,9 +8626,11 @@ static int nf_tables_commit(struct net *net, struct sk_buff *skb)
 			nf_tables_setelem_notify(&trans->ctx, te->set,
 						 &te->elem,
 						 NFT_MSG_DELSETELEM, 0);
-			te->set->ops->remove(net, te->set, &te->elem);
-			atomic_dec(&te->set->nelems);
-			te->set->ndeact--;
+			nft_setelem_remove(net, te->set, &te->elem);
+			if (!nft_setelem_is_catchall(te->set, &te->elem)) {
+				atomic_dec(&te->set->nelems);
+				te->set->ndeact--;
+			}
 			break;
 		case NFT_MSG_NEWOBJ:
 			if (nft_trans_obj_update(trans)) {
@@ -8485,15 +8831,17 @@ static int __nf_tables_abort(struct net *net, enum nfnl_abort_action action)
 				break;
 			}
 			te = (struct nft_trans_elem *)trans->data;
-			te->set->ops->remove(net, te->set, &te->elem);
-			atomic_dec(&te->set->nelems);
+			nft_setelem_remove(net, te->set, &te->elem);
+			if (!nft_setelem_is_catchall(te->set, &te->elem))
+				atomic_dec(&te->set->nelems);
 			break;
 		case NFT_MSG_DELSETELEM:
 			te = (struct nft_trans_elem *)trans->data;
 
 			nft_setelem_data_activate(net, te->set, &te->elem);
-			te->set->ops->activate(net, te->set, &te->elem);
-			te->set->ndeact--;
+			nft_setelem_activate(net, te->set, &te->elem);
+			if (!nft_setelem_is_catchall(te->set, &te->elem))
+				te->set->ndeact--;
 
 			nft_trans_destroy(trans);
 			break;
@@ -8672,6 +9020,27 @@ static int nf_tables_loop_check_setelem(const struct nft_ctx *ctx,
 	return nft_check_loops(ctx, ext);
 }
 
+static int nft_set_catchall_loops(const struct nft_ctx *ctx,
+				  struct nft_set *set)
+{
+	u8 genmask = nft_genmask_next(ctx->net);
+	struct nft_set_elem_catchall *catchall;
+	struct nft_set_ext *ext;
+	int ret = 0;
+
+	list_for_each_entry_rcu(catchall, &set->catchall_list, list) {
+		ext = nft_set_elem_ext(set, catchall->elem);
+		if (!nft_set_elem_active(ext, genmask))
+			continue;
+
+		ret = nft_check_loops(ctx, ext);
+		if (ret < 0)
+			return ret;
+	}
+
+	return ret;
+}
+
 static int nf_tables_check_loops(const struct nft_ctx *ctx,
 				 const struct nft_chain *chain)
 {
@@ -8731,6 +9100,9 @@ static int nf_tables_check_loops(const struct nft_ctx *ctx,
 			iter.fn		= nf_tables_loop_check_setelem;
 
 			set->ops->walk(ctx, set, &iter);
+			if (!iter.err)
+				iter.err = nft_set_catchall_loops(ctx, set);
+
 			if (iter.err < 0)
 				return iter.err;
 		}
diff --git a/net/netfilter/nft_lookup.c b/net/netfilter/nft_lookup.c
index b0f558b4fea5..a479f8a1270c 100644
--- a/net/netfilter/nft_lookup.c
+++ b/net/netfilter/nft_lookup.c
@@ -30,13 +30,17 @@ void nft_lookup_eval(const struct nft_expr *expr,
 	const struct nft_lookup *priv = nft_expr_priv(expr);
 	const struct nft_set *set = priv->set;
 	const struct nft_set_ext *ext = NULL;
+	const struct net *net = nft_net(pkt);
 	bool found;
 
-	found = set->ops->lookup(nft_net(pkt), set, &regs->data[priv->sreg],
-				 &ext) ^ priv->invert;
+	found = set->ops->lookup(net, set, &regs->data[priv->sreg], &ext) ^
+				 priv->invert;
 	if (!found) {
-		regs->verdict.code = NFT_BREAK;
-		return;
+		ext = nft_set_catchall_lookup(net, set);
+		if (!ext) {
+			regs->verdict.code = NFT_BREAK;
+			return;
+		}
 	}
 
 	if (ext) {
diff --git a/net/netfilter/nft_objref.c b/net/netfilter/nft_objref.c
index bc104d36d3bb..7e47edee88ee 100644
--- a/net/netfilter/nft_objref.c
+++ b/net/netfilter/nft_objref.c
@@ -105,15 +105,18 @@ static void nft_objref_map_eval(const struct nft_expr *expr,
 {
 	struct nft_objref_map *priv = nft_expr_priv(expr);
 	const struct nft_set *set = priv->set;
+	struct net *net = nft_net(pkt);
 	const struct nft_set_ext *ext;
 	struct nft_object *obj;
 	bool found;
 
-	found = set->ops->lookup(nft_net(pkt), set, &regs->data[priv->sreg],
-				 &ext);
+	found = set->ops->lookup(net, set, &regs->data[priv->sreg], &ext);
 	if (!found) {
-		regs->verdict.code = NFT_BREAK;
-		return;
+		ext = nft_set_catchall_lookup(net, set);
+		if (!ext) {
+			regs->verdict.code = NFT_BREAK;
+			return;
+		}
 	}
 	obj = *nft_set_ext_obj(ext);
 	obj->ops->eval(obj, regs, pkt);
diff --git a/net/netfilter/nft_set_hash.c b/net/netfilter/nft_set_hash.c
index bf618b7ec1ae..58f576abcd4a 100644
--- a/net/netfilter/nft_set_hash.c
+++ b/net/netfilter/nft_set_hash.c
@@ -350,6 +350,12 @@ needs_gc_run:
 	rhashtable_walk_stop(&hti);
 	rhashtable_walk_exit(&hti);
 
+	he = nft_set_catchall_gc(set);
+	if (he) {
+		gcb = nft_set_gc_batch_check(set, gcb, GFP_ATOMIC);
+		if (gcb)
+			nft_set_gc_batch_add(gcb, he);
+	}
 	nft_set_gc_batch_complete(gcb);
 	queue_delayed_work(system_power_efficient_wq, &priv->gc_work,
 			   nft_set_gc_interval(set));
diff --git a/net/netfilter/nft_set_pipapo.c b/net/netfilter/nft_set_pipapo.c
index 9944523f5c2c..528a2d7ca991 100644
--- a/net/netfilter/nft_set_pipapo.c
+++ b/net/netfilter/nft_set_pipapo.c
@@ -1529,11 +1529,11 @@ static void pipapo_gc(const struct nft_set *set, struct nft_pipapo_match *m)
 {
 	struct nft_pipapo *priv = nft_set_priv(set);
 	int rules_f0, first_rule = 0;
+	struct nft_pipapo_elem *e;
 
 	while ((rules_f0 = pipapo_rules_same_key(m->f, first_rule))) {
 		union nft_pipapo_map_bucket rulemap[NFT_PIPAPO_MAX_FIELDS];
 		struct nft_pipapo_field *f;
-		struct nft_pipapo_elem *e;
 		int i, start, rules_fx;
 
 		start = first_rule;
@@ -1569,6 +1569,10 @@ static void pipapo_gc(const struct nft_set *set, struct nft_pipapo_match *m)
 		}
 	}
 
+	e = nft_set_catchall_gc(set);
+	if (e)
+		nft_set_elem_destroy(set, e, true);
+
 	priv->last_gc = jiffies;
 }
 
diff --git a/net/netfilter/nft_set_rbtree.c b/net/netfilter/nft_set_rbtree.c
index 217ab3644c25..9e36eb4a7429 100644
--- a/net/netfilter/nft_set_rbtree.c
+++ b/net/netfilter/nft_set_rbtree.c
@@ -541,6 +541,12 @@ static void nft_rbtree_gc(struct work_struct *work)
 	write_seqcount_end(&priv->count);
 	write_unlock_bh(&priv->lock);
 
+	rbe = nft_set_catchall_gc(set);
+	if (rbe) {
+		gcb = nft_set_gc_batch_check(set, gcb, GFP_ATOMIC);
+		if (gcb)
+			nft_set_gc_batch_add(gcb, rbe);
+	}
 	nft_set_gc_batch_complete(gcb);
 
 	queue_delayed_work(system_power_efficient_wq, &priv->gc_work,
-- 
cgit v1.2.3-71-gd317


From e542d29ca81d005651680a0a697b72ca13ddc4cc Mon Sep 17 00:00:00 2001
From: Andreas Roeseler <andreas.a.roeseler@gmail.com>
Date: Tue, 27 Apr 2021 10:36:35 -0500
Subject: icmp: standardize naming of RFC 8335 PROBE constants

The current definitions of constants for PROBE, currently defined only
in the net-next kernel branch, are inconsistent, with
some beginning with ICMP and others with simply EXT. This patch
attempts to standardize the naming conventions of the constants for
PROBE before their release into a stable Kernel, and to update the
relevant definitions in net/ipv4/icmp.c.

Similarly, the definitions for the code field (previously
ICMP_EXT_MAL_QUERY, etc) use the same prefixes as the type field. This
patch adds _CODE_ to the prefix to clarify the distinction of these
constants.

Signed-off-by: Andreas Roeseler <andreas.a.roeseler@gmail.com>
Acked-by: David Ahern <dsahern@kernel.org>
Link: https://lore.kernel.org/r/20210427153635.2591-1-andreas.a.roeseler@gmail.com
Signed-off-by: Jakub Kicinski <kuba@kernel.org>
---
 include/uapi/linux/icmp.h | 28 ++++++++++++++--------------
 net/ipv4/icmp.c           | 16 ++++++++--------
 2 files changed, 22 insertions(+), 22 deletions(-)

(limited to 'include/uapi/linux')

diff --git a/include/uapi/linux/icmp.h b/include/uapi/linux/icmp.h
index 222325d1d80e..c1da8244c5e1 100644
--- a/include/uapi/linux/icmp.h
+++ b/include/uapi/linux/icmp.h
@@ -70,22 +70,22 @@
 #define ICMP_EXC_FRAGTIME	1	/* Fragment Reass time exceeded	*/
 
 /* Codes for EXT_ECHO (PROBE) */
-#define ICMP_EXT_ECHO		42
-#define ICMP_EXT_ECHOREPLY	43
-#define ICMP_EXT_MAL_QUERY	1	/* Malformed Query */
-#define ICMP_EXT_NO_IF		2	/* No such Interface */
-#define ICMP_EXT_NO_TABLE_ENT	3	/* No such Table Entry */
-#define ICMP_EXT_MULT_IFS	4	/* Multiple Interfaces Satisfy Query */
+#define ICMP_EXT_ECHO			42
+#define ICMP_EXT_ECHOREPLY		43
+#define ICMP_EXT_CODE_MAL_QUERY		1	/* Malformed Query */
+#define ICMP_EXT_CODE_NO_IF		2	/* No such Interface */
+#define ICMP_EXT_CODE_NO_TABLE_ENT	3	/* No such Table Entry */
+#define ICMP_EXT_CODE_MULT_IFS		4	/* Multiple Interfaces Satisfy Query */
 
 /* Constants for EXT_ECHO (PROBE) */
-#define EXT_ECHOREPLY_ACTIVE	(1 << 2)/* active bit in reply message */
-#define EXT_ECHOREPLY_IPV4	(1 << 1)/* ipv4 bit in reply message */
-#define EXT_ECHOREPLY_IPV6	1	/* ipv6 bit in reply message */
-#define EXT_ECHO_CTYPE_NAME	1
-#define EXT_ECHO_CTYPE_INDEX	2
-#define EXT_ECHO_CTYPE_ADDR	3
-#define ICMP_AFI_IP		1	/* Address Family Identifier for ipv4 */
-#define ICMP_AFI_IP6		2	/* Address Family Identifier for ipv6 */
+#define ICMP_EXT_ECHOREPLY_ACTIVE	(1 << 2)/* active bit in reply message */
+#define ICMP_EXT_ECHOREPLY_IPV4		(1 << 1)/* ipv4 bit in reply message */
+#define ICMP_EXT_ECHOREPLY_IPV6		1	/* ipv6 bit in reply message */
+#define ICMP_EXT_ECHO_CTYPE_NAME	1
+#define ICMP_EXT_ECHO_CTYPE_INDEX	2
+#define ICMP_EXT_ECHO_CTYPE_ADDR	3
+#define ICMP_AFI_IP			1	/* Address Family Identifier for ipv4 */
+#define ICMP_AFI_IP6			2	/* Address Family Identifier for ipv6 */
 
 struct icmphdr {
   __u8		type;
diff --git a/net/ipv4/icmp.c b/net/ipv4/icmp.c
index 8bd988fbcb31..7b6931a4d775 100644
--- a/net/ipv4/icmp.c
+++ b/net/ipv4/icmp.c
@@ -1033,7 +1033,7 @@ static bool icmp_echo(struct sk_buff *skb)
 	status = 0;
 	dev = NULL;
 	switch (iio->extobj_hdr.class_type) {
-	case EXT_ECHO_CTYPE_NAME:
+	case ICMP_EXT_ECHO_CTYPE_NAME:
 		iio = skb_header_pointer(skb, sizeof(_ext_hdr), sizeof(_iio), &_iio);
 		if (ident_len >= IFNAMSIZ)
 			goto send_mal_query;
@@ -1041,14 +1041,14 @@ static bool icmp_echo(struct sk_buff *skb)
 		memcpy(buff, &iio->ident.name, ident_len);
 		dev = dev_get_by_name(net, buff);
 		break;
-	case EXT_ECHO_CTYPE_INDEX:
+	case ICMP_EXT_ECHO_CTYPE_INDEX:
 		iio = skb_header_pointer(skb, sizeof(_ext_hdr), sizeof(iio->extobj_hdr) +
 					 sizeof(iio->ident.ifindex), &_iio);
 		if (ident_len != sizeof(iio->ident.ifindex))
 			goto send_mal_query;
 		dev = dev_get_by_index(net, ntohl(iio->ident.ifindex));
 		break;
-	case EXT_ECHO_CTYPE_ADDR:
+	case ICMP_EXT_ECHO_CTYPE_ADDR:
 		if (ident_len != sizeof(iio->ident.addr.ctype3_hdr) +
 				 iio->ident.addr.ctype3_hdr.addrlen)
 			goto send_mal_query;
@@ -1080,23 +1080,23 @@ static bool icmp_echo(struct sk_buff *skb)
 		goto send_mal_query;
 	}
 	if (!dev) {
-		icmp_param.data.icmph.code = ICMP_EXT_NO_IF;
+		icmp_param.data.icmph.code = ICMP_EXT_CODE_NO_IF;
 		goto send_reply;
 	}
 	/* Fill bits in reply message */
 	if (dev->flags & IFF_UP)
-		status |= EXT_ECHOREPLY_ACTIVE;
+		status |= ICMP_EXT_ECHOREPLY_ACTIVE;
 	if (__in_dev_get_rcu(dev) && __in_dev_get_rcu(dev)->ifa_list)
-		status |= EXT_ECHOREPLY_IPV4;
+		status |= ICMP_EXT_ECHOREPLY_IPV4;
 	if (!list_empty(&rcu_dereference(dev->ip6_ptr)->addr_list))
-		status |= EXT_ECHOREPLY_IPV6;
+		status |= ICMP_EXT_ECHOREPLY_IPV6;
 	dev_put(dev);
 	icmp_param.data.icmph.un.echo.sequence |= htons(status);
 send_reply:
 	icmp_reply(&icmp_param, skb);
 		return true;
 send_mal_query:
-	icmp_param.data.icmph.code = ICMP_EXT_MAL_QUERY;
+	icmp_param.data.icmph.code = ICMP_EXT_CODE_MAL_QUERY;
 	goto send_reply;
 }
 
-- 
cgit v1.2.3-71-gd317


From 94604548aa7163fa14b837149bb0cb708bc613bc Mon Sep 17 00:00:00 2001
From: Andrea Mayer <andrea.mayer@uniroma2.it>
Date: Tue, 27 Apr 2021 17:44:04 +0200
Subject: seg6: add counters support for SRv6 Behaviors

This patch provides counters for SRv6 Behaviors as defined in [1],
section 6. For each SRv6 Behavior instance, counters defined in [1] are:

 - the total number of packets that have been correctly processed;
 - the total amount of traffic in bytes of all packets that have been
   correctly processed;

In addition, this patch introduces a new counter that counts the number of
packets that have NOT been properly processed (i.e. errors) by an SRv6
Behavior instance.

Counters are not only interesting for network monitoring purposes (i.e.
counting the number of packets processed by a given behavior) but they also
provide a simple tool for checking whether a behavior instance is working
as we expect or not.
Counters can be useful for troubleshooting misconfigured SRv6 networks.
Indeed, an SRv6 Behavior can silently drop packets for very different
reasons (i.e. wrong SID configuration, interfaces set with SID addresses,
etc) without any notification/message to the user.

Due to the nature of SRv6 networks, diagnostic tools such as ping and
traceroute may be ineffective: paths used for reaching a given router can
be totally different from the ones followed by probe packets. In addition,
paths are often asymmetrical and this makes it even more difficult to keep
up with the journey of the packets and to understand which behaviors are
actually processing our traffic.

When counters are enabled on an SRv6 Behavior instance, it is possible to
verify if packets are actually processed by such behavior and what is the
outcome of the processing. Therefore, the counters for SRv6 Behaviors offer
an non-invasive observability point which can be leveraged for both traffic
monitoring and troubleshooting purposes.

[1] https://www.rfc-editor.org/rfc/rfc8986.html#name-counters

Troubleshooting using SRv6 Behavior counters
--------------------------------------------

Let's make a brief example to see how helpful counters can be for SRv6
networks. Let's consider a node where an SRv6 End Behavior receives an SRv6
packet whose Segment Left (SL) is equal to 0. In this case, the End
Behavior (which accepts only packets with SL >= 1) discards the packet and
increases the error counter.
This information can be leveraged by the network operator for
troubleshooting. Indeed, the error counter is telling the user that the
packet:

  (i) arrived at the node;
 (ii) the packet has been taken into account by the SRv6 End behavior;
(iii) but an error has occurred during the processing.

The error (iii) could be caused by different reasons, such as wrong route
settings on the node or due to an invalid SID List carried by the SRv6
packet. Anyway, the error counter is used to exclude that the packet did
not arrive at the node or it has not been processed by the behavior at
all.

Turning on/off counters for SRv6 Behaviors
------------------------------------------

Each SRv6 Behavior instance can be configured, at the time of its creation,
to make use of counters.
This is done through iproute2 which allows the user to create an SRv6
Behavior instance specifying the optional "count" attribute as shown in the
following example:

 $ ip -6 route add 2001:db8::1 encap seg6local action End count dev eth0

per-behavior counters can be shown by adding "-s" to the iproute2 command
line, i.e.:

 $ ip -s -6 route show 2001:db8::1
 2001:db8::1 encap seg6local action End packets 0 bytes 0 errors 0 dev eth0

~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~

Impact of counters for SRv6 Behaviors on performance
====================================================

To determine the performance impact due to the introduction of counters in
the SRv6 Behavior subsystem, we have carried out extensive tests.

We chose to test the throughput achieved by the SRv6 End.DX2 Behavior
because, among all the other behaviors implemented so far, it reaches the
highest throughput which is around 1.5 Mpps (per core at 2.4 GHz on a
Xeon(R) CPU E5-2630 v3) on kernel 5.12-rc2 using packets of size ~ 100
bytes.

Three different tests were conducted in order to evaluate the overall
throughput of the SRv6 End.DX2 Behavior in the following scenarios:

 1) vanilla kernel (without the SRv6 Behavior counters patch) and a single
    instance of an SRv6 End.DX2 Behavior;
 2) patched kernel with SRv6 Behavior counters and a single instance of
    an SRv6 End.DX2 Behavior with counters turned off;
 3) patched kernel with SRv6 Behavior counters and a single instance of
    SRv6 End.DX2 Behavior with counters turned on.

All tests were performed on a testbed deployed on the CloudLab facilities
[2], a flexible infrastructure dedicated to scientific research on the
future of Cloud Computing.

Results of tests are shown in the following table:

Scenario (1): average 1504764,81 pps (~1504,76 kpps); std. dev 3956,82 pps
Scenario (2): average 1501469,78 pps (~1501,47 kpps); std. dev 2979,85 pps
Scenario (3): average 1501315,13 pps (~1501,32 kpps); std. dev 2956,00 pps

As can be observed, throughputs achieved in scenarios (2),(3) did not
suffer any observable degradation compared to scenario (1).

Thanks to Jakub Kicinski and David Ahern for their valuable suggestions
and comments provided during the discussion of the proposed RFCs.

[2] https://www.cloudlab.us

Signed-off-by: Andrea Mayer <andrea.mayer@uniroma2.it>
Reviewed-by: David Ahern <dsahern@kernel.org>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 include/uapi/linux/seg6_local.h |  30 ++++++
 net/ipv6/seg6_local.c           | 198 +++++++++++++++++++++++++++++++++++++++-
 2 files changed, 226 insertions(+), 2 deletions(-)

(limited to 'include/uapi/linux')

diff --git a/include/uapi/linux/seg6_local.h b/include/uapi/linux/seg6_local.h
index 3b39ef1dbb46..5ae3ace84de0 100644
--- a/include/uapi/linux/seg6_local.h
+++ b/include/uapi/linux/seg6_local.h
@@ -27,6 +27,7 @@ enum {
 	SEG6_LOCAL_OIF,
 	SEG6_LOCAL_BPF,
 	SEG6_LOCAL_VRFTABLE,
+	SEG6_LOCAL_COUNTERS,
 	__SEG6_LOCAL_MAX,
 };
 #define SEG6_LOCAL_MAX (__SEG6_LOCAL_MAX - 1)
@@ -78,4 +79,33 @@ enum {
 
 #define SEG6_LOCAL_BPF_PROG_MAX (__SEG6_LOCAL_BPF_PROG_MAX - 1)
 
+/* SRv6 Behavior counters are encoded as netlink attributes guaranteeing the
+ * correct alignment.
+ * Each counter is identified by a different attribute type (i.e.
+ * SEG6_LOCAL_CNT_PACKETS).
+ *
+ * - SEG6_LOCAL_CNT_PACKETS: identifies a counter that counts the number of
+ *   packets that have been CORRECTLY processed by an SRv6 Behavior instance
+ *   (i.e., packets that generate errors or are dropped are NOT counted).
+ *
+ * - SEG6_LOCAL_CNT_BYTES: identifies a counter that counts the total amount
+ *   of traffic in bytes of all packets that have been CORRECTLY processed by
+ *   an SRv6 Behavior instance (i.e., packets that generate errors or are
+ *   dropped are NOT counted).
+ *
+ * - SEG6_LOCAL_CNT_ERRORS: identifies a counter that counts the number of
+ *   packets that have NOT been properly processed by an SRv6 Behavior instance
+ *   (i.e., packets that generate errors or are dropped).
+ */
+enum {
+	SEG6_LOCAL_CNT_UNSPEC,
+	SEG6_LOCAL_CNT_PAD,		/* pad for 64 bits values */
+	SEG6_LOCAL_CNT_PACKETS,
+	SEG6_LOCAL_CNT_BYTES,
+	SEG6_LOCAL_CNT_ERRORS,
+	__SEG6_LOCAL_CNT_MAX,
+};
+
+#define SEG6_LOCAL_CNT_MAX (__SEG6_LOCAL_CNT_MAX - 1)
+
 #endif
diff --git a/net/ipv6/seg6_local.c b/net/ipv6/seg6_local.c
index bd7140885e60..4ff38cb08f4b 100644
--- a/net/ipv6/seg6_local.c
+++ b/net/ipv6/seg6_local.c
@@ -93,6 +93,35 @@ struct seg6_end_dt_info {
 	int hdrlen;
 };
 
+struct pcpu_seg6_local_counters {
+	u64_stats_t packets;
+	u64_stats_t bytes;
+	u64_stats_t errors;
+
+	struct u64_stats_sync syncp;
+};
+
+/* This struct groups all the SRv6 Behavior counters supported so far.
+ *
+ * put_nla_counters() makes use of this data structure to collect all counter
+ * values after the per-CPU counter evaluation has been performed.
+ * Finally, each counter value (in seg6_local_counters) is stored in the
+ * corresponding netlink attribute and sent to user space.
+ *
+ * NB: we don't want to expose this structure to user space!
+ */
+struct seg6_local_counters {
+	__u64 packets;
+	__u64 bytes;
+	__u64 errors;
+};
+
+#define seg6_local_alloc_pcpu_counters(__gfp)				\
+	__netdev_alloc_pcpu_stats(struct pcpu_seg6_local_counters,	\
+				  ((__gfp) | __GFP_ZERO))
+
+#define SEG6_F_LOCAL_COUNTERS	SEG6_F_ATTR(SEG6_LOCAL_COUNTERS)
+
 struct seg6_local_lwt {
 	int action;
 	struct ipv6_sr_hdr *srh;
@@ -105,6 +134,7 @@ struct seg6_local_lwt {
 #ifdef CONFIG_NET_L3_MASTER_DEV
 	struct seg6_end_dt_info dt_info;
 #endif
+	struct pcpu_seg6_local_counters __percpu *pcpu_counters;
 
 	int headroom;
 	struct seg6_action_desc *desc;
@@ -878,36 +908,43 @@ static struct seg6_action_desc seg6_action_table[] = {
 	{
 		.action		= SEG6_LOCAL_ACTION_END,
 		.attrs		= 0,
+		.optattrs	= SEG6_F_LOCAL_COUNTERS,
 		.input		= input_action_end,
 	},
 	{
 		.action		= SEG6_LOCAL_ACTION_END_X,
 		.attrs		= SEG6_F_ATTR(SEG6_LOCAL_NH6),
+		.optattrs	= SEG6_F_LOCAL_COUNTERS,
 		.input		= input_action_end_x,
 	},
 	{
 		.action		= SEG6_LOCAL_ACTION_END_T,
 		.attrs		= SEG6_F_ATTR(SEG6_LOCAL_TABLE),
+		.optattrs	= SEG6_F_LOCAL_COUNTERS,
 		.input		= input_action_end_t,
 	},
 	{
 		.action		= SEG6_LOCAL_ACTION_END_DX2,
 		.attrs		= SEG6_F_ATTR(SEG6_LOCAL_OIF),
+		.optattrs	= SEG6_F_LOCAL_COUNTERS,
 		.input		= input_action_end_dx2,
 	},
 	{
 		.action		= SEG6_LOCAL_ACTION_END_DX6,
 		.attrs		= SEG6_F_ATTR(SEG6_LOCAL_NH6),
+		.optattrs	= SEG6_F_LOCAL_COUNTERS,
 		.input		= input_action_end_dx6,
 	},
 	{
 		.action		= SEG6_LOCAL_ACTION_END_DX4,
 		.attrs		= SEG6_F_ATTR(SEG6_LOCAL_NH4),
+		.optattrs	= SEG6_F_LOCAL_COUNTERS,
 		.input		= input_action_end_dx4,
 	},
 	{
 		.action		= SEG6_LOCAL_ACTION_END_DT4,
 		.attrs		= SEG6_F_ATTR(SEG6_LOCAL_VRFTABLE),
+		.optattrs	= SEG6_F_LOCAL_COUNTERS,
 #ifdef CONFIG_NET_L3_MASTER_DEV
 		.input		= input_action_end_dt4,
 		.slwt_ops	= {
@@ -919,30 +956,35 @@ static struct seg6_action_desc seg6_action_table[] = {
 		.action		= SEG6_LOCAL_ACTION_END_DT6,
 #ifdef CONFIG_NET_L3_MASTER_DEV
 		.attrs		= 0,
-		.optattrs	= SEG6_F_ATTR(SEG6_LOCAL_TABLE) |
+		.optattrs	= SEG6_F_LOCAL_COUNTERS		|
+				  SEG6_F_ATTR(SEG6_LOCAL_TABLE) |
 				  SEG6_F_ATTR(SEG6_LOCAL_VRFTABLE),
 		.slwt_ops	= {
 					.build_state = seg6_end_dt6_build,
 				  },
 #else
 		.attrs		= SEG6_F_ATTR(SEG6_LOCAL_TABLE),
+		.optattrs	= SEG6_F_LOCAL_COUNTERS,
 #endif
 		.input		= input_action_end_dt6,
 	},
 	{
 		.action		= SEG6_LOCAL_ACTION_END_B6,
 		.attrs		= SEG6_F_ATTR(SEG6_LOCAL_SRH),
+		.optattrs	= SEG6_F_LOCAL_COUNTERS,
 		.input		= input_action_end_b6,
 	},
 	{
 		.action		= SEG6_LOCAL_ACTION_END_B6_ENCAP,
 		.attrs		= SEG6_F_ATTR(SEG6_LOCAL_SRH),
+		.optattrs	= SEG6_F_LOCAL_COUNTERS,
 		.input		= input_action_end_b6_encap,
 		.static_headroom	= sizeof(struct ipv6hdr),
 	},
 	{
 		.action		= SEG6_LOCAL_ACTION_END_BPF,
 		.attrs		= SEG6_F_ATTR(SEG6_LOCAL_BPF),
+		.optattrs	= SEG6_F_LOCAL_COUNTERS,
 		.input		= input_action_end_bpf,
 	},
 
@@ -963,11 +1005,36 @@ static struct seg6_action_desc *__get_action_desc(int action)
 	return NULL;
 }
 
+static bool seg6_lwtunnel_counters_enabled(struct seg6_local_lwt *slwt)
+{
+	return slwt->parsed_optattrs & SEG6_F_LOCAL_COUNTERS;
+}
+
+static void seg6_local_update_counters(struct seg6_local_lwt *slwt,
+				       unsigned int len, int err)
+{
+	struct pcpu_seg6_local_counters *pcounters;
+
+	pcounters = this_cpu_ptr(slwt->pcpu_counters);
+	u64_stats_update_begin(&pcounters->syncp);
+
+	if (likely(!err)) {
+		u64_stats_inc(&pcounters->packets);
+		u64_stats_add(&pcounters->bytes, len);
+	} else {
+		u64_stats_inc(&pcounters->errors);
+	}
+
+	u64_stats_update_end(&pcounters->syncp);
+}
+
 static int seg6_local_input(struct sk_buff *skb)
 {
 	struct dst_entry *orig_dst = skb_dst(skb);
 	struct seg6_action_desc *desc;
 	struct seg6_local_lwt *slwt;
+	unsigned int len = skb->len;
+	int rc;
 
 	if (skb->protocol != htons(ETH_P_IPV6)) {
 		kfree_skb(skb);
@@ -977,7 +1044,14 @@ static int seg6_local_input(struct sk_buff *skb)
 	slwt = seg6_local_lwtunnel(orig_dst->lwtstate);
 	desc = slwt->desc;
 
-	return desc->input(skb, slwt);
+	rc = desc->input(skb, slwt);
+
+	if (!seg6_lwtunnel_counters_enabled(slwt))
+		return rc;
+
+	seg6_local_update_counters(slwt, len, rc);
+
+	return rc;
 }
 
 static const struct nla_policy seg6_local_policy[SEG6_LOCAL_MAX + 1] = {
@@ -992,6 +1066,7 @@ static const struct nla_policy seg6_local_policy[SEG6_LOCAL_MAX + 1] = {
 	[SEG6_LOCAL_IIF]	= { .type = NLA_U32 },
 	[SEG6_LOCAL_OIF]	= { .type = NLA_U32 },
 	[SEG6_LOCAL_BPF]	= { .type = NLA_NESTED },
+	[SEG6_LOCAL_COUNTERS]	= { .type = NLA_NESTED },
 };
 
 static int parse_nla_srh(struct nlattr **attrs, struct seg6_local_lwt *slwt)
@@ -1296,6 +1371,112 @@ static void destroy_attr_bpf(struct seg6_local_lwt *slwt)
 		bpf_prog_put(slwt->bpf.prog);
 }
 
+static const struct
+nla_policy seg6_local_counters_policy[SEG6_LOCAL_CNT_MAX + 1] = {
+	[SEG6_LOCAL_CNT_PACKETS]	= { .type = NLA_U64 },
+	[SEG6_LOCAL_CNT_BYTES]		= { .type = NLA_U64 },
+	[SEG6_LOCAL_CNT_ERRORS]		= { .type = NLA_U64 },
+};
+
+static int parse_nla_counters(struct nlattr **attrs,
+			      struct seg6_local_lwt *slwt)
+{
+	struct pcpu_seg6_local_counters __percpu *pcounters;
+	struct nlattr *tb[SEG6_LOCAL_CNT_MAX + 1];
+	int ret;
+
+	ret = nla_parse_nested_deprecated(tb, SEG6_LOCAL_CNT_MAX,
+					  attrs[SEG6_LOCAL_COUNTERS],
+					  seg6_local_counters_policy, NULL);
+	if (ret < 0)
+		return ret;
+
+	/* basic support for SRv6 Behavior counters requires at least:
+	 * packets, bytes and errors.
+	 */
+	if (!tb[SEG6_LOCAL_CNT_PACKETS] || !tb[SEG6_LOCAL_CNT_BYTES] ||
+	    !tb[SEG6_LOCAL_CNT_ERRORS])
+		return -EINVAL;
+
+	/* counters are always zero initialized */
+	pcounters = seg6_local_alloc_pcpu_counters(GFP_KERNEL);
+	if (!pcounters)
+		return -ENOMEM;
+
+	slwt->pcpu_counters = pcounters;
+
+	return 0;
+}
+
+static int seg6_local_fill_nla_counters(struct sk_buff *skb,
+					struct seg6_local_counters *counters)
+{
+	if (nla_put_u64_64bit(skb, SEG6_LOCAL_CNT_PACKETS, counters->packets,
+			      SEG6_LOCAL_CNT_PAD))
+		return -EMSGSIZE;
+
+	if (nla_put_u64_64bit(skb, SEG6_LOCAL_CNT_BYTES, counters->bytes,
+			      SEG6_LOCAL_CNT_PAD))
+		return -EMSGSIZE;
+
+	if (nla_put_u64_64bit(skb, SEG6_LOCAL_CNT_ERRORS, counters->errors,
+			      SEG6_LOCAL_CNT_PAD))
+		return -EMSGSIZE;
+
+	return 0;
+}
+
+static int put_nla_counters(struct sk_buff *skb, struct seg6_local_lwt *slwt)
+{
+	struct seg6_local_counters counters = { 0, 0, 0 };
+	struct nlattr *nest;
+	int rc, i;
+
+	nest = nla_nest_start(skb, SEG6_LOCAL_COUNTERS);
+	if (!nest)
+		return -EMSGSIZE;
+
+	for_each_possible_cpu(i) {
+		struct pcpu_seg6_local_counters *pcounters;
+		u64 packets, bytes, errors;
+		unsigned int start;
+
+		pcounters = per_cpu_ptr(slwt->pcpu_counters, i);
+		do {
+			start = u64_stats_fetch_begin_irq(&pcounters->syncp);
+
+			packets = u64_stats_read(&pcounters->packets);
+			bytes = u64_stats_read(&pcounters->bytes);
+			errors = u64_stats_read(&pcounters->errors);
+
+		} while (u64_stats_fetch_retry_irq(&pcounters->syncp, start));
+
+		counters.packets += packets;
+		counters.bytes += bytes;
+		counters.errors += errors;
+	}
+
+	rc = seg6_local_fill_nla_counters(skb, &counters);
+	if (rc < 0) {
+		nla_nest_cancel(skb, nest);
+		return rc;
+	}
+
+	return nla_nest_end(skb, nest);
+}
+
+static int cmp_nla_counters(struct seg6_local_lwt *a, struct seg6_local_lwt *b)
+{
+	/* a and b are equal if both have pcpu_counters set or not */
+	return (!!((unsigned long)a->pcpu_counters)) ^
+		(!!((unsigned long)b->pcpu_counters));
+}
+
+static void destroy_attr_counters(struct seg6_local_lwt *slwt)
+{
+	free_percpu(slwt->pcpu_counters);
+}
+
 struct seg6_action_param {
 	int (*parse)(struct nlattr **attrs, struct seg6_local_lwt *slwt);
 	int (*put)(struct sk_buff *skb, struct seg6_local_lwt *slwt);
@@ -1343,6 +1524,10 @@ static struct seg6_action_param seg6_action_params[SEG6_LOCAL_MAX + 1] = {
 				    .put = put_nla_vrftable,
 				    .cmp = cmp_nla_vrftable },
 
+	[SEG6_LOCAL_COUNTERS]	= { .parse = parse_nla_counters,
+				    .put = put_nla_counters,
+				    .cmp = cmp_nla_counters,
+				    .destroy = destroy_attr_counters },
 };
 
 /* call the destroy() callback (if available) for each set attribute in
@@ -1645,6 +1830,15 @@ static int seg6_local_get_encap_size(struct lwtunnel_state *lwt)
 	if (attrs & SEG6_F_ATTR(SEG6_LOCAL_VRFTABLE))
 		nlsize += nla_total_size(4);
 
+	if (attrs & SEG6_F_LOCAL_COUNTERS)
+		nlsize += nla_total_size(0) + /* nest SEG6_LOCAL_COUNTERS */
+			  /* SEG6_LOCAL_CNT_PACKETS */
+			  nla_total_size_64bit(sizeof(__u64)) +
+			  /* SEG6_LOCAL_CNT_BYTES */
+			  nla_total_size_64bit(sizeof(__u64)) +
+			  /* SEG6_LOCAL_CNT_ERRORS */
+			  nla_total_size_64bit(sizeof(__u64));
+
 	return nlsize;
 }
 
-- 
cgit v1.2.3-71-gd317


From c7d13358b6a2f49f81a34aa323a2d0878a0532a2 Mon Sep 17 00:00:00 2001
From: Pablo Neira Ayuso <pablo@netfilter.org>
Date: Fri, 30 Apr 2021 14:00:13 +0200
Subject: netfilter: xt_SECMARK: add new revision to fix structure layout

This extension breaks when trying to delete rules, add a new revision to
fix this.

Fixes: 5e6874cdb8de ("[SECMARK]: Add xtables SECMARK target")
Signed-off-by: Phil Sutter <phil@nwl.cc>
Signed-off-by: Pablo Neira Ayuso <pablo@netfilter.org>
---
 include/uapi/linux/netfilter/xt_SECMARK.h |  6 +++
 net/netfilter/xt_SECMARK.c                | 88 ++++++++++++++++++++++++-------
 2 files changed, 75 insertions(+), 19 deletions(-)

(limited to 'include/uapi/linux')

diff --git a/include/uapi/linux/netfilter/xt_SECMARK.h b/include/uapi/linux/netfilter/xt_SECMARK.h
index 1f2a708413f5..beb2cadba8a9 100644
--- a/include/uapi/linux/netfilter/xt_SECMARK.h
+++ b/include/uapi/linux/netfilter/xt_SECMARK.h
@@ -20,4 +20,10 @@ struct xt_secmark_target_info {
 	char secctx[SECMARK_SECCTX_MAX];
 };
 
+struct xt_secmark_target_info_v1 {
+	__u8 mode;
+	char secctx[SECMARK_SECCTX_MAX];
+	__u32 secid;
+};
+
 #endif /*_XT_SECMARK_H_target */
diff --git a/net/netfilter/xt_SECMARK.c b/net/netfilter/xt_SECMARK.c
index 75625d13e976..498a0bf6f044 100644
--- a/net/netfilter/xt_SECMARK.c
+++ b/net/netfilter/xt_SECMARK.c
@@ -24,10 +24,9 @@ MODULE_ALIAS("ip6t_SECMARK");
 static u8 mode;
 
 static unsigned int
-secmark_tg(struct sk_buff *skb, const struct xt_action_param *par)
+secmark_tg(struct sk_buff *skb, const struct xt_secmark_target_info_v1 *info)
 {
 	u32 secmark = 0;
-	const struct xt_secmark_target_info *info = par->targinfo;
 
 	switch (mode) {
 	case SECMARK_MODE_SEL:
@@ -41,7 +40,7 @@ secmark_tg(struct sk_buff *skb, const struct xt_action_param *par)
 	return XT_CONTINUE;
 }
 
-static int checkentry_lsm(struct xt_secmark_target_info *info)
+static int checkentry_lsm(struct xt_secmark_target_info_v1 *info)
 {
 	int err;
 
@@ -73,15 +72,15 @@ static int checkentry_lsm(struct xt_secmark_target_info *info)
 	return 0;
 }
 
-static int secmark_tg_check(const struct xt_tgchk_param *par)
+static int
+secmark_tg_check(const char *table, struct xt_secmark_target_info_v1 *info)
 {
-	struct xt_secmark_target_info *info = par->targinfo;
 	int err;
 
-	if (strcmp(par->table, "mangle") != 0 &&
-	    strcmp(par->table, "security") != 0) {
+	if (strcmp(table, "mangle") != 0 &&
+	    strcmp(table, "security") != 0) {
 		pr_info_ratelimited("only valid in \'mangle\' or \'security\' table, not \'%s\'\n",
-				    par->table);
+				    table);
 		return -EINVAL;
 	}
 
@@ -116,25 +115,76 @@ static void secmark_tg_destroy(const struct xt_tgdtor_param *par)
 	}
 }
 
-static struct xt_target secmark_tg_reg __read_mostly = {
-	.name       = "SECMARK",
-	.revision   = 0,
-	.family     = NFPROTO_UNSPEC,
-	.checkentry = secmark_tg_check,
-	.destroy    = secmark_tg_destroy,
-	.target     = secmark_tg,
-	.targetsize = sizeof(struct xt_secmark_target_info),
-	.me         = THIS_MODULE,
+static int secmark_tg_check_v0(const struct xt_tgchk_param *par)
+{
+	struct xt_secmark_target_info *info = par->targinfo;
+	struct xt_secmark_target_info_v1 newinfo = {
+		.mode	= info->mode,
+	};
+	int ret;
+
+	memcpy(newinfo.secctx, info->secctx, SECMARK_SECCTX_MAX);
+
+	ret = secmark_tg_check(par->table, &newinfo);
+	info->secid = newinfo.secid;
+
+	return ret;
+}
+
+static unsigned int
+secmark_tg_v0(struct sk_buff *skb, const struct xt_action_param *par)
+{
+	const struct xt_secmark_target_info *info = par->targinfo;
+	struct xt_secmark_target_info_v1 newinfo = {
+		.secid	= info->secid,
+	};
+
+	return secmark_tg(skb, &newinfo);
+}
+
+static int secmark_tg_check_v1(const struct xt_tgchk_param *par)
+{
+	return secmark_tg_check(par->table, par->targinfo);
+}
+
+static unsigned int
+secmark_tg_v1(struct sk_buff *skb, const struct xt_action_param *par)
+{
+	return secmark_tg(skb, par->targinfo);
+}
+
+static struct xt_target secmark_tg_reg[] __read_mostly = {
+	{
+		.name		= "SECMARK",
+		.revision	= 0,
+		.family		= NFPROTO_UNSPEC,
+		.checkentry	= secmark_tg_check_v0,
+		.destroy	= secmark_tg_destroy,
+		.target		= secmark_tg_v0,
+		.targetsize	= sizeof(struct xt_secmark_target_info),
+		.me		= THIS_MODULE,
+	},
+	{
+		.name		= "SECMARK",
+		.revision	= 1,
+		.family		= NFPROTO_UNSPEC,
+		.checkentry	= secmark_tg_check_v1,
+		.destroy	= secmark_tg_destroy,
+		.target		= secmark_tg_v1,
+		.targetsize	= sizeof(struct xt_secmark_target_info_v1),
+		.usersize	= offsetof(struct xt_secmark_target_info_v1, secid),
+		.me		= THIS_MODULE,
+	},
 };
 
 static int __init secmark_tg_init(void)
 {
-	return xt_register_target(&secmark_tg_reg);
+	return xt_register_targets(secmark_tg_reg, ARRAY_SIZE(secmark_tg_reg));
 }
 
 static void __exit secmark_tg_exit(void)
 {
-	xt_unregister_target(&secmark_tg_reg);
+	xt_unregister_targets(secmark_tg_reg, ARRAY_SIZE(secmark_tg_reg));
 }
 
 module_init(secmark_tg_init);
-- 
cgit v1.2.3-71-gd317


From 77b8aeb9da0490357f1f5a2b0d12125e6332c37a Mon Sep 17 00:00:00 2001
From: Alex Williamson <alex.williamson@redhat.com>
Date: Tue, 4 May 2021 09:52:02 -0600
Subject: vfio/pci: Revert nvlink removal uAPI breakage

Revert the uAPI changes from the below commit with notice that these
regions and capabilities are no longer provided.

Fixes: b392a1989170 ("vfio/pci: remove vfio_pci_nvlink2")
Reported-by: Greg Kurz <groug@kaod.org>
Signed-off-by: Alex Williamson <alex.williamson@redhat.com>
Reviewed-by: Cornelia Huck <cohuck@redhat.com>
Reviewed-by: Greg Kurz <groug@kaod.org>
Tested-by: Greg Kurz <groug@kaod.org>
Reviewed-by: Christoph Hellwig <hch@lst.de>
Message-Id: <162014341432.3807030.11054087109120670135.stgit@omen>
---
 include/uapi/linux/vfio.h | 46 ++++++++++++++++++++++++++++++++++++++++++----
 1 file changed, 42 insertions(+), 4 deletions(-)

(limited to 'include/uapi/linux')

diff --git a/include/uapi/linux/vfio.h b/include/uapi/linux/vfio.h
index 34b1f53a3901..ef33ea002b0b 100644
--- a/include/uapi/linux/vfio.h
+++ b/include/uapi/linux/vfio.h
@@ -333,10 +333,21 @@ struct vfio_region_info_cap_type {
 #define VFIO_REGION_SUBTYPE_INTEL_IGD_LPC_CFG	(3)
 
 /* 10de vendor PCI sub-types */
-/* subtype 1 was VFIO_REGION_SUBTYPE_NVIDIA_NVLINK2_RAM, don't use */
+/*
+ * NVIDIA GPU NVlink2 RAM is coherent RAM mapped onto the host address space.
+ *
+ * Deprecated, region no longer provided
+ */
+#define VFIO_REGION_SUBTYPE_NVIDIA_NVLINK2_RAM	(1)
 
 /* 1014 vendor PCI sub-types */
-/* subtype 1 was VFIO_REGION_SUBTYPE_IBM_NVLINK2_ATSD, don't use */
+/*
+ * IBM NPU NVlink2 ATSD (Address Translation Shootdown) register of NPU
+ * to do TLB invalidation on a GPU.
+ *
+ * Deprecated, region no longer provided
+ */
+#define VFIO_REGION_SUBTYPE_IBM_NVLINK2_ATSD	(1)
 
 /* sub-types for VFIO_REGION_TYPE_GFX */
 #define VFIO_REGION_SUBTYPE_GFX_EDID            (1)
@@ -630,9 +641,36 @@ struct vfio_device_migration_info {
  */
 #define VFIO_REGION_INFO_CAP_MSIX_MAPPABLE	3
 
-/* subtype 4 was VFIO_REGION_INFO_CAP_NVLINK2_SSATGT, don't use */
+/*
+ * Capability with compressed real address (aka SSA - small system address)
+ * where GPU RAM is mapped on a system bus. Used by a GPU for DMA routing
+ * and by the userspace to associate a NVLink bridge with a GPU.
+ *
+ * Deprecated, capability no longer provided
+ */
+#define VFIO_REGION_INFO_CAP_NVLINK2_SSATGT	4
+
+struct vfio_region_info_cap_nvlink2_ssatgt {
+	struct vfio_info_cap_header header;
+	__u64 tgt;
+};
 
-/* subtype 5 was VFIO_REGION_INFO_CAP_NVLINK2_LNKSPD, don't use */
+/*
+ * Capability with an NVLink link speed. The value is read by
+ * the NVlink2 bridge driver from the bridge's "ibm,nvlink-speed"
+ * property in the device tree. The value is fixed in the hardware
+ * and failing to provide the correct value results in the link
+ * not working with no indication from the driver why.
+ *
+ * Deprecated, capability no longer provided
+ */
+#define VFIO_REGION_INFO_CAP_NVLINK2_LNKSPD	5
+
+struct vfio_region_info_cap_nvlink2_lnkspd {
+	struct vfio_info_cap_header header;
+	__u32 link_speed;
+	__u32 __pad;
+};
 
 /**
  * VFIO_DEVICE_GET_IRQ_INFO - _IOWR(VFIO_TYPE, VFIO_BASE + 9,
-- 
cgit v1.2.3-71-gd317


From 7677f7fd8be76659cd2d0db8ff4093bbb51c20e5 Mon Sep 17 00:00:00 2001
From: Axel Rasmussen <axelrasmussen@google.com>
Date: Tue, 4 May 2021 18:35:36 -0700
Subject: userfaultfd: add minor fault registration mode

Patch series "userfaultfd: add minor fault handling", v9.

Overview
========

This series adds a new userfaultfd feature, UFFD_FEATURE_MINOR_HUGETLBFS.
When enabled (via the UFFDIO_API ioctl), this feature means that any
hugetlbfs VMAs registered with UFFDIO_REGISTER_MODE_MISSING will *also*
get events for "minor" faults.  By "minor" fault, I mean the following
situation:

Let there exist two mappings (i.e., VMAs) to the same page(s) (shared
memory).  One of the mappings is registered with userfaultfd (in minor
mode), and the other is not.  Via the non-UFFD mapping, the underlying
pages have already been allocated & filled with some contents.  The UFFD
mapping has not yet been faulted in; when it is touched for the first
time, this results in what I'm calling a "minor" fault.  As a concrete
example, when working with hugetlbfs, we have huge_pte_none(), but
find_lock_page() finds an existing page.

We also add a new ioctl to resolve such faults: UFFDIO_CONTINUE.  The idea
is, userspace resolves the fault by either a) doing nothing if the
contents are already correct, or b) updating the underlying contents using
the second, non-UFFD mapping (via memcpy/memset or similar, or something
fancier like RDMA, or etc...).  In either case, userspace issues
UFFDIO_CONTINUE to tell the kernel "I have ensured the page contents are
correct, carry on setting up the mapping".

Use Case
========

Consider the use case of VM live migration (e.g. under QEMU/KVM):

1. While a VM is still running, we copy the contents of its memory to a
   target machine. The pages are populated on the target by writing to the
   non-UFFD mapping, using the setup described above. The VM is still running
   (and therefore its memory is likely changing), so this may be repeated
   several times, until we decide the target is "up to date enough".

2. We pause the VM on the source, and start executing on the target machine.
   During this gap, the VM's user(s) will *see* a pause, so it is desirable to
   minimize this window.

3. Between the last time any page was copied from the source to the target, and
   when the VM was paused, the contents of that page may have changed - and
   therefore the copy we have on the target machine is out of date. Although we
   can keep track of which pages are out of date, for VMs with large amounts of
   memory, it is "slow" to transfer this information to the target machine. We
   want to resume execution before such a transfer would complete.

4. So, the guest begins executing on the target machine. The first time it
   touches its memory (via the UFFD-registered mapping), userspace wants to
   intercept this fault. Userspace checks whether or not the page is up to date,
   and if not, copies the updated page from the source machine, via the non-UFFD
   mapping. Finally, whether a copy was performed or not, userspace issues a
   UFFDIO_CONTINUE ioctl to tell the kernel "I have ensured the page contents
   are correct, carry on setting up the mapping".

We don't have to do all of the final updates on-demand. The userfaultfd manager
can, in the background, also copy over updated pages once it receives the map of
which pages are up-to-date or not.

Interaction with Existing APIs
==============================

Because this is a feature, a registered VMA could potentially receive both
missing and minor faults.  I spent some time thinking through how the
existing API interacts with the new feature:

UFFDIO_CONTINUE cannot be used to resolve non-minor faults, as it does not
allocate a new page.  If UFFDIO_CONTINUE is used on a non-minor fault:

- For non-shared memory or shmem, -EINVAL is returned.
- For hugetlb, -EFAULT is returned.

UFFDIO_COPY and UFFDIO_ZEROPAGE cannot be used to resolve minor faults.
Without modifications, the existing codepath assumes a new page needs to
be allocated.  This is okay, since userspace must have a second
non-UFFD-registered mapping anyway, thus there isn't much reason to want
to use these in any case (just memcpy or memset or similar).

- If UFFDIO_COPY is used on a minor fault, -EEXIST is returned.
- If UFFDIO_ZEROPAGE is used on a minor fault, -EEXIST is returned (or -EINVAL
  in the case of hugetlb, as UFFDIO_ZEROPAGE is unsupported in any case).
- UFFDIO_WRITEPROTECT simply doesn't work with shared memory, and returns
  -ENOENT in that case (regardless of the kind of fault).

Future Work
===========

This series only supports hugetlbfs.  I have a second series in flight to
support shmem as well, extending the functionality.  This series is more
mature than the shmem support at this point, and the functionality works
fully on hugetlbfs, so this series can be merged first and then shmem
support will follow.

This patch (of 6):

This feature allows userspace to intercept "minor" faults.  By "minor"
faults, I mean the following situation:

Let there exist two mappings (i.e., VMAs) to the same page(s).  One of the
mappings is registered with userfaultfd (in minor mode), and the other is
not.  Via the non-UFFD mapping, the underlying pages have already been
allocated & filled with some contents.  The UFFD mapping has not yet been
faulted in; when it is touched for the first time, this results in what
I'm calling a "minor" fault.  As a concrete example, when working with
hugetlbfs, we have huge_pte_none(), but find_lock_page() finds an existing
page.

This commit adds the new registration mode, and sets the relevant flag on
the VMAs being registered.  In the hugetlb fault path, if we find that we
have huge_pte_none(), but find_lock_page() does indeed find an existing
page, then we have a "minor" fault, and if the VMA has the userfaultfd
registration flag, we call into userfaultfd to handle it.

This is implemented as a new registration mode, instead of an API feature.
This is because the alternative implementation has significant drawbacks
[1].

However, doing it this was requires we allocate a VM_* flag for the new
registration mode.  On 32-bit systems, there are no unused bits, so this
feature is only supported on architectures with
CONFIG_ARCH_USES_HIGH_VMA_FLAGS.  When attempting to register a VMA in
MINOR mode on 32-bit architectures, we return -EINVAL.

[1] https://lore.kernel.org/patchwork/patch/1380226/

[peterx@redhat.com: fix minor fault page leak]
  Link: https://lkml.kernel.org/r/20210322175132.36659-1-peterx@redhat.com

Link: https://lkml.kernel.org/r/20210301222728.176417-1-axelrasmussen@google.com
Link: https://lkml.kernel.org/r/20210301222728.176417-2-axelrasmussen@google.com
Signed-off-by: Axel Rasmussen <axelrasmussen@google.com>
Reviewed-by: Peter Xu <peterx@redhat.com>
Reviewed-by: Mike Kravetz <mike.kravetz@oracle.com>
Cc: Alexander Viro <viro@zeniv.linux.org.uk>
Cc: Alexey Dobriyan <adobriyan@gmail.com>
Cc: Andrea Arcangeli <aarcange@redhat.com>
Cc: Anshuman Khandual <anshuman.khandual@arm.com>
Cc: Catalin Marinas <catalin.marinas@arm.com>
Cc: Chinwen Chang <chinwen.chang@mediatek.com>
Cc: Huang Ying <ying.huang@intel.com>
Cc: Ingo Molnar <mingo@redhat.com>
Cc: Jann Horn <jannh@google.com>
Cc: Jerome Glisse <jglisse@redhat.com>
Cc: Lokesh Gidra <lokeshgidra@google.com>
Cc: "Matthew Wilcox (Oracle)" <willy@infradead.org>
Cc: Michael Ellerman <mpe@ellerman.id.au>
Cc: "Michal Koutn" <mkoutny@suse.com>
Cc: Michel Lespinasse <walken@google.com>
Cc: Mike Rapoport <rppt@linux.vnet.ibm.com>
Cc: Nicholas Piggin <npiggin@gmail.com>
Cc: Peter Xu <peterx@redhat.com>
Cc: Shaohua Li <shli@fb.com>
Cc: Shawn Anastasio <shawn@anastas.io>
Cc: Steven Rostedt <rostedt@goodmis.org>
Cc: Steven Price <steven.price@arm.com>
Cc: Vlastimil Babka <vbabka@suse.cz>
Cc: Adam Ruprecht <ruprecht@google.com>
Cc: Axel Rasmussen <axelrasmussen@google.com>
Cc: Cannon Matthews <cannonmatthews@google.com>
Cc: "Dr . David Alan Gilbert" <dgilbert@redhat.com>
Cc: David Rientjes <rientjes@google.com>
Cc: Mina Almasry <almasrymina@google.com>
Cc: Oliver Upton <oupton@google.com>
Cc: Kirill A. Shutemov <kirill@shutemov.name>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 arch/arm64/Kconfig               |  1 +
 arch/x86/Kconfig                 |  1 +
 fs/proc/task_mmu.c               |  3 ++
 fs/userfaultfd.c                 | 78 +++++++++++++++++++++++----------------
 include/linux/mm.h               |  7 ++++
 include/linux/userfaultfd_k.h    | 15 +++++++-
 include/trace/events/mmflags.h   |  7 ++++
 include/uapi/linux/userfaultfd.h | 15 +++++++-
 init/Kconfig                     |  5 +++
 mm/hugetlb.c                     | 80 ++++++++++++++++++++++++++--------------
 10 files changed, 150 insertions(+), 62 deletions(-)

(limited to 'include/uapi/linux')

diff --git a/arch/arm64/Kconfig b/arch/arm64/Kconfig
index 7f2a80091337..04c69f606537 100644
--- a/arch/arm64/Kconfig
+++ b/arch/arm64/Kconfig
@@ -213,6 +213,7 @@ config ARM64
 	select SWIOTLB
 	select SYSCTL_EXCEPTION_TRACE
 	select THREAD_INFO_IN_TASK
+	select HAVE_ARCH_USERFAULTFD_MINOR if USERFAULTFD
 	help
 	  ARM 64-bit (AArch64) Linux support.
 
diff --git a/arch/x86/Kconfig b/arch/x86/Kconfig
index dac15f646f79..1c350e8782ed 100644
--- a/arch/x86/Kconfig
+++ b/arch/x86/Kconfig
@@ -165,6 +165,7 @@ config X86
 	select HAVE_ARCH_TRANSPARENT_HUGEPAGE
 	select HAVE_ARCH_TRANSPARENT_HUGEPAGE_PUD if X86_64
 	select HAVE_ARCH_USERFAULTFD_WP         if X86_64 && USERFAULTFD
+	select HAVE_ARCH_USERFAULTFD_MINOR	if X86_64 && USERFAULTFD
 	select HAVE_ARCH_VMAP_STACK		if X86_64
 	select HAVE_ARCH_RANDOMIZE_KSTACK_OFFSET
 	select HAVE_ARCH_WITHIN_STACK_FRAMES
diff --git a/fs/proc/task_mmu.c b/fs/proc/task_mmu.c
index e862cab69583..fc9784544b24 100644
--- a/fs/proc/task_mmu.c
+++ b/fs/proc/task_mmu.c
@@ -661,6 +661,9 @@ static void show_smap_vma_flags(struct seq_file *m, struct vm_area_struct *vma)
 		[ilog2(VM_PKEY_BIT4)]	= "",
 #endif
 #endif /* CONFIG_ARCH_HAS_PKEYS */
+#ifdef CONFIG_HAVE_ARCH_USERFAULTFD_MINOR
+		[ilog2(VM_UFFD_MINOR)]	= "ui",
+#endif /* CONFIG_HAVE_ARCH_USERFAULTFD_MINOR */
 	};
 	size_t i;
 
diff --git a/fs/userfaultfd.c b/fs/userfaultfd.c
index e5ce3b4e6c3d..ba35cafa8b0d 100644
--- a/fs/userfaultfd.c
+++ b/fs/userfaultfd.c
@@ -197,24 +197,21 @@ static inline struct uffd_msg userfault_msg(unsigned long address,
 	msg_init(&msg);
 	msg.event = UFFD_EVENT_PAGEFAULT;
 	msg.arg.pagefault.address = address;
+	/*
+	 * These flags indicate why the userfault occurred:
+	 * - UFFD_PAGEFAULT_FLAG_WP indicates a write protect fault.
+	 * - UFFD_PAGEFAULT_FLAG_MINOR indicates a minor fault.
+	 * - Neither of these flags being set indicates a MISSING fault.
+	 *
+	 * Separately, UFFD_PAGEFAULT_FLAG_WRITE indicates it was a write
+	 * fault. Otherwise, it was a read fault.
+	 */
 	if (flags & FAULT_FLAG_WRITE)
-		/*
-		 * If UFFD_FEATURE_PAGEFAULT_FLAG_WP was set in the
-		 * uffdio_api.features and UFFD_PAGEFAULT_FLAG_WRITE
-		 * was not set in a UFFD_EVENT_PAGEFAULT, it means it
-		 * was a read fault, otherwise if set it means it's
-		 * a write fault.
-		 */
 		msg.arg.pagefault.flags |= UFFD_PAGEFAULT_FLAG_WRITE;
 	if (reason & VM_UFFD_WP)
-		/*
-		 * If UFFD_FEATURE_PAGEFAULT_FLAG_WP was set in the
-		 * uffdio_api.features and UFFD_PAGEFAULT_FLAG_WP was
-		 * not set in a UFFD_EVENT_PAGEFAULT, it means it was
-		 * a missing fault, otherwise if set it means it's a
-		 * write protect fault.
-		 */
 		msg.arg.pagefault.flags |= UFFD_PAGEFAULT_FLAG_WP;
+	if (reason & VM_UFFD_MINOR)
+		msg.arg.pagefault.flags |= UFFD_PAGEFAULT_FLAG_MINOR;
 	if (features & UFFD_FEATURE_THREAD_ID)
 		msg.arg.pagefault.feat.ptid = task_pid_vnr(current);
 	return msg;
@@ -401,8 +398,10 @@ vm_fault_t handle_userfault(struct vm_fault *vmf, unsigned long reason)
 
 	BUG_ON(ctx->mm != mm);
 
-	VM_BUG_ON(reason & ~(VM_UFFD_MISSING|VM_UFFD_WP));
-	VM_BUG_ON(!(reason & VM_UFFD_MISSING) ^ !!(reason & VM_UFFD_WP));
+	/* Any unrecognized flag is a bug. */
+	VM_BUG_ON(reason & ~__VM_UFFD_FLAGS);
+	/* 0 or > 1 flags set is a bug; we expect exactly 1. */
+	VM_BUG_ON(!reason || (reason & (reason - 1)));
 
 	if (ctx->features & UFFD_FEATURE_SIGBUS)
 		goto out;
@@ -612,7 +611,7 @@ static void userfaultfd_event_wait_completion(struct userfaultfd_ctx *ctx,
 		for (vma = mm->mmap; vma; vma = vma->vm_next)
 			if (vma->vm_userfaultfd_ctx.ctx == release_new_ctx) {
 				vma->vm_userfaultfd_ctx = NULL_VM_UFFD_CTX;
-				vma->vm_flags &= ~(VM_UFFD_WP | VM_UFFD_MISSING);
+				vma->vm_flags &= ~__VM_UFFD_FLAGS;
 			}
 		mmap_write_unlock(mm);
 
@@ -644,7 +643,7 @@ int dup_userfaultfd(struct vm_area_struct *vma, struct list_head *fcs)
 	octx = vma->vm_userfaultfd_ctx.ctx;
 	if (!octx || !(octx->features & UFFD_FEATURE_EVENT_FORK)) {
 		vma->vm_userfaultfd_ctx = NULL_VM_UFFD_CTX;
-		vma->vm_flags &= ~(VM_UFFD_WP | VM_UFFD_MISSING);
+		vma->vm_flags &= ~__VM_UFFD_FLAGS;
 		return 0;
 	}
 
@@ -726,7 +725,7 @@ void mremap_userfaultfd_prep(struct vm_area_struct *vma,
 	} else {
 		/* Drop uffd context if remap feature not enabled */
 		vma->vm_userfaultfd_ctx = NULL_VM_UFFD_CTX;
-		vma->vm_flags &= ~(VM_UFFD_WP | VM_UFFD_MISSING);
+		vma->vm_flags &= ~__VM_UFFD_FLAGS;
 	}
 }
 
@@ -867,12 +866,12 @@ static int userfaultfd_release(struct inode *inode, struct file *file)
 	for (vma = mm->mmap; vma; vma = vma->vm_next) {
 		cond_resched();
 		BUG_ON(!!vma->vm_userfaultfd_ctx.ctx ^
-		       !!(vma->vm_flags & (VM_UFFD_MISSING | VM_UFFD_WP)));
+		       !!(vma->vm_flags & __VM_UFFD_FLAGS));
 		if (vma->vm_userfaultfd_ctx.ctx != ctx) {
 			prev = vma;
 			continue;
 		}
-		new_flags = vma->vm_flags & ~(VM_UFFD_MISSING | VM_UFFD_WP);
+		new_flags = vma->vm_flags & ~__VM_UFFD_FLAGS;
 		prev = vma_merge(mm, prev, vma->vm_start, vma->vm_end,
 				 new_flags, vma->anon_vma,
 				 vma->vm_file, vma->vm_pgoff,
@@ -1262,9 +1261,19 @@ static inline bool vma_can_userfault(struct vm_area_struct *vma,
 				     unsigned long vm_flags)
 {
 	/* FIXME: add WP support to hugetlbfs and shmem */
-	return vma_is_anonymous(vma) ||
-		((is_vm_hugetlb_page(vma) || vma_is_shmem(vma)) &&
-		 !(vm_flags & VM_UFFD_WP));
+	if (vm_flags & VM_UFFD_WP) {
+		if (is_vm_hugetlb_page(vma) || vma_is_shmem(vma))
+			return false;
+	}
+
+	if (vm_flags & VM_UFFD_MINOR) {
+		/* FIXME: Add minor fault interception for shmem. */
+		if (!is_vm_hugetlb_page(vma))
+			return false;
+	}
+
+	return vma_is_anonymous(vma) || is_vm_hugetlb_page(vma) ||
+	       vma_is_shmem(vma);
 }
 
 static int userfaultfd_register(struct userfaultfd_ctx *ctx,
@@ -1290,14 +1299,19 @@ static int userfaultfd_register(struct userfaultfd_ctx *ctx,
 	ret = -EINVAL;
 	if (!uffdio_register.mode)
 		goto out;
-	if (uffdio_register.mode & ~(UFFDIO_REGISTER_MODE_MISSING|
-				     UFFDIO_REGISTER_MODE_WP))
+	if (uffdio_register.mode & ~UFFD_API_REGISTER_MODES)
 		goto out;
 	vm_flags = 0;
 	if (uffdio_register.mode & UFFDIO_REGISTER_MODE_MISSING)
 		vm_flags |= VM_UFFD_MISSING;
 	if (uffdio_register.mode & UFFDIO_REGISTER_MODE_WP)
 		vm_flags |= VM_UFFD_WP;
+	if (uffdio_register.mode & UFFDIO_REGISTER_MODE_MINOR) {
+#ifndef CONFIG_HAVE_ARCH_USERFAULTFD_MINOR
+		goto out;
+#endif
+		vm_flags |= VM_UFFD_MINOR;
+	}
 
 	ret = validate_range(mm, &uffdio_register.range.start,
 			     uffdio_register.range.len);
@@ -1341,7 +1355,7 @@ static int userfaultfd_register(struct userfaultfd_ctx *ctx,
 		cond_resched();
 
 		BUG_ON(!!cur->vm_userfaultfd_ctx.ctx ^
-		       !!(cur->vm_flags & (VM_UFFD_MISSING | VM_UFFD_WP)));
+		       !!(cur->vm_flags & __VM_UFFD_FLAGS));
 
 		/* check not compatible vmas */
 		ret = -EINVAL;
@@ -1421,8 +1435,7 @@ static int userfaultfd_register(struct userfaultfd_ctx *ctx,
 			start = vma->vm_start;
 		vma_end = min(end, vma->vm_end);
 
-		new_flags = (vma->vm_flags &
-			     ~(VM_UFFD_MISSING|VM_UFFD_WP)) | vm_flags;
+		new_flags = (vma->vm_flags & ~__VM_UFFD_FLAGS) | vm_flags;
 		prev = vma_merge(mm, prev, start, vma_end, new_flags,
 				 vma->anon_vma, vma->vm_file, vma->vm_pgoff,
 				 vma_policy(vma),
@@ -1544,7 +1557,7 @@ static int userfaultfd_unregister(struct userfaultfd_ctx *ctx,
 		cond_resched();
 
 		BUG_ON(!!cur->vm_userfaultfd_ctx.ctx ^
-		       !!(cur->vm_flags & (VM_UFFD_MISSING | VM_UFFD_WP)));
+		       !!(cur->vm_flags & __VM_UFFD_FLAGS));
 
 		/*
 		 * Check not compatible vmas, not strictly required
@@ -1595,7 +1608,7 @@ static int userfaultfd_unregister(struct userfaultfd_ctx *ctx,
 			wake_userfault(vma->vm_userfaultfd_ctx.ctx, &range);
 		}
 
-		new_flags = vma->vm_flags & ~(VM_UFFD_MISSING | VM_UFFD_WP);
+		new_flags = vma->vm_flags & ~__VM_UFFD_FLAGS;
 		prev = vma_merge(mm, prev, start, vma_end, new_flags,
 				 vma->anon_vma, vma->vm_file, vma->vm_pgoff,
 				 vma_policy(vma),
@@ -1863,6 +1876,9 @@ static int userfaultfd_api(struct userfaultfd_ctx *ctx,
 		goto err_out;
 	/* report all available features and ioctls to userland */
 	uffdio_api.features = UFFD_API_FEATURES;
+#ifndef CONFIG_HAVE_ARCH_USERFAULTFD_MINOR
+	uffdio_api.features &= ~UFFD_FEATURE_MINOR_HUGETLBFS;
+#endif
 	uffdio_api.ioctls = UFFD_API_IOCTLS;
 	ret = -EFAULT;
 	if (copy_to_user(buf, &uffdio_api, sizeof(uffdio_api)))
diff --git a/include/linux/mm.h b/include/linux/mm.h
index 011f43605807..1dbb53c44243 100644
--- a/include/linux/mm.h
+++ b/include/linux/mm.h
@@ -372,6 +372,13 @@ extern unsigned int kobjsize(const void *objp);
 # define VM_GROWSUP	VM_NONE
 #endif
 
+#ifdef CONFIG_HAVE_ARCH_USERFAULTFD_MINOR
+# define VM_UFFD_MINOR_BIT	37
+# define VM_UFFD_MINOR		BIT(VM_UFFD_MINOR_BIT)	/* UFFD minor faults */
+#else /* !CONFIG_HAVE_ARCH_USERFAULTFD_MINOR */
+# define VM_UFFD_MINOR		VM_NONE
+#endif /* CONFIG_HAVE_ARCH_USERFAULTFD_MINOR */
+
 /* Bits set in the VMA until the stack is in its final location */
 #define VM_STACK_INCOMPLETE_SETUP	(VM_RAND_READ | VM_SEQ_READ)
 
diff --git a/include/linux/userfaultfd_k.h b/include/linux/userfaultfd_k.h
index c63ccdae3eab..0390e5ac63b3 100644
--- a/include/linux/userfaultfd_k.h
+++ b/include/linux/userfaultfd_k.h
@@ -17,6 +17,9 @@
 #include <linux/mm.h>
 #include <asm-generic/pgtable_uffd.h>
 
+/* The set of all possible UFFD-related VM flags. */
+#define __VM_UFFD_FLAGS (VM_UFFD_MISSING | VM_UFFD_WP | VM_UFFD_MINOR)
+
 /*
  * CAREFUL: Check include/uapi/asm-generic/fcntl.h when defining
  * new flags, since they might collide with O_* ones. We want
@@ -71,6 +74,11 @@ static inline bool userfaultfd_wp(struct vm_area_struct *vma)
 	return vma->vm_flags & VM_UFFD_WP;
 }
 
+static inline bool userfaultfd_minor(struct vm_area_struct *vma)
+{
+	return vma->vm_flags & VM_UFFD_MINOR;
+}
+
 static inline bool userfaultfd_pte_wp(struct vm_area_struct *vma,
 				      pte_t pte)
 {
@@ -85,7 +93,7 @@ static inline bool userfaultfd_huge_pmd_wp(struct vm_area_struct *vma,
 
 static inline bool userfaultfd_armed(struct vm_area_struct *vma)
 {
-	return vma->vm_flags & (VM_UFFD_MISSING | VM_UFFD_WP);
+	return vma->vm_flags & __VM_UFFD_FLAGS;
 }
 
 extern int dup_userfaultfd(struct vm_area_struct *, struct list_head *);
@@ -132,6 +140,11 @@ static inline bool userfaultfd_wp(struct vm_area_struct *vma)
 	return false;
 }
 
+static inline bool userfaultfd_minor(struct vm_area_struct *vma)
+{
+	return false;
+}
+
 static inline bool userfaultfd_pte_wp(struct vm_area_struct *vma,
 				      pte_t pte)
 {
diff --git a/include/trace/events/mmflags.h b/include/trace/events/mmflags.h
index 67018d367b9f..629c7a0eaff2 100644
--- a/include/trace/events/mmflags.h
+++ b/include/trace/events/mmflags.h
@@ -137,6 +137,12 @@ IF_HAVE_PG_ARCH_2(PG_arch_2,		"arch_2"	)
 #define IF_HAVE_VM_SOFTDIRTY(flag,name)
 #endif
 
+#ifdef CONFIG_HAVE_ARCH_USERFAULTFD_MINOR
+# define IF_HAVE_UFFD_MINOR(flag, name) {flag, name},
+#else
+# define IF_HAVE_UFFD_MINOR(flag, name)
+#endif
+
 #define __def_vmaflag_names						\
 	{VM_READ,			"read"		},		\
 	{VM_WRITE,			"write"		},		\
@@ -148,6 +154,7 @@ IF_HAVE_PG_ARCH_2(PG_arch_2,		"arch_2"	)
 	{VM_MAYSHARE,			"mayshare"	},		\
 	{VM_GROWSDOWN,			"growsdown"	},		\
 	{VM_UFFD_MISSING,		"uffd_missing"	},		\
+IF_HAVE_UFFD_MINOR(VM_UFFD_MINOR,	"uffd_minor"	)		\
 	{VM_PFNMAP,			"pfnmap"	},		\
 	{VM_DENYWRITE,			"denywrite"	},		\
 	{VM_UFFD_WP,			"uffd_wp"	},		\
diff --git a/include/uapi/linux/userfaultfd.h b/include/uapi/linux/userfaultfd.h
index 5f2d88212f7c..f24dd4fcbad9 100644
--- a/include/uapi/linux/userfaultfd.h
+++ b/include/uapi/linux/userfaultfd.h
@@ -19,15 +19,19 @@
  * means the userland is reading).
  */
 #define UFFD_API ((__u64)0xAA)
+#define UFFD_API_REGISTER_MODES (UFFDIO_REGISTER_MODE_MISSING |	\
+				 UFFDIO_REGISTER_MODE_WP |	\
+				 UFFDIO_REGISTER_MODE_MINOR)
 #define UFFD_API_FEATURES (UFFD_FEATURE_PAGEFAULT_FLAG_WP |	\
 			   UFFD_FEATURE_EVENT_FORK |		\
 			   UFFD_FEATURE_EVENT_REMAP |		\
-			   UFFD_FEATURE_EVENT_REMOVE |	\
+			   UFFD_FEATURE_EVENT_REMOVE |		\
 			   UFFD_FEATURE_EVENT_UNMAP |		\
 			   UFFD_FEATURE_MISSING_HUGETLBFS |	\
 			   UFFD_FEATURE_MISSING_SHMEM |		\
 			   UFFD_FEATURE_SIGBUS |		\
-			   UFFD_FEATURE_THREAD_ID)
+			   UFFD_FEATURE_THREAD_ID |		\
+			   UFFD_FEATURE_MINOR_HUGETLBFS)
 #define UFFD_API_IOCTLS				\
 	((__u64)1 << _UFFDIO_REGISTER |		\
 	 (__u64)1 << _UFFDIO_UNREGISTER |	\
@@ -127,6 +131,7 @@ struct uffd_msg {
 /* flags for UFFD_EVENT_PAGEFAULT */
 #define UFFD_PAGEFAULT_FLAG_WRITE	(1<<0)	/* If this was a write fault */
 #define UFFD_PAGEFAULT_FLAG_WP		(1<<1)	/* If reason is VM_UFFD_WP */
+#define UFFD_PAGEFAULT_FLAG_MINOR	(1<<2)	/* If reason is VM_UFFD_MINOR */
 
 struct uffdio_api {
 	/* userland asks for an API number and the features to enable */
@@ -171,6 +176,10 @@ struct uffdio_api {
 	 *
 	 * UFFD_FEATURE_THREAD_ID pid of the page faulted task_struct will
 	 * be returned, if feature is not requested 0 will be returned.
+	 *
+	 * UFFD_FEATURE_MINOR_HUGETLBFS indicates that minor faults
+	 * can be intercepted (via REGISTER_MODE_MINOR) for
+	 * hugetlbfs-backed pages.
 	 */
 #define UFFD_FEATURE_PAGEFAULT_FLAG_WP		(1<<0)
 #define UFFD_FEATURE_EVENT_FORK			(1<<1)
@@ -181,6 +190,7 @@ struct uffdio_api {
 #define UFFD_FEATURE_EVENT_UNMAP		(1<<6)
 #define UFFD_FEATURE_SIGBUS			(1<<7)
 #define UFFD_FEATURE_THREAD_ID			(1<<8)
+#define UFFD_FEATURE_MINOR_HUGETLBFS		(1<<9)
 	__u64 features;
 
 	__u64 ioctls;
@@ -195,6 +205,7 @@ struct uffdio_register {
 	struct uffdio_range range;
 #define UFFDIO_REGISTER_MODE_MISSING	((__u64)1<<0)
 #define UFFDIO_REGISTER_MODE_WP		((__u64)1<<1)
+#define UFFDIO_REGISTER_MODE_MINOR	((__u64)1<<2)
 	__u64 mode;
 
 	/*
diff --git a/init/Kconfig b/init/Kconfig
index 9acb7762e971..1413413fcb9f 100644
--- a/init/Kconfig
+++ b/init/Kconfig
@@ -1644,6 +1644,11 @@ config HAVE_ARCH_USERFAULTFD_WP
 	help
 	  Arch has userfaultfd write protection support
 
+config HAVE_ARCH_USERFAULTFD_MINOR
+	bool
+	help
+	  Arch has userfaultfd minor fault support
+
 config MEMBARRIER
 	bool "Enable membarrier() system call" if EXPERT
 	default y
diff --git a/mm/hugetlb.c b/mm/hugetlb.c
index b5977d9709ad..84530876b2ae 100644
--- a/mm/hugetlb.c
+++ b/mm/hugetlb.c
@@ -4469,6 +4469,44 @@ int huge_add_to_page_cache(struct page *page, struct address_space *mapping,
 	return 0;
 }
 
+static inline vm_fault_t hugetlb_handle_userfault(struct vm_area_struct *vma,
+						  struct address_space *mapping,
+						  pgoff_t idx,
+						  unsigned int flags,
+						  unsigned long haddr,
+						  unsigned long reason)
+{
+	vm_fault_t ret;
+	u32 hash;
+	struct vm_fault vmf = {
+		.vma = vma,
+		.address = haddr,
+		.flags = flags,
+
+		/*
+		 * Hard to debug if it ends up being
+		 * used by a callee that assumes
+		 * something about the other
+		 * uninitialized fields... same as in
+		 * memory.c
+		 */
+	};
+
+	/*
+	 * hugetlb_fault_mutex and i_mmap_rwsem must be
+	 * dropped before handling userfault.  Reacquire
+	 * after handling fault to make calling code simpler.
+	 */
+	hash = hugetlb_fault_mutex_hash(mapping, idx);
+	mutex_unlock(&hugetlb_fault_mutex_table[hash]);
+	i_mmap_unlock_read(mapping);
+	ret = handle_userfault(&vmf, reason);
+	i_mmap_lock_read(mapping);
+	mutex_lock(&hugetlb_fault_mutex_table[hash]);
+
+	return ret;
+}
+
 static vm_fault_t hugetlb_no_page(struct mm_struct *mm,
 			struct vm_area_struct *vma,
 			struct address_space *mapping, pgoff_t idx,
@@ -4507,35 +4545,11 @@ static vm_fault_t hugetlb_no_page(struct mm_struct *mm,
 retry:
 	page = find_lock_page(mapping, idx);
 	if (!page) {
-		/*
-		 * Check for page in userfault range
-		 */
+		/* Check for page in userfault range */
 		if (userfaultfd_missing(vma)) {
-			u32 hash;
-			struct vm_fault vmf = {
-				.vma = vma,
-				.address = haddr,
-				.flags = flags,
-				/*
-				 * Hard to debug if it ends up being
-				 * used by a callee that assumes
-				 * something about the other
-				 * uninitialized fields... same as in
-				 * memory.c
-				 */
-			};
-
-			/*
-			 * hugetlb_fault_mutex and i_mmap_rwsem must be
-			 * dropped before handling userfault.  Reacquire
-			 * after handling fault to make calling code simpler.
-			 */
-			hash = hugetlb_fault_mutex_hash(mapping, idx);
-			mutex_unlock(&hugetlb_fault_mutex_table[hash]);
-			i_mmap_unlock_read(mapping);
-			ret = handle_userfault(&vmf, VM_UFFD_MISSING);
-			i_mmap_lock_read(mapping);
-			mutex_lock(&hugetlb_fault_mutex_table[hash]);
+			ret = hugetlb_handle_userfault(vma, mapping, idx,
+						       flags, haddr,
+						       VM_UFFD_MISSING);
 			goto out;
 		}
 
@@ -4591,6 +4605,16 @@ retry:
 				VM_FAULT_SET_HINDEX(hstate_index(h));
 			goto backout_unlocked;
 		}
+
+		/* Check for page in userfault range. */
+		if (userfaultfd_minor(vma)) {
+			unlock_page(page);
+			put_page(page);
+			ret = hugetlb_handle_userfault(vma, mapping, idx,
+						       flags, haddr,
+						       VM_UFFD_MINOR);
+			goto out;
+		}
 	}
 
 	/*
-- 
cgit v1.2.3-71-gd317


From f619147104c8ea71e120e4936d2b68ec11a1e527 Mon Sep 17 00:00:00 2001
From: Axel Rasmussen <axelrasmussen@google.com>
Date: Tue, 4 May 2021 18:35:49 -0700
Subject: userfaultfd: add UFFDIO_CONTINUE ioctl

This ioctl is how userspace ought to resolve "minor" userfaults.  The
idea is, userspace is notified that a minor fault has occurred.  It
might change the contents of the page using its second non-UFFD mapping,
or not.  Then, it calls UFFDIO_CONTINUE to tell the kernel "I have
ensured the page contents are correct, carry on setting up the mapping".

Note that it doesn't make much sense to use UFFDIO_{COPY,ZEROPAGE} for
MINOR registered VMAs.  ZEROPAGE maps the VMA to the zero page; but in
the minor fault case, we already have some pre-existing underlying page.
Likewise, UFFDIO_COPY isn't useful if we have a second non-UFFD mapping.
We'd just use memcpy() or similar instead.

It turns out hugetlb_mcopy_atomic_pte() already does very close to what
we want, if an existing page is provided via `struct page **pagep`.  We
already special-case the behavior a bit for the UFFDIO_ZEROPAGE case, so
just extend that design: add an enum for the three modes of operation,
and make the small adjustments needed for the MCOPY_ATOMIC_CONTINUE
case.  (Basically, look up the existing page, and avoid adding the
existing page to the page cache or calling set_page_huge_active() on
it.)

Link: https://lkml.kernel.org/r/20210301222728.176417-5-axelrasmussen@google.com
Signed-off-by: Axel Rasmussen <axelrasmussen@google.com>
Reviewed-by: Peter Xu <peterx@redhat.com>
Cc: Adam Ruprecht <ruprecht@google.com>
Cc: Alexander Viro <viro@zeniv.linux.org.uk>
Cc: Alexey Dobriyan <adobriyan@gmail.com>
Cc: Andrea Arcangeli <aarcange@redhat.com>
Cc: Anshuman Khandual <anshuman.khandual@arm.com>
Cc: Cannon Matthews <cannonmatthews@google.com>
Cc: Catalin Marinas <catalin.marinas@arm.com>
Cc: Chinwen Chang <chinwen.chang@mediatek.com>
Cc: David Rientjes <rientjes@google.com>
Cc: "Dr . David Alan Gilbert" <dgilbert@redhat.com>
Cc: Huang Ying <ying.huang@intel.com>
Cc: Ingo Molnar <mingo@redhat.com>
Cc: Jann Horn <jannh@google.com>
Cc: Jerome Glisse <jglisse@redhat.com>
Cc: Kirill A. Shutemov <kirill@shutemov.name>
Cc: Lokesh Gidra <lokeshgidra@google.com>
Cc: "Matthew Wilcox (Oracle)" <willy@infradead.org>
Cc: Michael Ellerman <mpe@ellerman.id.au>
Cc: "Michal Koutn" <mkoutny@suse.com>
Cc: Michel Lespinasse <walken@google.com>
Cc: Mike Kravetz <mike.kravetz@oracle.com>
Cc: Mike Rapoport <rppt@linux.vnet.ibm.com>
Cc: Mina Almasry <almasrymina@google.com>
Cc: Nicholas Piggin <npiggin@gmail.com>
Cc: Oliver Upton <oupton@google.com>
Cc: Shaohua Li <shli@fb.com>
Cc: Shawn Anastasio <shawn@anastas.io>
Cc: Steven Price <steven.price@arm.com>
Cc: Steven Rostedt <rostedt@goodmis.org>
Cc: Vlastimil Babka <vbabka@suse.cz>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 fs/userfaultfd.c                 | 67 ++++++++++++++++++++++++++++++++++++++++
 include/linux/hugetlb.h          |  3 ++
 include/linux/userfaultfd_k.h    | 18 +++++++++++
 include/uapi/linux/userfaultfd.h | 21 +++++++++++--
 mm/hugetlb.c                     | 40 +++++++++++++++---------
 mm/userfaultfd.c                 | 37 +++++++++++++---------
 6 files changed, 156 insertions(+), 30 deletions(-)

(limited to 'include/uapi/linux')

diff --git a/fs/userfaultfd.c b/fs/userfaultfd.c
index ba35cafa8b0d..14f92285d04f 100644
--- a/fs/userfaultfd.c
+++ b/fs/userfaultfd.c
@@ -1487,6 +1487,10 @@ out_unlock:
 		if (!(uffdio_register.mode & UFFDIO_REGISTER_MODE_WP))
 			ioctls_out &= ~((__u64)1 << _UFFDIO_WRITEPROTECT);
 
+		/* CONTINUE ioctl is only supported for MINOR ranges. */
+		if (!(uffdio_register.mode & UFFDIO_REGISTER_MODE_MINOR))
+			ioctls_out &= ~((__u64)1 << _UFFDIO_CONTINUE);
+
 		/*
 		 * Now that we scanned all vmas we can already tell
 		 * userland which ioctls methods are guaranteed to
@@ -1840,6 +1844,66 @@ static int userfaultfd_writeprotect(struct userfaultfd_ctx *ctx,
 	return ret;
 }
 
+static int userfaultfd_continue(struct userfaultfd_ctx *ctx, unsigned long arg)
+{
+	__s64 ret;
+	struct uffdio_continue uffdio_continue;
+	struct uffdio_continue __user *user_uffdio_continue;
+	struct userfaultfd_wake_range range;
+
+	user_uffdio_continue = (struct uffdio_continue __user *)arg;
+
+	ret = -EAGAIN;
+	if (READ_ONCE(ctx->mmap_changing))
+		goto out;
+
+	ret = -EFAULT;
+	if (copy_from_user(&uffdio_continue, user_uffdio_continue,
+			   /* don't copy the output fields */
+			   sizeof(uffdio_continue) - (sizeof(__s64))))
+		goto out;
+
+	ret = validate_range(ctx->mm, &uffdio_continue.range.start,
+			     uffdio_continue.range.len);
+	if (ret)
+		goto out;
+
+	ret = -EINVAL;
+	/* double check for wraparound just in case. */
+	if (uffdio_continue.range.start + uffdio_continue.range.len <=
+	    uffdio_continue.range.start) {
+		goto out;
+	}
+	if (uffdio_continue.mode & ~UFFDIO_CONTINUE_MODE_DONTWAKE)
+		goto out;
+
+	if (mmget_not_zero(ctx->mm)) {
+		ret = mcopy_continue(ctx->mm, uffdio_continue.range.start,
+				     uffdio_continue.range.len,
+				     &ctx->mmap_changing);
+		mmput(ctx->mm);
+	} else {
+		return -ESRCH;
+	}
+
+	if (unlikely(put_user(ret, &user_uffdio_continue->mapped)))
+		return -EFAULT;
+	if (ret < 0)
+		goto out;
+
+	/* len == 0 would wake all */
+	BUG_ON(!ret);
+	range.len = ret;
+	if (!(uffdio_continue.mode & UFFDIO_CONTINUE_MODE_DONTWAKE)) {
+		range.start = uffdio_continue.range.start;
+		wake_userfault(ctx, &range);
+	}
+	ret = range.len == uffdio_continue.range.len ? 0 : -EAGAIN;
+
+out:
+	return ret;
+}
+
 static inline unsigned int uffd_ctx_features(__u64 user_features)
 {
 	/*
@@ -1927,6 +1991,9 @@ static long userfaultfd_ioctl(struct file *file, unsigned cmd,
 	case UFFDIO_WRITEPROTECT:
 		ret = userfaultfd_writeprotect(ctx, arg);
 		break;
+	case UFFDIO_CONTINUE:
+		ret = userfaultfd_continue(ctx, arg);
+		break;
 	}
 	return ret;
 }
diff --git a/include/linux/hugetlb.h b/include/linux/hugetlb.h
index a1dbe4568707..b92f25ccef58 100644
--- a/include/linux/hugetlb.h
+++ b/include/linux/hugetlb.h
@@ -11,6 +11,7 @@
 #include <linux/kref.h>
 #include <linux/pgtable.h>
 #include <linux/gfp.h>
+#include <linux/userfaultfd_k.h>
 
 struct ctl_table;
 struct user_struct;
@@ -139,6 +140,7 @@ int hugetlb_mcopy_atomic_pte(struct mm_struct *dst_mm, pte_t *dst_pte,
 				struct vm_area_struct *dst_vma,
 				unsigned long dst_addr,
 				unsigned long src_addr,
+				enum mcopy_atomic_mode mode,
 				struct page **pagep);
 #endif /* CONFIG_USERFAULTFD */
 bool hugetlb_reserve_pages(struct inode *inode, long from, long to,
@@ -318,6 +320,7 @@ static inline int hugetlb_mcopy_atomic_pte(struct mm_struct *dst_mm,
 						struct vm_area_struct *dst_vma,
 						unsigned long dst_addr,
 						unsigned long src_addr,
+						enum mcopy_atomic_mode mode,
 						struct page **pagep)
 {
 	BUG();
diff --git a/include/linux/userfaultfd_k.h b/include/linux/userfaultfd_k.h
index e060d5f77cc5..794d1538b8ba 100644
--- a/include/linux/userfaultfd_k.h
+++ b/include/linux/userfaultfd_k.h
@@ -37,6 +37,22 @@ extern int sysctl_unprivileged_userfaultfd;
 
 extern vm_fault_t handle_userfault(struct vm_fault *vmf, unsigned long reason);
 
+/*
+ * The mode of operation for __mcopy_atomic and its helpers.
+ *
+ * This is almost an implementation detail (mcopy_atomic below doesn't take this
+ * as a parameter), but it's exposed here because memory-kind-specific
+ * implementations (e.g. hugetlbfs) need to know the mode of operation.
+ */
+enum mcopy_atomic_mode {
+	/* A normal copy_from_user into the destination range. */
+	MCOPY_ATOMIC_NORMAL,
+	/* Don't copy; map the destination range to the zero page. */
+	MCOPY_ATOMIC_ZEROPAGE,
+	/* Just install pte(s) with the existing page(s) in the page cache. */
+	MCOPY_ATOMIC_CONTINUE,
+};
+
 extern ssize_t mcopy_atomic(struct mm_struct *dst_mm, unsigned long dst_start,
 			    unsigned long src_start, unsigned long len,
 			    bool *mmap_changing, __u64 mode);
@@ -44,6 +60,8 @@ extern ssize_t mfill_zeropage(struct mm_struct *dst_mm,
 			      unsigned long dst_start,
 			      unsigned long len,
 			      bool *mmap_changing);
+extern ssize_t mcopy_continue(struct mm_struct *dst_mm, unsigned long dst_start,
+			      unsigned long len, bool *mmap_changing);
 extern int mwriteprotect_range(struct mm_struct *dst_mm,
 			       unsigned long start, unsigned long len,
 			       bool enable_wp, bool *mmap_changing);
diff --git a/include/uapi/linux/userfaultfd.h b/include/uapi/linux/userfaultfd.h
index f24dd4fcbad9..bafbeb1a2624 100644
--- a/include/uapi/linux/userfaultfd.h
+++ b/include/uapi/linux/userfaultfd.h
@@ -40,10 +40,12 @@
 	((__u64)1 << _UFFDIO_WAKE |		\
 	 (__u64)1 << _UFFDIO_COPY |		\
 	 (__u64)1 << _UFFDIO_ZEROPAGE |		\
-	 (__u64)1 << _UFFDIO_WRITEPROTECT)
+	 (__u64)1 << _UFFDIO_WRITEPROTECT |	\
+	 (__u64)1 << _UFFDIO_CONTINUE)
 #define UFFD_API_RANGE_IOCTLS_BASIC		\
 	((__u64)1 << _UFFDIO_WAKE |		\
-	 (__u64)1 << _UFFDIO_COPY)
+	 (__u64)1 << _UFFDIO_COPY |		\
+	 (__u64)1 << _UFFDIO_CONTINUE)
 
 /*
  * Valid ioctl command number range with this API is from 0x00 to
@@ -59,6 +61,7 @@
 #define _UFFDIO_COPY			(0x03)
 #define _UFFDIO_ZEROPAGE		(0x04)
 #define _UFFDIO_WRITEPROTECT		(0x06)
+#define _UFFDIO_CONTINUE		(0x07)
 #define _UFFDIO_API			(0x3F)
 
 /* userfaultfd ioctl ids */
@@ -77,6 +80,8 @@
 				      struct uffdio_zeropage)
 #define UFFDIO_WRITEPROTECT	_IOWR(UFFDIO, _UFFDIO_WRITEPROTECT, \
 				      struct uffdio_writeprotect)
+#define UFFDIO_CONTINUE		_IOR(UFFDIO, _UFFDIO_CONTINUE,	\
+				     struct uffdio_continue)
 
 /* read() structure */
 struct uffd_msg {
@@ -268,6 +273,18 @@ struct uffdio_writeprotect {
 	__u64 mode;
 };
 
+struct uffdio_continue {
+	struct uffdio_range range;
+#define UFFDIO_CONTINUE_MODE_DONTWAKE		((__u64)1<<0)
+	__u64 mode;
+
+	/*
+	 * Fields below here are written by the ioctl and must be at the end:
+	 * the copy_from_user will not read past here.
+	 */
+	__s64 mapped;
+};
+
 /*
  * Flags for the userfaultfd(2) system call itself.
  */
diff --git a/mm/hugetlb.c b/mm/hugetlb.c
index b105a455124d..533e5a26e437 100644
--- a/mm/hugetlb.c
+++ b/mm/hugetlb.c
@@ -39,7 +39,6 @@
 #include <linux/hugetlb.h>
 #include <linux/hugetlb_cgroup.h>
 #include <linux/node.h>
-#include <linux/userfaultfd_k.h>
 #include <linux/page_owner.h>
 #include "internal.h"
 
@@ -4865,8 +4864,10 @@ int hugetlb_mcopy_atomic_pte(struct mm_struct *dst_mm,
 			    struct vm_area_struct *dst_vma,
 			    unsigned long dst_addr,
 			    unsigned long src_addr,
+			    enum mcopy_atomic_mode mode,
 			    struct page **pagep)
 {
+	bool is_continue = (mode == MCOPY_ATOMIC_CONTINUE);
 	struct address_space *mapping;
 	pgoff_t idx;
 	unsigned long size;
@@ -4876,8 +4877,17 @@ int hugetlb_mcopy_atomic_pte(struct mm_struct *dst_mm,
 	spinlock_t *ptl;
 	int ret;
 	struct page *page;
+	int writable;
 
-	if (!*pagep) {
+	mapping = dst_vma->vm_file->f_mapping;
+	idx = vma_hugecache_offset(h, dst_vma, dst_addr);
+
+	if (is_continue) {
+		ret = -EFAULT;
+		page = find_lock_page(mapping, idx);
+		if (!page)
+			goto out;
+	} else if (!*pagep) {
 		ret = -ENOMEM;
 		page = alloc_huge_page(dst_vma, dst_addr, 0);
 		if (IS_ERR(page))
@@ -4906,13 +4916,8 @@ int hugetlb_mcopy_atomic_pte(struct mm_struct *dst_mm,
 	 */
 	__SetPageUptodate(page);
 
-	mapping = dst_vma->vm_file->f_mapping;
-	idx = vma_hugecache_offset(h, dst_vma, dst_addr);
-
-	/*
-	 * If shared, add to page cache
-	 */
-	if (vm_shared) {
+	/* Add shared, newly allocated pages to the page cache. */
+	if (vm_shared && !is_continue) {
 		size = i_size_read(mapping->host) >> huge_page_shift(h);
 		ret = -EFAULT;
 		if (idx >= size)
@@ -4957,8 +4962,14 @@ int hugetlb_mcopy_atomic_pte(struct mm_struct *dst_mm,
 		hugepage_add_new_anon_rmap(page, dst_vma, dst_addr);
 	}
 
-	_dst_pte = make_huge_pte(dst_vma, page, dst_vma->vm_flags & VM_WRITE);
-	if (dst_vma->vm_flags & VM_WRITE)
+	/* For CONTINUE on a non-shared VMA, don't set VM_WRITE for CoW. */
+	if (is_continue && !vm_shared)
+		writable = 0;
+	else
+		writable = dst_vma->vm_flags & VM_WRITE;
+
+	_dst_pte = make_huge_pte(dst_vma, page, writable);
+	if (writable)
 		_dst_pte = huge_pte_mkdirty(_dst_pte);
 	_dst_pte = pte_mkyoung(_dst_pte);
 
@@ -4972,15 +4983,16 @@ int hugetlb_mcopy_atomic_pte(struct mm_struct *dst_mm,
 	update_mmu_cache(dst_vma, dst_addr, dst_pte);
 
 	spin_unlock(ptl);
-	SetHPageMigratable(page);
-	if (vm_shared)
+	if (!is_continue)
+		SetHPageMigratable(page);
+	if (vm_shared || is_continue)
 		unlock_page(page);
 	ret = 0;
 out:
 	return ret;
 out_release_unlock:
 	spin_unlock(ptl);
-	if (vm_shared)
+	if (vm_shared || is_continue)
 		unlock_page(page);
 out_release_nounlock:
 	put_page(page);
diff --git a/mm/userfaultfd.c b/mm/userfaultfd.c
index 063cbb17e8d8..e14b3820c6a8 100644
--- a/mm/userfaultfd.c
+++ b/mm/userfaultfd.c
@@ -207,7 +207,7 @@ static __always_inline ssize_t __mcopy_atomic_hugetlb(struct mm_struct *dst_mm,
 					      unsigned long dst_start,
 					      unsigned long src_start,
 					      unsigned long len,
-					      bool zeropage)
+					      enum mcopy_atomic_mode mode)
 {
 	int vm_alloc_shared = dst_vma->vm_flags & VM_SHARED;
 	int vm_shared = dst_vma->vm_flags & VM_SHARED;
@@ -227,7 +227,7 @@ static __always_inline ssize_t __mcopy_atomic_hugetlb(struct mm_struct *dst_mm,
 	 * by THP.  Since we can not reliably insert a zero page, this
 	 * feature is not supported.
 	 */
-	if (zeropage) {
+	if (mode == MCOPY_ATOMIC_ZEROPAGE) {
 		mmap_read_unlock(dst_mm);
 		return -EINVAL;
 	}
@@ -273,8 +273,6 @@ retry:
 	}
 
 	while (src_addr < src_start + len) {
-		pte_t dst_pteval;
-
 		BUG_ON(dst_addr >= dst_start + len);
 
 		/*
@@ -297,16 +295,16 @@ retry:
 			goto out_unlock;
 		}
 
-		err = -EEXIST;
-		dst_pteval = huge_ptep_get(dst_pte);
-		if (!huge_pte_none(dst_pteval)) {
+		if (mode != MCOPY_ATOMIC_CONTINUE &&
+		    !huge_pte_none(huge_ptep_get(dst_pte))) {
+			err = -EEXIST;
 			mutex_unlock(&hugetlb_fault_mutex_table[hash]);
 			i_mmap_unlock_read(mapping);
 			goto out_unlock;
 		}
 
 		err = hugetlb_mcopy_atomic_pte(dst_mm, dst_pte, dst_vma,
-						dst_addr, src_addr, &page);
+					       dst_addr, src_addr, mode, &page);
 
 		mutex_unlock(&hugetlb_fault_mutex_table[hash]);
 		i_mmap_unlock_read(mapping);
@@ -408,7 +406,7 @@ extern ssize_t __mcopy_atomic_hugetlb(struct mm_struct *dst_mm,
 				      unsigned long dst_start,
 				      unsigned long src_start,
 				      unsigned long len,
-				      bool zeropage);
+				      enum mcopy_atomic_mode mode);
 #endif /* CONFIG_HUGETLB_PAGE */
 
 static __always_inline ssize_t mfill_atomic_pte(struct mm_struct *dst_mm,
@@ -458,7 +456,7 @@ static __always_inline ssize_t __mcopy_atomic(struct mm_struct *dst_mm,
 					      unsigned long dst_start,
 					      unsigned long src_start,
 					      unsigned long len,
-					      bool zeropage,
+					      enum mcopy_atomic_mode mcopy_mode,
 					      bool *mmap_changing,
 					      __u64 mode)
 {
@@ -469,6 +467,7 @@ static __always_inline ssize_t __mcopy_atomic(struct mm_struct *dst_mm,
 	long copied;
 	struct page *page;
 	bool wp_copy;
+	bool zeropage = (mcopy_mode == MCOPY_ATOMIC_ZEROPAGE);
 
 	/*
 	 * Sanitize the command parameters:
@@ -527,10 +526,12 @@ retry:
 	 */
 	if (is_vm_hugetlb_page(dst_vma))
 		return  __mcopy_atomic_hugetlb(dst_mm, dst_vma, dst_start,
-						src_start, len, zeropage);
+						src_start, len, mcopy_mode);
 
 	if (!vma_is_anonymous(dst_vma) && !vma_is_shmem(dst_vma))
 		goto out_unlock;
+	if (mcopy_mode == MCOPY_ATOMIC_CONTINUE)
+		goto out_unlock;
 
 	/*
 	 * Ensure the dst_vma has a anon_vma or this page
@@ -626,14 +627,22 @@ ssize_t mcopy_atomic(struct mm_struct *dst_mm, unsigned long dst_start,
 		     unsigned long src_start, unsigned long len,
 		     bool *mmap_changing, __u64 mode)
 {
-	return __mcopy_atomic(dst_mm, dst_start, src_start, len, false,
-			      mmap_changing, mode);
+	return __mcopy_atomic(dst_mm, dst_start, src_start, len,
+			      MCOPY_ATOMIC_NORMAL, mmap_changing, mode);
 }
 
 ssize_t mfill_zeropage(struct mm_struct *dst_mm, unsigned long start,
 		       unsigned long len, bool *mmap_changing)
 {
-	return __mcopy_atomic(dst_mm, start, 0, len, true, mmap_changing, 0);
+	return __mcopy_atomic(dst_mm, start, 0, len, MCOPY_ATOMIC_ZEROPAGE,
+			      mmap_changing, 0);
+}
+
+ssize_t mcopy_continue(struct mm_struct *dst_mm, unsigned long start,
+		       unsigned long len, bool *mmap_changing)
+{
+	return __mcopy_atomic(dst_mm, start, 0, len, MCOPY_ATOMIC_CONTINUE,
+			      mmap_changing, 0);
 }
 
 int mwriteprotect_range(struct mm_struct *dst_mm, unsigned long start,
-- 
cgit v1.2.3-71-gd317


From b6676de8d7b48724d4cd3a3742c62fa525baa904 Mon Sep 17 00:00:00 2001
From: Dave Hansen <dave.hansen@linux.intel.com>
Date: Tue, 4 May 2021 18:36:01 -0700
Subject: mm/vmscan: move RECLAIM* bits to uapi header

It is currently not obvious that the RECLAIM_* bits are part of the uapi
since they are defined in vmscan.c.  Move them to a uapi header to make it
obvious.

This should have no functional impact.

Link: https://lkml.kernel.org/r/20210219172557.08074910@viggo.jf.intel.com
Signed-off-by: Dave Hansen <dave.hansen@linux.intel.com>
Reviewed-by: Ben Widawsky <ben.widawsky@intel.com>
Reviewed-by: Oscar Salvador <osalvador@suse.de>
Acked-by: David Rientjes <rientjes@google.com>
Acked-by: Christoph Lameter <cl@linux.com>
Cc: Alex Shi <alex.shi@linux.alibaba.com>
Cc: Daniel Wagner <dwagner@suse.de>
Cc: "Tobin C. Harding" <tobin@kernel.org>
Cc: Christoph Lameter <cl@linux.com>
Cc: Huang Ying <ying.huang@intel.com>
Cc: Dan Williams <dan.j.williams@intel.com>
Cc: Qian Cai <cai@lca.pw>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 include/uapi/linux/mempolicy.h | 7 +++++++
 mm/vmscan.c                    | 8 --------
 2 files changed, 7 insertions(+), 8 deletions(-)

(limited to 'include/uapi/linux')

diff --git a/include/uapi/linux/mempolicy.h b/include/uapi/linux/mempolicy.h
index 8948467b3992..4832fd0b5642 100644
--- a/include/uapi/linux/mempolicy.h
+++ b/include/uapi/linux/mempolicy.h
@@ -64,5 +64,12 @@ enum {
 #define MPOL_F_MOF	(1 << 3) /* this policy wants migrate on fault */
 #define MPOL_F_MORON	(1 << 4) /* Migrate On protnone Reference On Node */
 
+/*
+ * These bit locations are exposed in the vm.zone_reclaim_mode sysctl
+ * ABI.  New bits are OK, but existing bits can never change.
+ */
+#define RECLAIM_ZONE	(1<<0)	/* Run shrink_inactive_list on the zone */
+#define RECLAIM_WRITE	(1<<1)	/* Writeout pages during reclaim */
+#define RECLAIM_UNMAP	(1<<2)	/* Unmap pages during reclaim */
 
 #endif /* _UAPI_LINUX_MEMPOLICY_H */
diff --git a/mm/vmscan.c b/mm/vmscan.c
index 42aaef30633e..671143dbf809 100644
--- a/mm/vmscan.c
+++ b/mm/vmscan.c
@@ -4086,14 +4086,6 @@ module_init(kswapd_init)
  */
 int node_reclaim_mode __read_mostly;
 
-/*
- * These bit locations are exposed in the vm.zone_reclaim_mode sysctl
- * ABI.  New bits are OK, but existing bits can never change.
- */
-#define RECLAIM_ZONE  (1<<0)   /* Run shrink_inactive_list on the zone */
-#define RECLAIM_WRITE (1<<1)   /* Writeout pages during reclaim */
-#define RECLAIM_UNMAP (1<<2)   /* Unmap pages during reclaim */
-
 /*
  * Priority for NODE_RECLAIM. This determines the fraction of pages
  * of a node considered for each zone_reclaim. 4 scans 1/16th of
-- 
cgit v1.2.3-71-gd317


From fa60ce2cb4506701c43bd4cf3ca23d970daf1b9c Mon Sep 17 00:00:00 2001
From: Masahiro Yamada <masahiroy@kernel.org>
Date: Thu, 6 May 2021 18:06:44 -0700
Subject: treewide: remove editor modelines and cruft

The section "19) Editor modelines and other cruft" in
Documentation/process/coding-style.rst clearly says, "Do not include any
of these in source files."

I recently receive a patch to explicitly add a new one.

Let's do treewide cleanups, otherwise some people follow the existing code
and attempt to upstream their favoriate editor setups.

It is even nicer if scripts/checkpatch.pl can check it.

If we like to impose coding style in an editor-independent manner, I think
editorconfig (patch [1]) is a saner solution.

[1] https://lore.kernel.org/lkml/20200703073143.423557-1-danny@kdrag0n.dev/

Link: https://lkml.kernel.org/r/20210324054457.1477489-1-masahiroy@kernel.org
Signed-off-by: Masahiro Yamada <masahiroy@kernel.org>
Acked-by: Geert Uytterhoeven <geert@linux-m68k.org>
Reviewed-by: Miguel Ojeda <ojeda@kernel.org>	[auxdisplay]
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 arch/m68k/atari/time.c                               |  7 -------
 arch/parisc/include/asm/pdc_chassis.h                |  1 -
 arch/um/drivers/cow.h                                |  7 -------
 drivers/auxdisplay/panel.c                           |  7 -------
 drivers/gpu/drm/qxl/qxl_drv.c                        |  1 -
 drivers/media/usb/pwc/pwc-uncompress.c               |  3 ---
 drivers/net/ethernet/adaptec/starfire.c              |  8 --------
 drivers/net/ethernet/amd/atarilance.c                |  8 --------
 drivers/net/ethernet/amd/pcnet32.c                   |  7 -------
 .../net/wireless/intersil/orinoco/orinoco_nortel.c   |  8 --------
 drivers/net/wireless/intersil/orinoco/orinoco_pci.c  |  8 --------
 drivers/net/wireless/intersil/orinoco/orinoco_plx.c  |  8 --------
 drivers/net/wireless/intersil/orinoco/orinoco_tmd.c  |  8 --------
 drivers/parport/parport_ip32.c                       | 12 ------------
 drivers/platform/x86/dell/dell_rbu.c                 |  3 ---
 drivers/scsi/53c700.c                                |  1 -
 drivers/scsi/53c700.h                                |  1 -
 drivers/scsi/ch.c                                    |  6 ------
 drivers/scsi/ips.c                                   | 20 --------------------
 drivers/scsi/ips.h                                   | 20 --------------------
 drivers/scsi/lasi700.c                               |  1 -
 drivers/scsi/megaraid/mbox_defs.h                    |  2 --
 drivers/scsi/megaraid/mega_common.h                  |  2 --
 drivers/scsi/megaraid/megaraid_mbox.c                |  2 --
 drivers/scsi/megaraid/megaraid_mbox.h                |  2 --
 drivers/scsi/qla1280.c                               | 12 ------------
 drivers/scsi/sni_53c710.c                            |  1 -
 drivers/video/fbdev/matrox/matroxfb_base.c           |  9 ---------
 drivers/video/fbdev/vga16fb.c                        | 10 ----------
 fs/configfs/configfs_internal.h                      |  4 +---
 fs/configfs/dir.c                                    |  4 +---
 fs/configfs/file.c                                   |  4 +---
 fs/configfs/inode.c                                  |  4 +---
 fs/configfs/item.c                                   |  4 +---
 fs/configfs/mount.c                                  |  4 +---
 fs/configfs/symlink.c                                |  4 +---
 fs/nfs/dir.c                                         |  7 -------
 fs/nfs/nfs4proc.c                                    |  6 ------
 fs/nfs/nfs4renewd.c                                  |  6 ------
 fs/nfs/nfs4state.c                                   |  6 ------
 fs/nfs/nfs4xdr.c                                     |  6 ------
 fs/nfsd/nfs4proc.c                                   |  6 ------
 fs/nfsd/nfs4xdr.c                                    |  6 ------
 fs/nfsd/xdr4.h                                       |  6 ------
 fs/ocfs2/acl.c                                       |  4 +---
 fs/ocfs2/acl.h                                       |  4 +---
 fs/ocfs2/alloc.c                                     |  4 +---
 fs/ocfs2/alloc.h                                     |  4 +---
 fs/ocfs2/aops.c                                      |  4 +---
 fs/ocfs2/aops.h                                      |  4 +---
 fs/ocfs2/blockcheck.c                                |  4 +---
 fs/ocfs2/blockcheck.h                                |  4 +---
 fs/ocfs2/buffer_head_io.c                            |  4 +---
 fs/ocfs2/buffer_head_io.h                            |  4 +---
 fs/ocfs2/cluster/heartbeat.c                         |  4 +---
 fs/ocfs2/cluster/heartbeat.h                         |  4 +---
 fs/ocfs2/cluster/masklog.c                           |  4 +---
 fs/ocfs2/cluster/masklog.h                           |  4 +---
 fs/ocfs2/cluster/netdebug.c                          |  4 +---
 fs/ocfs2/cluster/nodemanager.c                       |  4 +---
 fs/ocfs2/cluster/nodemanager.h                       |  4 +---
 fs/ocfs2/cluster/ocfs2_heartbeat.h                   |  4 +---
 fs/ocfs2/cluster/ocfs2_nodemanager.h                 |  4 +---
 fs/ocfs2/cluster/quorum.c                            |  4 +---
 fs/ocfs2/cluster/quorum.h                            |  4 +---
 fs/ocfs2/cluster/sys.c                               |  4 +---
 fs/ocfs2/cluster/sys.h                               |  4 +---
 fs/ocfs2/cluster/tcp.c                               |  4 +---
 fs/ocfs2/cluster/tcp.h                               |  4 +---
 fs/ocfs2/cluster/tcp_internal.h                      |  4 +---
 fs/ocfs2/dcache.c                                    |  4 +---
 fs/ocfs2/dcache.h                                    |  4 +---
 fs/ocfs2/dir.c                                       |  4 +---
 fs/ocfs2/dir.h                                       |  4 +---
 fs/ocfs2/dlm/dlmapi.h                                |  4 +---
 fs/ocfs2/dlm/dlmast.c                                |  4 +---
 fs/ocfs2/dlm/dlmcommon.h                             |  4 +---
 fs/ocfs2/dlm/dlmconvert.c                            |  4 +---
 fs/ocfs2/dlm/dlmconvert.h                            |  4 +---
 fs/ocfs2/dlm/dlmdebug.c                              |  4 +---
 fs/ocfs2/dlm/dlmdebug.h                              |  4 +---
 fs/ocfs2/dlm/dlmdomain.c                             |  4 +---
 fs/ocfs2/dlm/dlmdomain.h                             |  4 +---
 fs/ocfs2/dlm/dlmlock.c                               |  4 +---
 fs/ocfs2/dlm/dlmmaster.c                             |  4 +---
 fs/ocfs2/dlm/dlmrecovery.c                           |  4 +---
 fs/ocfs2/dlm/dlmthread.c                             |  4 +---
 fs/ocfs2/dlm/dlmunlock.c                             |  4 +---
 fs/ocfs2/dlmfs/dlmfs.c                               |  4 +---
 fs/ocfs2/dlmfs/userdlm.c                             |  4 +---
 fs/ocfs2/dlmfs/userdlm.h                             |  4 +---
 fs/ocfs2/dlmglue.c                                   |  4 +---
 fs/ocfs2/dlmglue.h                                   |  4 +---
 fs/ocfs2/export.c                                    |  4 +---
 fs/ocfs2/export.h                                    |  4 +---
 fs/ocfs2/extent_map.c                                |  4 +---
 fs/ocfs2/extent_map.h                                |  4 +---
 fs/ocfs2/file.c                                      |  4 +---
 fs/ocfs2/file.h                                      |  4 +---
 fs/ocfs2/filecheck.c                                 |  4 +---
 fs/ocfs2/filecheck.h                                 |  4 +---
 fs/ocfs2/heartbeat.c                                 |  4 +---
 fs/ocfs2/heartbeat.h                                 |  4 +---
 fs/ocfs2/inode.c                                     |  4 +---
 fs/ocfs2/inode.h                                     |  4 +---
 fs/ocfs2/journal.c                                   |  4 +---
 fs/ocfs2/journal.h                                   |  4 +---
 fs/ocfs2/localalloc.c                                |  4 +---
 fs/ocfs2/localalloc.h                                |  4 +---
 fs/ocfs2/locks.c                                     |  4 +---
 fs/ocfs2/locks.h                                     |  4 +---
 fs/ocfs2/mmap.c                                      |  4 +---
 fs/ocfs2/move_extents.c                              |  4 +---
 fs/ocfs2/move_extents.h                              |  4 +---
 fs/ocfs2/namei.c                                     |  4 +---
 fs/ocfs2/namei.h                                     |  4 +---
 fs/ocfs2/ocfs1_fs_compat.h                           |  4 +---
 fs/ocfs2/ocfs2.h                                     |  4 +---
 fs/ocfs2/ocfs2_fs.h                                  |  4 +---
 fs/ocfs2/ocfs2_ioctl.h                               |  4 +---
 fs/ocfs2/ocfs2_lockid.h                              |  4 +---
 fs/ocfs2/ocfs2_lockingver.h                          |  4 +---
 fs/ocfs2/refcounttree.c                              |  4 +---
 fs/ocfs2/refcounttree.h                              |  4 +---
 fs/ocfs2/reservations.c                              |  4 +---
 fs/ocfs2/reservations.h                              |  4 +---
 fs/ocfs2/resize.c                                    |  4 +---
 fs/ocfs2/resize.h                                    |  4 +---
 fs/ocfs2/slot_map.c                                  |  4 +---
 fs/ocfs2/slot_map.h                                  |  4 +---
 fs/ocfs2/stack_o2cb.c                                |  4 +---
 fs/ocfs2/stack_user.c                                |  4 +---
 fs/ocfs2/stackglue.c                                 |  4 +---
 fs/ocfs2/stackglue.h                                 |  4 +---
 fs/ocfs2/suballoc.c                                  |  4 +---
 fs/ocfs2/suballoc.h                                  |  4 +---
 fs/ocfs2/super.c                                     |  4 +---
 fs/ocfs2/super.h                                     |  4 +---
 fs/ocfs2/symlink.c                                   |  4 +---
 fs/ocfs2/symlink.h                                   |  4 +---
 fs/ocfs2/sysfile.c                                   |  4 +---
 fs/ocfs2/sysfile.h                                   |  4 +---
 fs/ocfs2/uptodate.c                                  |  4 +---
 fs/ocfs2/uptodate.h                                  |  4 +---
 fs/ocfs2/xattr.c                                     |  4 +---
 fs/ocfs2/xattr.h                                     |  4 +---
 fs/reiserfs/procfs.c                                 | 10 ----------
 include/linux/configfs.h                             |  4 +---
 include/linux/genl_magic_func.h                      |  1 -
 include/linux/genl_magic_struct.h                    |  1 -
 include/uapi/linux/if_bonding.h                      | 11 -----------
 include/uapi/linux/nfs4.h                            |  6 ------
 include/xen/interface/elfnote.h                      | 10 ----------
 include/xen/interface/hvm/hvm_vcpu.h                 | 10 ----------
 include/xen/interface/io/xenbus.h                    | 10 ----------
 samples/configfs/configfs_sample.c                   |  2 --
 tools/usb/hcd-tests.sh                               |  2 --
 157 files changed, 110 insertions(+), 627 deletions(-)

(limited to 'include/uapi/linux')

diff --git a/arch/m68k/atari/time.c b/arch/m68k/atari/time.c
index 1068670cb741..7e44d0e9d0f8 100644
--- a/arch/m68k/atari/time.c
+++ b/arch/m68k/atari/time.c
@@ -317,10 +317,3 @@ int atari_tt_hwclk( int op, struct rtc_time *t )
 
     return( 0 );
 }
-
-/*
- * Local variables:
- *  c-indent-level: 4
- *  tab-width: 8
- * End:
- */
diff --git a/arch/parisc/include/asm/pdc_chassis.h b/arch/parisc/include/asm/pdc_chassis.h
index ae3e108d22ad..d6d82f53d3d0 100644
--- a/arch/parisc/include/asm/pdc_chassis.h
+++ b/arch/parisc/include/asm/pdc_chassis.h
@@ -365,4 +365,3 @@ void parisc_pdc_chassis_init(void);
 					 PDC_CHASSIS_EOM_SET		)
 
 #endif /* _PARISC_PDC_CHASSIS_H */
-/* vim: set ts=8 */
diff --git a/arch/um/drivers/cow.h b/arch/um/drivers/cow.h
index 103adac691ed..9a67c017000f 100644
--- a/arch/um/drivers/cow.h
+++ b/arch/um/drivers/cow.h
@@ -24,10 +24,3 @@ extern void cow_sizes(int version, __u64 size, int sectorsize, int align,
 		      int *data_offset_out);
 
 #endif
-
-/*
- * ---------------------------------------------------------------------------
- * Local variables:
- * c-file-style: "linux"
- * End:
- */
diff --git a/drivers/auxdisplay/panel.c b/drivers/auxdisplay/panel.c
index ff5755ee5694..eba04c0de7eb 100644
--- a/drivers/auxdisplay/panel.c
+++ b/drivers/auxdisplay/panel.c
@@ -1737,10 +1737,3 @@ module_init(panel_init_module);
 module_exit(panel_cleanup_module);
 MODULE_AUTHOR("Willy Tarreau");
 MODULE_LICENSE("GPL");
-
-/*
- * Local variables:
- *  c-indent-level: 4
- *  tab-width: 8
- * End:
- */
diff --git a/drivers/gpu/drm/qxl/qxl_drv.c b/drivers/gpu/drm/qxl/qxl_drv.c
index 1864467f1063..6754f578fed2 100644
--- a/drivers/gpu/drm/qxl/qxl_drv.c
+++ b/drivers/gpu/drm/qxl/qxl_drv.c
@@ -1,4 +1,3 @@
-/* vim: set ts=8 sw=8 tw=78 ai noexpandtab */
 /* qxl_drv.c -- QXL driver -*- linux-c -*-
  *
  * Copyright 2011 Red Hat, Inc.
diff --git a/drivers/media/usb/pwc/pwc-uncompress.c b/drivers/media/usb/pwc/pwc-uncompress.c
index abfc88391036..68bc3829c6b3 100644
--- a/drivers/media/usb/pwc/pwc-uncompress.c
+++ b/drivers/media/usb/pwc/pwc-uncompress.c
@@ -9,9 +9,6 @@
    Please send bug reports and support requests to <luc@saillard.org>.
    The decompression routines have been implemented by reverse-engineering the
    Nemosoft binary pwcx module. Caveat emptor.
-
-
-   vim: set ts=8:
 */
 
 #include <asm/current.h>
diff --git a/drivers/net/ethernet/adaptec/starfire.c b/drivers/net/ethernet/adaptec/starfire.c
index 555299737b51..7965e5e3c985 100644
--- a/drivers/net/ethernet/adaptec/starfire.c
+++ b/drivers/net/ethernet/adaptec/starfire.c
@@ -2070,11 +2070,3 @@ static void __exit starfire_cleanup (void)
 
 module_init(starfire_init);
 module_exit(starfire_cleanup);
-
-
-/*
- * Local variables:
- *  c-basic-offset: 8
- *  tab-width: 8
- * End:
- */
diff --git a/drivers/net/ethernet/amd/atarilance.c b/drivers/net/ethernet/amd/atarilance.c
index 961796abab35..c1eab916438f 100644
--- a/drivers/net/ethernet/amd/atarilance.c
+++ b/drivers/net/ethernet/amd/atarilance.c
@@ -1156,11 +1156,3 @@ static void __exit atarilance_module_exit(void)
 module_init(atarilance_module_init);
 module_exit(atarilance_module_exit);
 #endif /* MODULE */
-
-
-/*
- * Local variables:
- *  c-indent-level: 4
- *  tab-width: 4
- * End:
- */
diff --git a/drivers/net/ethernet/amd/pcnet32.c b/drivers/net/ethernet/amd/pcnet32.c
index aa412506832d..4100ab07e6b7 100644
--- a/drivers/net/ethernet/amd/pcnet32.c
+++ b/drivers/net/ethernet/amd/pcnet32.c
@@ -3029,10 +3029,3 @@ static void __exit pcnet32_cleanup_module(void)
 
 module_init(pcnet32_init_module);
 module_exit(pcnet32_cleanup_module);
-
-/*
- * Local variables:
- *  c-indent-level: 4
- *  tab-width: 8
- * End:
- */
diff --git a/drivers/net/wireless/intersil/orinoco/orinoco_nortel.c b/drivers/net/wireless/intersil/orinoco/orinoco_nortel.c
index 96a03d10a080..18bd0d9876c2 100644
--- a/drivers/net/wireless/intersil/orinoco/orinoco_nortel.c
+++ b/drivers/net/wireless/intersil/orinoco/orinoco_nortel.c
@@ -312,11 +312,3 @@ static void __exit orinoco_nortel_exit(void)
 
 module_init(orinoco_nortel_init);
 module_exit(orinoco_nortel_exit);
-
-/*
- * Local variables:
- *  c-indent-level: 8
- *  c-basic-offset: 8
- *  tab-width: 8
- * End:
- */
diff --git a/drivers/net/wireless/intersil/orinoco/orinoco_pci.c b/drivers/net/wireless/intersil/orinoco/orinoco_pci.c
index f3c86b07b1b9..7e3a6dd60c15 100644
--- a/drivers/net/wireless/intersil/orinoco/orinoco_pci.c
+++ b/drivers/net/wireless/intersil/orinoco/orinoco_pci.c
@@ -255,11 +255,3 @@ static void __exit orinoco_pci_exit(void)
 
 module_init(orinoco_pci_init);
 module_exit(orinoco_pci_exit);
-
-/*
- * Local variables:
- *  c-indent-level: 8
- *  c-basic-offset: 8
- *  tab-width: 8
- * End:
- */
diff --git a/drivers/net/wireless/intersil/orinoco/orinoco_plx.c b/drivers/net/wireless/intersil/orinoco/orinoco_plx.c
index 16dada94c774..73e6ae124013 100644
--- a/drivers/net/wireless/intersil/orinoco/orinoco_plx.c
+++ b/drivers/net/wireless/intersil/orinoco/orinoco_plx.c
@@ -360,11 +360,3 @@ static void __exit orinoco_plx_exit(void)
 
 module_init(orinoco_plx_init);
 module_exit(orinoco_plx_exit);
-
-/*
- * Local variables:
- *  c-indent-level: 8
- *  c-basic-offset: 8
- *  tab-width: 8
- * End:
- */
diff --git a/drivers/net/wireless/intersil/orinoco/orinoco_tmd.c b/drivers/net/wireless/intersil/orinoco/orinoco_tmd.c
index 9a9d335611ac..939d5a1dce97 100644
--- a/drivers/net/wireless/intersil/orinoco/orinoco_tmd.c
+++ b/drivers/net/wireless/intersil/orinoco/orinoco_tmd.c
@@ -235,11 +235,3 @@ static void __exit orinoco_tmd_exit(void)
 
 module_init(orinoco_tmd_init);
 module_exit(orinoco_tmd_exit);
-
-/*
- * Local variables:
- *  c-indent-level: 8
- *  c-basic-offset: 8
- *  tab-width: 8
- * End:
- */
diff --git a/drivers/parport/parport_ip32.c b/drivers/parport/parport_ip32.c
index 48b084e86dc6..0919ed99ba94 100644
--- a/drivers/parport/parport_ip32.c
+++ b/drivers/parport/parport_ip32.c
@@ -2224,15 +2224,3 @@ MODULE_PARM_DESC(features,
 		 ", bit 2: hardware SPP mode"
 		 ", bit 3: hardware EPP mode"
 		 ", bit 4: hardware ECP mode");
-
-/*--- Inform (X)Emacs about preferred coding style ---------------------*/
-/*
- * Local Variables:
- * mode: c
- * c-file-style: "linux"
- * indent-tabs-mode: t
- * tab-width: 8
- * fill-column: 78
- * ispell-local-dictionary: "american"
- * End:
- */
diff --git a/drivers/platform/x86/dell/dell_rbu.c b/drivers/platform/x86/dell/dell_rbu.c
index 03c3ff34bcf5..085ad0a0d22e 100644
--- a/drivers/platform/x86/dell/dell_rbu.c
+++ b/drivers/platform/x86/dell/dell_rbu.c
@@ -675,6 +675,3 @@ static __exit void dcdrbu_exit(void)
 
 module_exit(dcdrbu_exit);
 module_init(dcdrbu_init);
-
-/* vim:noet:ts=8:sw=8
-*/
diff --git a/drivers/scsi/53c700.c b/drivers/scsi/53c700.c
index ab42feab233f..77ccb96e5ed4 100644
--- a/drivers/scsi/53c700.c
+++ b/drivers/scsi/53c700.c
@@ -1,5 +1,4 @@
 // SPDX-License-Identifier: GPL-2.0-or-later
-/* -*- mode: c; c-basic-offset: 8 -*- */
 
 /* NCR (or Symbios) 53c700 and 53c700-66 Driver
  *
diff --git a/drivers/scsi/53c700.h b/drivers/scsi/53c700.h
index c9f8c497babb..2df347ca91af 100644
--- a/drivers/scsi/53c700.h
+++ b/drivers/scsi/53c700.h
@@ -1,5 +1,4 @@
 /* SPDX-License-Identifier: GPL-2.0 */
-/* -*- mode: c; c-basic-offset: 8 -*- */
 
 /* Driver for 53c700 and 53c700-66 chips from NCR and Symbios
  *
diff --git a/drivers/scsi/ch.c b/drivers/scsi/ch.c
index cb74ab1ae5a4..9b89c26ccfdb 100644
--- a/drivers/scsi/ch.c
+++ b/drivers/scsi/ch.c
@@ -1058,9 +1058,3 @@ static void __exit exit_ch_module(void)
 
 module_init(init_ch_module);
 module_exit(exit_ch_module);
-
-/*
- * Local variables:
- * c-basic-offset: 8
- * End:
- */
diff --git a/drivers/scsi/ips.c b/drivers/scsi/ips.c
index 1a3c534826ba..bc33d54a4011 100644
--- a/drivers/scsi/ips.c
+++ b/drivers/scsi/ips.c
@@ -7099,23 +7099,3 @@ ips_init_phase2(int index)
 MODULE_LICENSE("GPL");
 MODULE_DESCRIPTION("IBM ServeRAID Adapter Driver " IPS_VER_STRING);
 MODULE_VERSION(IPS_VER_STRING);
-
-
-/*
- * Overrides for Emacs so that we almost follow Linus's tabbing style.
- * Emacs will notice this stuff at the end of the file and automatically
- * adjust the settings for this buffer only.  This must remain at the end
- * of the file.
- * ---------------------------------------------------------------------------
- * Local variables:
- * c-indent-level: 2
- * c-brace-imaginary-offset: 0
- * c-brace-offset: -2
- * c-argdecl-indent: 2
- * c-label-offset: -2
- * c-continued-statement-offset: 2
- * c-continued-brace-offset: 0
- * indent-tabs-mode: nil
- * tab-width: 8
- * End:
- */
diff --git a/drivers/scsi/ips.h b/drivers/scsi/ips.h
index 6c0678fb9a67..65edf000e447 100644
--- a/drivers/scsi/ips.h
+++ b/drivers/scsi/ips.h
@@ -1211,23 +1211,3 @@ typedef struct {
       IPS_COMPAT_TAMPA, \
       IPS_COMPAT_KEYWEST \
    }
-
-
-/*
- * Overrides for Emacs so that we almost follow Linus's tabbing style.
- * Emacs will notice this stuff at the end of the file and automatically
- * adjust the settings for this buffer only.  This must remain at the end
- * of the file.
- * ---------------------------------------------------------------------------
- * Local variables:
- * c-indent-level: 2
- * c-brace-imaginary-offset: 0
- * c-brace-offset: -2
- * c-argdecl-indent: 2
- * c-label-offset: -2
- * c-continued-statement-offset: 2
- * c-continued-brace-offset: 0
- * indent-tabs-mode: nil
- * tab-width: 8
- * End:
- */
diff --git a/drivers/scsi/lasi700.c b/drivers/scsi/lasi700.c
index de71d240a56f..6d14a7a94d0b 100644
--- a/drivers/scsi/lasi700.c
+++ b/drivers/scsi/lasi700.c
@@ -1,5 +1,4 @@
 // SPDX-License-Identifier: GPL-2.0-or-later
-/* -*- mode: c; c-basic-offset: 8 -*- */
 
 /* PARISC LASI driver for the 53c700 chip
  *
diff --git a/drivers/scsi/megaraid/mbox_defs.h b/drivers/scsi/megaraid/mbox_defs.h
index 01a1bfb8ea2a..f0ef8f7f82c1 100644
--- a/drivers/scsi/megaraid/mbox_defs.h
+++ b/drivers/scsi/megaraid/mbox_defs.h
@@ -781,5 +781,3 @@ typedef struct {
 } __attribute__ ((packed)) mbox_sgl32;
 
 #endif		// _MRAID_MBOX_DEFS_H_
-
-/* vim: set ts=8 sw=8 tw=78: */
diff --git a/drivers/scsi/megaraid/mega_common.h b/drivers/scsi/megaraid/mega_common.h
index 3a7596e47a88..2ad0aa2f837d 100644
--- a/drivers/scsi/megaraid/mega_common.h
+++ b/drivers/scsi/megaraid/mega_common.h
@@ -282,5 +282,3 @@ struct mraid_pci_blk {
 };
 
 #endif // _MEGA_COMMON_H_
-
-// vim: set ts=8 sw=8 tw=78:
diff --git a/drivers/scsi/megaraid/megaraid_mbox.c b/drivers/scsi/megaraid/megaraid_mbox.c
index b1a2d3536add..145fde302d7d 100644
--- a/drivers/scsi/megaraid/megaraid_mbox.c
+++ b/drivers/scsi/megaraid/megaraid_mbox.c
@@ -4068,5 +4068,3 @@ megaraid_sysfs_show_ldnum(struct device *dev, struct device_attribute *attr, cha
  */
 module_init(megaraid_init);
 module_exit(megaraid_exit);
-
-/* vim: set ts=8 sw=8 tw=78 ai si: */
diff --git a/drivers/scsi/megaraid/megaraid_mbox.h b/drivers/scsi/megaraid/megaraid_mbox.h
index 3e4347c6dab1..d2fe7f69cd5d 100644
--- a/drivers/scsi/megaraid/megaraid_mbox.h
+++ b/drivers/scsi/megaraid/megaraid_mbox.h
@@ -230,5 +230,3 @@ typedef struct {
 #define WROUTDOOR(rdev, value)	writel(value, (rdev)->baseaddr + 0x2C)
 
 #endif // _MEGARAID_H_
-
-// vim: set ts=8 sw=8 tw=78:
diff --git a/drivers/scsi/qla1280.c b/drivers/scsi/qla1280.c
index 8f35174a1f9a..928da90b79be 100644
--- a/drivers/scsi/qla1280.c
+++ b/drivers/scsi/qla1280.c
@@ -4403,15 +4403,3 @@ MODULE_FIRMWARE("qlogic/1040.bin");
 MODULE_FIRMWARE("qlogic/1280.bin");
 MODULE_FIRMWARE("qlogic/12160.bin");
 MODULE_VERSION(QLA1280_VERSION);
-
-/*
- * Overrides for Emacs so that we almost follow Linus's tabbing style.
- * Emacs will notice this stuff at the end of the file and automatically
- * adjust the settings for this buffer only.  This must remain at the end
- * of the file.
- * ---------------------------------------------------------------------------
- * Local variables:
- * c-basic-offset: 8
- * tab-width: 8
- * End:
- */
diff --git a/drivers/scsi/sni_53c710.c b/drivers/scsi/sni_53c710.c
index 97c6f81b1d2a..678651b9b4dd 100644
--- a/drivers/scsi/sni_53c710.c
+++ b/drivers/scsi/sni_53c710.c
@@ -1,5 +1,4 @@
 // SPDX-License-Identifier: GPL-2.0-or-later
-/* -*- mode: c; c-basic-offset: 8 -*- */
 
 /* SNI RM driver
  *
diff --git a/drivers/video/fbdev/matrox/matroxfb_base.c b/drivers/video/fbdev/matrox/matroxfb_base.c
index a3853421b263..4325bf7f388c 100644
--- a/drivers/video/fbdev/matrox/matroxfb_base.c
+++ b/drivers/video/fbdev/matrox/matroxfb_base.c
@@ -2608,12 +2608,3 @@ EXPORT_SYMBOL(matroxfb_register_driver);
 EXPORT_SYMBOL(matroxfb_unregister_driver);
 EXPORT_SYMBOL(matroxfb_wait_for_sync);
 EXPORT_SYMBOL(matroxfb_enable_irq);
-
-/*
- * Overrides for Emacs so that we follow Linus's tabbing style.
- * ---------------------------------------------------------------------------
- * Local variables:
- * c-basic-offset: 8
- * End:
- */
-
diff --git a/drivers/video/fbdev/vga16fb.c b/drivers/video/fbdev/vga16fb.c
index 1e8a38a7967d..e2757ff1c23d 100644
--- a/drivers/video/fbdev/vga16fb.c
+++ b/drivers/video/fbdev/vga16fb.c
@@ -1451,13 +1451,3 @@ MODULE_DESCRIPTION("Legacy VGA framebuffer device driver");
 MODULE_LICENSE("GPL");
 module_init(vga16fb_init);
 module_exit(vga16fb_exit);
-
-
-/*
- * Overrides for Emacs so that we follow Linus's tabbing style.
- * ---------------------------------------------------------------------------
- * Local variables:
- * c-basic-offset: 8
- * End:
- */
-
diff --git a/fs/configfs/configfs_internal.h b/fs/configfs/configfs_internal.h
index 9a3aed249692..c0395363eab9 100644
--- a/fs/configfs/configfs_internal.h
+++ b/fs/configfs/configfs_internal.h
@@ -1,7 +1,5 @@
 /* SPDX-License-Identifier: GPL-2.0-or-later */
-/* -*- mode: c; c-basic-offset:8; -*-
- * vim: noexpandtab sw=8 ts=8 sts=0:
- *
+/*
  * configfs_internal.h - Internal stuff for configfs
  *
  * Based on sysfs:
diff --git a/fs/configfs/dir.c b/fs/configfs/dir.c
index b6098e02e20b..ac5e0c0e9181 100644
--- a/fs/configfs/dir.c
+++ b/fs/configfs/dir.c
@@ -1,7 +1,5 @@
 // SPDX-License-Identifier: GPL-2.0-or-later
-/* -*- mode: c; c-basic-offset: 8; -*-
- * vim: noexpandtab sw=8 ts=8 sts=0:
- *
+/*
  * dir.c - Operations for configfs directories.
  *
  * Based on sysfs:
diff --git a/fs/configfs/file.c b/fs/configfs/file.c
index da8351d1e455..e26060dae70a 100644
--- a/fs/configfs/file.c
+++ b/fs/configfs/file.c
@@ -1,7 +1,5 @@
 // SPDX-License-Identifier: GPL-2.0-or-later
-/* -*- mode: c; c-basic-offset: 8; -*-
- * vim: noexpandtab sw=8 ts=8 sts=0:
- *
+/*
  * file.c - operations for regular (text) files.
  *
  * Based on sysfs:
diff --git a/fs/configfs/inode.c b/fs/configfs/inode.c
index 42c348bb2903..eb5ec3e46283 100644
--- a/fs/configfs/inode.c
+++ b/fs/configfs/inode.c
@@ -1,7 +1,5 @@
 // SPDX-License-Identifier: GPL-2.0-or-later
-/* -*- mode: c; c-basic-offset: 8; -*-
- * vim: noexpandtab sw=8 ts=8 sts=0:
- *
+/*
  * inode.c - basic inode and dentry operations.
  *
  * Based on sysfs:
diff --git a/fs/configfs/item.c b/fs/configfs/item.c
index 704a4356f137..254170a82aa3 100644
--- a/fs/configfs/item.c
+++ b/fs/configfs/item.c
@@ -1,7 +1,5 @@
 // SPDX-License-Identifier: GPL-2.0-or-later
-/* -*- mode: c; c-basic-offset: 8; -*-
- * vim: noexpandtab sw=8 ts=8 sts=0:
- *
+/*
  * item.c - library routines for handling generic config items
  *
  * Based on kobject:
diff --git a/fs/configfs/mount.c b/fs/configfs/mount.c
index 0c6e8cf61953..c2d820063ec4 100644
--- a/fs/configfs/mount.c
+++ b/fs/configfs/mount.c
@@ -1,7 +1,5 @@
 // SPDX-License-Identifier: GPL-2.0-or-later
-/* -*- mode: c; c-basic-offset: 8; -*-
- * vim: noexpandtab sw=8 ts=8 sts=0:
- *
+/*
  * mount.c - operations for initializing and mounting configfs.
  *
  * Based on sysfs:
diff --git a/fs/configfs/symlink.c b/fs/configfs/symlink.c
index 77c854364e60..0623c3edcfb9 100644
--- a/fs/configfs/symlink.c
+++ b/fs/configfs/symlink.c
@@ -1,7 +1,5 @@
 // SPDX-License-Identifier: GPL-2.0-or-later
-/* -*- mode: c; c-basic-offset: 8; -*-
- * vim: noexpandtab sw=8 ts=8 sts=0:
- *
+/*
  * symlink.c - operations for configfs symlinks.
  *
  * Based on sysfs:
diff --git a/fs/nfs/dir.c b/fs/nfs/dir.c
index fc4f490f2d78..3d8e3698d3df 100644
--- a/fs/nfs/dir.c
+++ b/fs/nfs/dir.c
@@ -3004,10 +3004,3 @@ out_notsup:
 	goto out;
 }
 EXPORT_SYMBOL_GPL(nfs_permission);
-
-/*
- * Local variables:
- *  version-control: t
- *  kept-new-versions: 5
- * End:
- */
diff --git a/fs/nfs/nfs4proc.c b/fs/nfs/nfs4proc.c
index c65c4b41e2c1..545010d6cbf3 100644
--- a/fs/nfs/nfs4proc.c
+++ b/fs/nfs/nfs4proc.c
@@ -10427,9 +10427,3 @@ const struct xattr_handler *nfs4_xattr_handlers[] = {
 #endif
 	NULL
 };
-
-/*
- * Local variables:
- *  c-basic-offset: 8
- * End:
- */
diff --git a/fs/nfs/nfs4renewd.c b/fs/nfs/nfs4renewd.c
index ff876dda7f06..db3811af0796 100644
--- a/fs/nfs/nfs4renewd.c
+++ b/fs/nfs/nfs4renewd.c
@@ -149,9 +149,3 @@ void nfs4_set_lease_period(struct nfs_client *clp,
 	/* Cap maximum reconnect timeout at 1/2 lease period */
 	rpc_set_connect_timeout(clp->cl_rpcclient, lease, lease >> 1);
 }
-
-/*
- * Local variables:
- *   c-basic-offset: 8
- * End:
- */
diff --git a/fs/nfs/nfs4state.c b/fs/nfs/nfs4state.c
index 3a51351bdc6a..2eec5bbb55c8 100644
--- a/fs/nfs/nfs4state.c
+++ b/fs/nfs/nfs4state.c
@@ -2695,9 +2695,3 @@ static int nfs4_run_state_manager(void *ptr)
 	module_put_and_exit(0);
 	return 0;
 }
-
-/*
- * Local variables:
- *  c-basic-offset: 8
- * End:
- */
diff --git a/fs/nfs/nfs4xdr.c b/fs/nfs/nfs4xdr.c
index ac6b79ee9355..d4fd3be0e8ca 100644
--- a/fs/nfs/nfs4xdr.c
+++ b/fs/nfs/nfs4xdr.c
@@ -7629,9 +7629,3 @@ const struct rpc_version nfs_version4 = {
 	.procs			= nfs4_procedures,
 	.counts			= nfs_version4_counts,
 };
-
-/*
- * Local variables:
- *  c-basic-offset: 8
- * End:
- */
diff --git a/fs/nfsd/nfs4proc.c b/fs/nfsd/nfs4proc.c
index daf43b980d4b..f4ce93d7f26e 100644
--- a/fs/nfsd/nfs4proc.c
+++ b/fs/nfsd/nfs4proc.c
@@ -3317,9 +3317,3 @@ const struct svc_version nfsd_version4 = {
 	.vs_rpcb_optnl		= true,
 	.vs_need_cong_ctrl	= true,
 };
-
-/*
- * Local variables:
- *  c-basic-offset: 8
- * End:
- */
diff --git a/fs/nfsd/nfs4xdr.c b/fs/nfsd/nfs4xdr.c
index e0f06d3cbd44..7abeccb975b2 100644
--- a/fs/nfsd/nfs4xdr.c
+++ b/fs/nfsd/nfs4xdr.c
@@ -5448,9 +5448,3 @@ nfs4svc_encode_compoundres(struct svc_rqst *rqstp, __be32 *p)
 	nfsd4_sequence_done(resp);
 	return 1;
 }
-
-/*
- * Local variables:
- *  c-basic-offset: 8
- * End:
- */
diff --git a/fs/nfsd/xdr4.h b/fs/nfsd/xdr4.h
index fe540a3415c6..a7c425254fee 100644
--- a/fs/nfsd/xdr4.h
+++ b/fs/nfsd/xdr4.h
@@ -866,9 +866,3 @@ struct nfsd4_operation {
 
 
 #endif
-
-/*
- * Local variables:
- *  c-basic-offset: 8
- * End:
- */
diff --git a/fs/ocfs2/acl.c b/fs/ocfs2/acl.c
index 5259badabb56..5c72a7e6d6c5 100644
--- a/fs/ocfs2/acl.c
+++ b/fs/ocfs2/acl.c
@@ -1,7 +1,5 @@
 // SPDX-License-Identifier: GPL-2.0-only
-/* -*- mode: c; c-basic-offset: 8; -*-
- * vim: noexpandtab sw=8 ts=8 sts=0:
- *
+/*
  * acl.c
  *
  * Copyright (C) 2004, 2008 Oracle.  All rights reserved.
diff --git a/fs/ocfs2/acl.h b/fs/ocfs2/acl.h
index 4e86450917b2..f59d8d0a61fa 100644
--- a/fs/ocfs2/acl.h
+++ b/fs/ocfs2/acl.h
@@ -1,7 +1,5 @@
 /* SPDX-License-Identifier: GPL-2.0-only */
-/* -*- mode: c; c-basic-offset: 8; -*-
- * vim: noexpandtab sw=8 ts=8 sts=0:
- *
+/*
  * acl.h
  *
  * Copyright (C) 2004, 2008 Oracle.  All rights reserved.
diff --git a/fs/ocfs2/alloc.c b/fs/ocfs2/alloc.c
index 78710788c237..e032f2e2c2c5 100644
--- a/fs/ocfs2/alloc.c
+++ b/fs/ocfs2/alloc.c
@@ -1,7 +1,5 @@
 // SPDX-License-Identifier: GPL-2.0-or-later
-/* -*- mode: c; c-basic-offset: 8; -*-
- * vim: noexpandtab sw=8 ts=8 sts=0:
- *
+/*
  * alloc.c
  *
  * Extent allocs and frees
diff --git a/fs/ocfs2/alloc.h b/fs/ocfs2/alloc.h
index 7f973dd76dbc..4af7abaa6e40 100644
--- a/fs/ocfs2/alloc.h
+++ b/fs/ocfs2/alloc.h
@@ -1,7 +1,5 @@
 /* SPDX-License-Identifier: GPL-2.0-or-later */
-/* -*- mode: c; c-basic-offset: 8; -*-
- * vim: noexpandtab sw=8 ts=8 sts=0:
- *
+/*
  * alloc.h
  *
  * Function prototypes
diff --git a/fs/ocfs2/aops.c b/fs/ocfs2/aops.c
index ad20403b383f..1294925ac94a 100644
--- a/fs/ocfs2/aops.c
+++ b/fs/ocfs2/aops.c
@@ -1,7 +1,5 @@
 // SPDX-License-Identifier: GPL-2.0-or-later
-/* -*- mode: c; c-basic-offset: 8; -*-
- * vim: noexpandtab sw=8 ts=8 sts=0:
- *
+/*
  * Copyright (C) 2002, 2004 Oracle.  All rights reserved.
  */
 
diff --git a/fs/ocfs2/aops.h b/fs/ocfs2/aops.h
index 70ed4382750d..3a520117fa59 100644
--- a/fs/ocfs2/aops.h
+++ b/fs/ocfs2/aops.h
@@ -1,7 +1,5 @@
 /* SPDX-License-Identifier: GPL-2.0-or-later */
-/* -*- mode: c; c-basic-offset: 8; -*-
- * vim: noexpandtab sw=8 ts=8 sts=0:
- *
+/*
  * Copyright (C) 2002, 2004, 2005 Oracle.  All rights reserved.
  */
 
diff --git a/fs/ocfs2/blockcheck.c b/fs/ocfs2/blockcheck.c
index dabfef9c2bc0..863a5316030b 100644
--- a/fs/ocfs2/blockcheck.c
+++ b/fs/ocfs2/blockcheck.c
@@ -1,7 +1,5 @@
 // SPDX-License-Identifier: GPL-2.0-only
-/* -*- mode: c; c-basic-offset: 8; -*-
- * vim: noexpandtab sw=8 ts=8 sts=0:
- *
+/*
  * blockcheck.c
  *
  * Checksum and ECC codes for the OCFS2 userspace library.
diff --git a/fs/ocfs2/blockcheck.h b/fs/ocfs2/blockcheck.h
index 8f17d2c85f40..d0578e98ee8d 100644
--- a/fs/ocfs2/blockcheck.h
+++ b/fs/ocfs2/blockcheck.h
@@ -1,7 +1,5 @@
 /* SPDX-License-Identifier: GPL-2.0-only */
-/* -*- mode: c; c-basic-offset: 8; -*-
- * vim: noexpandtab sw=8 ts=8 sts=0:
- *
+/*
  * blockcheck.h
  *
  * Checksum and ECC codes for the OCFS2 userspace library.
diff --git a/fs/ocfs2/buffer_head_io.c b/fs/ocfs2/buffer_head_io.c
index f0b104e483d8..e7758778abef 100644
--- a/fs/ocfs2/buffer_head_io.c
+++ b/fs/ocfs2/buffer_head_io.c
@@ -1,7 +1,5 @@
 // SPDX-License-Identifier: GPL-2.0-or-later
-/* -*- mode: c; c-basic-offset: 8; -*-
- * vim: noexpandtab sw=8 ts=8 sts=0:
- *
+/*
  * io.c
  *
  * Buffer cache handling
diff --git a/fs/ocfs2/buffer_head_io.h b/fs/ocfs2/buffer_head_io.h
index 1c5e533fba04..2d51649fc090 100644
--- a/fs/ocfs2/buffer_head_io.h
+++ b/fs/ocfs2/buffer_head_io.h
@@ -1,7 +1,5 @@
 /* SPDX-License-Identifier: GPL-2.0-or-later */
-/* -*- mode: c; c-basic-offset: 8; -*-
- * vim: noexpandtab sw=8 ts=8 sts=0:
- *
+/*
  * ocfs2_buffer_head.h
  *
  * Buffer cache handling functions defined
diff --git a/fs/ocfs2/cluster/heartbeat.c b/fs/ocfs2/cluster/heartbeat.c
index 12a7590601dd..e829c2595543 100644
--- a/fs/ocfs2/cluster/heartbeat.c
+++ b/fs/ocfs2/cluster/heartbeat.c
@@ -1,7 +1,5 @@
 // SPDX-License-Identifier: GPL-2.0-or-later
-/* -*- mode: c; c-basic-offset: 8; -*-
- * vim: noexpandtab sw=8 ts=8 sts=0:
- *
+/*
  * Copyright (C) 2004, 2005 Oracle.  All rights reserved.
  */
 
diff --git a/fs/ocfs2/cluster/heartbeat.h b/fs/ocfs2/cluster/heartbeat.h
index beed31ea86cf..1d4100abf6f8 100644
--- a/fs/ocfs2/cluster/heartbeat.h
+++ b/fs/ocfs2/cluster/heartbeat.h
@@ -1,7 +1,5 @@
 /* SPDX-License-Identifier: GPL-2.0-or-later */
-/* -*- mode: c; c-basic-offset: 8; -*-
- * vim: noexpandtab sw=8 ts=8 sts=0:
- *
+/*
  * heartbeat.h
  *
  * Function prototypes
diff --git a/fs/ocfs2/cluster/masklog.c b/fs/ocfs2/cluster/masklog.c
index 1d696c96b8b2..810d32815593 100644
--- a/fs/ocfs2/cluster/masklog.c
+++ b/fs/ocfs2/cluster/masklog.c
@@ -1,7 +1,5 @@
 // SPDX-License-Identifier: GPL-2.0-or-later
-/* -*- mode: c; c-basic-offset: 8; -*-
- * vim: noexpandtab sw=8 ts=8 sts=0:
- *
+/*
  * Copyright (C) 2004, 2005 Oracle.  All rights reserved.
  */
 
diff --git a/fs/ocfs2/cluster/masklog.h b/fs/ocfs2/cluster/masklog.h
index 446e452ac7a6..b73fc42e46ff 100644
--- a/fs/ocfs2/cluster/masklog.h
+++ b/fs/ocfs2/cluster/masklog.h
@@ -1,7 +1,5 @@
 /* SPDX-License-Identifier: GPL-2.0-or-later */
-/* -*- mode: c; c-basic-offset: 8; -*-
- * vim: noexpandtab sw=8 ts=8 sts=0:
- *
+/*
  * Copyright (C) 2005 Oracle.  All rights reserved.
  */
 
diff --git a/fs/ocfs2/cluster/netdebug.c b/fs/ocfs2/cluster/netdebug.c
index 667a5c5e1f66..7524994e3199 100644
--- a/fs/ocfs2/cluster/netdebug.c
+++ b/fs/ocfs2/cluster/netdebug.c
@@ -1,7 +1,5 @@
 // SPDX-License-Identifier: GPL-2.0-or-later
-/* -*- mode: c; c-basic-offset: 8; -*-
- * vim: noexpandtab sw=8 ts=8 sts=0:
- *
+/*
  * netdebug.c
  *
  * debug functionality for o2net
diff --git a/fs/ocfs2/cluster/nodemanager.c b/fs/ocfs2/cluster/nodemanager.c
index 7a7640c59f3c..bb82e6b1ff4e 100644
--- a/fs/ocfs2/cluster/nodemanager.c
+++ b/fs/ocfs2/cluster/nodemanager.c
@@ -1,7 +1,5 @@
 // SPDX-License-Identifier: GPL-2.0-or-later
-/* -*- mode: c; c-basic-offset: 8; -*-
- * vim: noexpandtab sw=8 ts=8 sts=0:
- *
+/*
  * Copyright (C) 2004, 2005 Oracle.  All rights reserved.
  */
 
diff --git a/fs/ocfs2/cluster/nodemanager.h b/fs/ocfs2/cluster/nodemanager.h
index 3e0006631cc4..3490e77a952d 100644
--- a/fs/ocfs2/cluster/nodemanager.h
+++ b/fs/ocfs2/cluster/nodemanager.h
@@ -1,7 +1,5 @@
 /* SPDX-License-Identifier: GPL-2.0-or-later */
-/* -*- mode: c; c-basic-offset: 8; -*-
- * vim: noexpandtab sw=8 ts=8 sts=0:
- *
+/*
  * nodemanager.h
  *
  * Function prototypes
diff --git a/fs/ocfs2/cluster/ocfs2_heartbeat.h b/fs/ocfs2/cluster/ocfs2_heartbeat.h
index 760d850be11e..6088c9f974dd 100644
--- a/fs/ocfs2/cluster/ocfs2_heartbeat.h
+++ b/fs/ocfs2/cluster/ocfs2_heartbeat.h
@@ -1,7 +1,5 @@
 /* SPDX-License-Identifier: GPL-2.0-or-later */
-/* -*- mode: c; c-basic-offset: 8; -*-
- * vim: noexpandtab sw=8 ts=8 sts=0:
- *
+/*
  * ocfs2_heartbeat.h
  *
  * On-disk structures for ocfs2_heartbeat
diff --git a/fs/ocfs2/cluster/ocfs2_nodemanager.h b/fs/ocfs2/cluster/ocfs2_nodemanager.h
index 21ad307419a8..c9a0b77443e7 100644
--- a/fs/ocfs2/cluster/ocfs2_nodemanager.h
+++ b/fs/ocfs2/cluster/ocfs2_nodemanager.h
@@ -1,7 +1,5 @@
 /* SPDX-License-Identifier: GPL-2.0-or-later */
-/* -*- mode: c; c-basic-offset: 8; -*-
- * vim: noexpandtab sw=8 ts=8 sts=0:
- *
+/*
  * ocfs2_nodemanager.h
  *
  * Header describing the interface between userspace and the kernel
diff --git a/fs/ocfs2/cluster/quorum.c b/fs/ocfs2/cluster/quorum.c
index cea739be77c4..189c111bc371 100644
--- a/fs/ocfs2/cluster/quorum.c
+++ b/fs/ocfs2/cluster/quorum.c
@@ -1,7 +1,5 @@
 // SPDX-License-Identifier: GPL-2.0-or-later
-/* -*- mode: c; c-basic-offset: 8; -*-
- *
- * vim: noexpandtab sw=8 ts=8 sts=0:
+/*
  *
  * Copyright (C) 2005 Oracle.  All rights reserved.
  */
diff --git a/fs/ocfs2/cluster/quorum.h b/fs/ocfs2/cluster/quorum.h
index 6d45ce8b18a1..d64bf4482a4a 100644
--- a/fs/ocfs2/cluster/quorum.h
+++ b/fs/ocfs2/cluster/quorum.h
@@ -1,7 +1,5 @@
 /* SPDX-License-Identifier: GPL-2.0-or-later */
-/* -*- mode: c; c-basic-offset: 8; -*-
- * vim: noexpandtab sw=8 ts=8 sts=0:
- *
+/*
  * Copyright (C) 2005 Oracle.  All rights reserved.
  */
 
diff --git a/fs/ocfs2/cluster/sys.c b/fs/ocfs2/cluster/sys.c
index d6067c3d84c1..022f716c74ff 100644
--- a/fs/ocfs2/cluster/sys.c
+++ b/fs/ocfs2/cluster/sys.c
@@ -1,7 +1,5 @@
 // SPDX-License-Identifier: GPL-2.0-only
-/* -*- mode: c; c-basic-offset: 8; -*-
- * vim: noexpandtab sw=8 ts=8 sts=0:
- *
+/*
  * sys.c
  *
  * OCFS2 cluster sysfs interface
diff --git a/fs/ocfs2/cluster/sys.h b/fs/ocfs2/cluster/sys.h
index ce380517cf17..70aaba65317e 100644
--- a/fs/ocfs2/cluster/sys.h
+++ b/fs/ocfs2/cluster/sys.h
@@ -1,7 +1,5 @@
 /* SPDX-License-Identifier: GPL-2.0-only */
-/* -*- mode: c; c-basic-offset: 8; -*-
- * vim: noexpandtab sw=8 ts=8 sts=0:
- *
+/*
  * sys.h
  *
  * Function prototypes for o2cb sysfs interface
diff --git a/fs/ocfs2/cluster/tcp.c b/fs/ocfs2/cluster/tcp.c
index 3bd8119bed5e..f660c0dbdb63 100644
--- a/fs/ocfs2/cluster/tcp.c
+++ b/fs/ocfs2/cluster/tcp.c
@@ -1,7 +1,5 @@
 // SPDX-License-Identifier: GPL-2.0-or-later
-/* -*- mode: c; c-basic-offset: 8; -*-
- *
- * vim: noexpandtab sw=8 ts=8 sts=0:
+/*
  *
  * Copyright (C) 2004 Oracle.  All rights reserved.
  *
diff --git a/fs/ocfs2/cluster/tcp.h b/fs/ocfs2/cluster/tcp.h
index 736338f45c59..a75b551d31c7 100644
--- a/fs/ocfs2/cluster/tcp.h
+++ b/fs/ocfs2/cluster/tcp.h
@@ -1,7 +1,5 @@
 /* SPDX-License-Identifier: GPL-2.0-or-later */
-/* -*- mode: c; c-basic-offset: 8; -*-
- * vim: noexpandtab sw=8 ts=8 sts=0:
- *
+/*
  * tcp.h
  *
  * Function prototypes
diff --git a/fs/ocfs2/cluster/tcp_internal.h b/fs/ocfs2/cluster/tcp_internal.h
index e6a2b9dfcd16..601c99bd2611 100644
--- a/fs/ocfs2/cluster/tcp_internal.h
+++ b/fs/ocfs2/cluster/tcp_internal.h
@@ -1,7 +1,5 @@
 /* SPDX-License-Identifier: GPL-2.0-or-later */
-/* -*- mode: c; c-basic-offset: 8; -*-
- * vim: noexpandtab sw=8 ts=8 sts=0:
- *
+/*
  * Copyright (C) 2005 Oracle.  All rights reserved.
  */
 
diff --git a/fs/ocfs2/dcache.c b/fs/ocfs2/dcache.c
index 42a61eecdacd..04fc8344063a 100644
--- a/fs/ocfs2/dcache.c
+++ b/fs/ocfs2/dcache.c
@@ -1,7 +1,5 @@
 // SPDX-License-Identifier: GPL-2.0-or-later
-/* -*- mode: c; c-basic-offset: 8; -*-
- * vim: noexpandtab sw=8 ts=8 sts=0:
- *
+/*
  * dcache.c
  *
  * dentry cache handling code
diff --git a/fs/ocfs2/dcache.h b/fs/ocfs2/dcache.h
index 3686a52ba143..7f246c5692d8 100644
--- a/fs/ocfs2/dcache.h
+++ b/fs/ocfs2/dcache.h
@@ -1,7 +1,5 @@
 /* SPDX-License-Identifier: GPL-2.0-or-later */
-/* -*- mode: c; c-basic-offset: 8; -*-
- * vim: noexpandtab sw=8 ts=8 sts=0:
- *
+/*
  * dcache.h
  *
  * Function prototypes
diff --git a/fs/ocfs2/dir.c b/fs/ocfs2/dir.c
index bdfba9db558a..bd8d534f11cb 100644
--- a/fs/ocfs2/dir.c
+++ b/fs/ocfs2/dir.c
@@ -1,7 +1,5 @@
 // SPDX-License-Identifier: GPL-2.0-or-later
-/* -*- mode: c; c-basic-offset: 8; -*-
- * vim: noexpandtab sw=8 ts=8 sts=0:
- *
+/*
  * dir.c
  *
  * Creates, reads, walks and deletes directory-nodes
diff --git a/fs/ocfs2/dir.h b/fs/ocfs2/dir.h
index e3e7d5dd29e8..4b9f5a12c7d2 100644
--- a/fs/ocfs2/dir.h
+++ b/fs/ocfs2/dir.h
@@ -1,7 +1,5 @@
 /* SPDX-License-Identifier: GPL-2.0-or-later */
-/* -*- mode: c; c-basic-offset: 8; -*-
- * vim: noexpandtab sw=8 ts=8 sts=0:
- *
+/*
  * dir.h
  *
  * Function prototypes
diff --git a/fs/ocfs2/dlm/dlmapi.h b/fs/ocfs2/dlm/dlmapi.h
index 6456c0fbcbb2..bae60ca2672a 100644
--- a/fs/ocfs2/dlm/dlmapi.h
+++ b/fs/ocfs2/dlm/dlmapi.h
@@ -1,7 +1,5 @@
 /* SPDX-License-Identifier: GPL-2.0-or-later */
-/* -*- mode: c; c-basic-offset: 8; -*-
- * vim: noexpandtab sw=8 ts=8 sts=0:
- *
+/*
  * dlmapi.h
  *
  * externally exported dlm interfaces
diff --git a/fs/ocfs2/dlm/dlmast.c b/fs/ocfs2/dlm/dlmast.c
index 70a10764f249..c681ba957932 100644
--- a/fs/ocfs2/dlm/dlmast.c
+++ b/fs/ocfs2/dlm/dlmast.c
@@ -1,7 +1,5 @@
 // SPDX-License-Identifier: GPL-2.0-or-later
-/* -*- mode: c; c-basic-offset: 8; -*-
- * vim: noexpandtab sw=8 ts=8 sts=0:
- *
+/*
  * dlmast.c
  *
  * AST and BAST functionality for local and remote nodes
diff --git a/fs/ocfs2/dlm/dlmcommon.h b/fs/ocfs2/dlm/dlmcommon.h
index 58d57e25d384..fd2022712167 100644
--- a/fs/ocfs2/dlm/dlmcommon.h
+++ b/fs/ocfs2/dlm/dlmcommon.h
@@ -1,7 +1,5 @@
 /* SPDX-License-Identifier: GPL-2.0-or-later */
-/* -*- mode: c; c-basic-offset: 8; -*-
- * vim: noexpandtab sw=8 ts=8 sts=0:
- *
+/*
  * dlmcommon.h
  *
  * Copyright (C) 2004 Oracle.  All rights reserved.
diff --git a/fs/ocfs2/dlm/dlmconvert.c b/fs/ocfs2/dlm/dlmconvert.c
index 6051edc33aef..450d46eefab3 100644
--- a/fs/ocfs2/dlm/dlmconvert.c
+++ b/fs/ocfs2/dlm/dlmconvert.c
@@ -1,7 +1,5 @@
 // SPDX-License-Identifier: GPL-2.0-or-later
-/* -*- mode: c; c-basic-offset: 8; -*-
- * vim: noexpandtab sw=8 ts=8 sts=0:
- *
+/*
  * dlmconvert.c
  *
  * underlying calls for lock conversion
diff --git a/fs/ocfs2/dlm/dlmconvert.h b/fs/ocfs2/dlm/dlmconvert.h
index 12d9c28bc52f..1f371716513b 100644
--- a/fs/ocfs2/dlm/dlmconvert.h
+++ b/fs/ocfs2/dlm/dlmconvert.h
@@ -1,7 +1,5 @@
 /* SPDX-License-Identifier: GPL-2.0-or-later */
-/* -*- mode: c; c-basic-offset: 8; -*-
- * vim: noexpandtab sw=8 ts=8 sts=0:
- *
+/*
  * dlmconvert.h
  *
  * Copyright (C) 2004 Oracle.  All rights reserved.
diff --git a/fs/ocfs2/dlm/dlmdebug.c b/fs/ocfs2/dlm/dlmdebug.c
index 4b8b41d23e91..d442cf5dda8a 100644
--- a/fs/ocfs2/dlm/dlmdebug.c
+++ b/fs/ocfs2/dlm/dlmdebug.c
@@ -1,7 +1,5 @@
 // SPDX-License-Identifier: GPL-2.0-or-later
-/* -*- mode: c; c-basic-offset: 8; -*-
- * vim: noexpandtab sw=8 ts=8 sts=0:
- *
+/*
  * dlmdebug.c
  *
  * debug functionality for the dlm
diff --git a/fs/ocfs2/dlm/dlmdebug.h b/fs/ocfs2/dlm/dlmdebug.h
index f8fd8680a4b6..e08f7357e7ec 100644
--- a/fs/ocfs2/dlm/dlmdebug.h
+++ b/fs/ocfs2/dlm/dlmdebug.h
@@ -1,7 +1,5 @@
 /* SPDX-License-Identifier: GPL-2.0-or-later */
-/* -*- mode: c; c-basic-offset: 8; -*-
- * vim: noexpandtab sw=8 ts=8 sts=0:
- *
+/*
  * dlmdebug.h
  *
  * Copyright (C) 2008 Oracle.  All rights reserved.
diff --git a/fs/ocfs2/dlm/dlmdomain.c b/fs/ocfs2/dlm/dlmdomain.c
index 357cfc702ce3..9f90fc9551e1 100644
--- a/fs/ocfs2/dlm/dlmdomain.c
+++ b/fs/ocfs2/dlm/dlmdomain.c
@@ -1,7 +1,5 @@
 // SPDX-License-Identifier: GPL-2.0-or-later
-/* -*- mode: c; c-basic-offset: 8; -*-
- * vim: noexpandtab sw=8 ts=8 sts=0:
- *
+/*
  * dlmdomain.c
  *
  * defines domain join / leave apis
diff --git a/fs/ocfs2/dlm/dlmdomain.h b/fs/ocfs2/dlm/dlmdomain.h
index 7c21664d23d0..815abe30ad09 100644
--- a/fs/ocfs2/dlm/dlmdomain.h
+++ b/fs/ocfs2/dlm/dlmdomain.h
@@ -1,7 +1,5 @@
 /* SPDX-License-Identifier: GPL-2.0-or-later */
-/* -*- mode: c; c-basic-offset: 8; -*-
- * vim: noexpandtab sw=8 ts=8 sts=0:
- *
+/*
  * dlmdomain.h
  *
  * Copyright (C) 2004 Oracle.  All rights reserved.
diff --git a/fs/ocfs2/dlm/dlmlock.c b/fs/ocfs2/dlm/dlmlock.c
index 83f0760e4fba..041fd1791ae7 100644
--- a/fs/ocfs2/dlm/dlmlock.c
+++ b/fs/ocfs2/dlm/dlmlock.c
@@ -1,7 +1,5 @@
 // SPDX-License-Identifier: GPL-2.0-or-later
-/* -*- mode: c; c-basic-offset: 8; -*-
- * vim: noexpandtab sw=8 ts=8 sts=0:
- *
+/*
  * dlmlock.c
  *
  * underlying calls for lock creation
diff --git a/fs/ocfs2/dlm/dlmmaster.c b/fs/ocfs2/dlm/dlmmaster.c
index f105746063ed..4960a6de768d 100644
--- a/fs/ocfs2/dlm/dlmmaster.c
+++ b/fs/ocfs2/dlm/dlmmaster.c
@@ -1,7 +1,5 @@
 // SPDX-License-Identifier: GPL-2.0-or-later
-/* -*- mode: c; c-basic-offset: 8; -*-
- * vim: noexpandtab sw=8 ts=8 sts=0:
- *
+/*
  * dlmmod.c
  *
  * standalone DLM module
diff --git a/fs/ocfs2/dlm/dlmrecovery.c b/fs/ocfs2/dlm/dlmrecovery.c
index afc51736686c..0e7aad1b11cc 100644
--- a/fs/ocfs2/dlm/dlmrecovery.c
+++ b/fs/ocfs2/dlm/dlmrecovery.c
@@ -1,7 +1,5 @@
 // SPDX-License-Identifier: GPL-2.0-or-later
-/* -*- mode: c; c-basic-offset: 8; -*-
- * vim: noexpandtab sw=8 ts=8 sts=0:
- *
+/*
  * dlmrecovery.c
  *
  * recovery stuff
diff --git a/fs/ocfs2/dlm/dlmthread.c b/fs/ocfs2/dlm/dlmthread.c
index 5ccc4ff0b82a..c350bd4df770 100644
--- a/fs/ocfs2/dlm/dlmthread.c
+++ b/fs/ocfs2/dlm/dlmthread.c
@@ -1,7 +1,5 @@
 // SPDX-License-Identifier: GPL-2.0-or-later
-/* -*- mode: c; c-basic-offset: 8; -*-
- * vim: noexpandtab sw=8 ts=8 sts=0:
- *
+/*
  * dlmthread.c
  *
  * standalone DLM module
diff --git a/fs/ocfs2/dlm/dlmunlock.c b/fs/ocfs2/dlm/dlmunlock.c
index dcb17ca8ae74..61103b2d69fb 100644
--- a/fs/ocfs2/dlm/dlmunlock.c
+++ b/fs/ocfs2/dlm/dlmunlock.c
@@ -1,7 +1,5 @@
 // SPDX-License-Identifier: GPL-2.0-or-later
-/* -*- mode: c; c-basic-offset: 8; -*-
- * vim: noexpandtab sw=8 ts=8 sts=0:
- *
+/*
  * dlmunlock.c
  *
  * underlying calls for unlocking locks
diff --git a/fs/ocfs2/dlmfs/dlmfs.c b/fs/ocfs2/dlmfs/dlmfs.c
index b2870f1a31df..fa0a14f199eb 100644
--- a/fs/ocfs2/dlmfs/dlmfs.c
+++ b/fs/ocfs2/dlmfs/dlmfs.c
@@ -1,7 +1,5 @@
 // SPDX-License-Identifier: GPL-2.0-or-later
-/* -*- mode: c; c-basic-offset: 8; -*-
- * vim: noexpandtab sw=8 ts=8 sts=0:
- *
+/*
  * dlmfs.c
  *
  * Code which implements the kernel side of a minimal userspace
diff --git a/fs/ocfs2/dlmfs/userdlm.c b/fs/ocfs2/dlmfs/userdlm.c
index 339f098d9592..29f183a15798 100644
--- a/fs/ocfs2/dlmfs/userdlm.c
+++ b/fs/ocfs2/dlmfs/userdlm.c
@@ -1,7 +1,5 @@
 // SPDX-License-Identifier: GPL-2.0-or-later
-/* -*- mode: c; c-basic-offset: 8; -*-
- * vim: noexpandtab sw=8 ts=8 sts=0:
- *
+/*
  * userdlm.c
  *
  * Code which implements the kernel side of a minimal userspace
diff --git a/fs/ocfs2/dlmfs/userdlm.h b/fs/ocfs2/dlmfs/userdlm.h
index 0558ae768200..47ba18eac423 100644
--- a/fs/ocfs2/dlmfs/userdlm.h
+++ b/fs/ocfs2/dlmfs/userdlm.h
@@ -1,7 +1,5 @@
 /* SPDX-License-Identifier: GPL-2.0-or-later */
-/* -*- mode: c; c-basic-offset: 8; -*-
- * vim: noexpandtab sw=8 ts=8 sts=0:
- *
+/*
  * userdlm.h
  *
  * Userspace dlm defines
diff --git a/fs/ocfs2/dlmglue.c b/fs/ocfs2/dlmglue.c
index 0fbe8bf7190f..48fd369c29a4 100644
--- a/fs/ocfs2/dlmglue.c
+++ b/fs/ocfs2/dlmglue.c
@@ -1,7 +1,5 @@
 // SPDX-License-Identifier: GPL-2.0-or-later
-/* -*- mode: c; c-basic-offset: 8; -*-
- * vim: noexpandtab sw=8 ts=8 sts=0:
- *
+/*
  * dlmglue.c
  *
  * Code which implements an OCFS2 specific interface to our DLM.
diff --git a/fs/ocfs2/dlmglue.h b/fs/ocfs2/dlmglue.h
index b8fbed25df89..e5da5809ed95 100644
--- a/fs/ocfs2/dlmglue.h
+++ b/fs/ocfs2/dlmglue.h
@@ -1,7 +1,5 @@
 /* SPDX-License-Identifier: GPL-2.0-or-later */
-/* -*- mode: c; c-basic-offset: 8; -*-
- * vim: noexpandtab sw=8 ts=8 sts=0:
- *
+/*
  * dlmglue.h
  *
  * description here
diff --git a/fs/ocfs2/export.c b/fs/ocfs2/export.c
index 69ed278dd84d..eaa8c80ace3c 100644
--- a/fs/ocfs2/export.c
+++ b/fs/ocfs2/export.c
@@ -1,7 +1,5 @@
 // SPDX-License-Identifier: GPL-2.0-or-later
-/* -*- mode: c; c-basic-offset: 8; -*-
- * vim: noexpandtab sw=8 ts=8 sts=0:
- *
+/*
  * export.c
  *
  * Functions to facilitate NFS exporting
diff --git a/fs/ocfs2/export.h b/fs/ocfs2/export.h
index d485da0c3439..636357400505 100644
--- a/fs/ocfs2/export.h
+++ b/fs/ocfs2/export.h
@@ -1,7 +1,5 @@
 /* SPDX-License-Identifier: GPL-2.0-or-later */
-/* -*- mode: c; c-basic-offset: 8; -*-
- * vim: noexpandtab sw=8 ts=8 sts=0:
- *
+/*
  * export.h
  *
  * Function prototypes
diff --git a/fs/ocfs2/extent_map.c b/fs/ocfs2/extent_map.c
index 7b93e9c766f6..70a768b623cf 100644
--- a/fs/ocfs2/extent_map.c
+++ b/fs/ocfs2/extent_map.c
@@ -1,7 +1,5 @@
 // SPDX-License-Identifier: GPL-2.0-only
-/* -*- mode: c; c-basic-offset: 8; -*-
- * vim: noexpandtab sw=8 ts=8 sts=0:
- *
+/*
  * extent_map.c
  *
  * Block/Cluster mapping functions
diff --git a/fs/ocfs2/extent_map.h b/fs/ocfs2/extent_map.h
index e5464f6cee8a..bc4ed59fb925 100644
--- a/fs/ocfs2/extent_map.h
+++ b/fs/ocfs2/extent_map.h
@@ -1,7 +1,5 @@
 /* SPDX-License-Identifier: GPL-2.0-only */
-/* -*- mode: c; c-basic-offset: 8; -*-
- * vim: noexpandtab sw=8 ts=8 sts=0:
- *
+/*
  * extent_map.h
  *
  * In-memory file extent mappings for OCFS2.
diff --git a/fs/ocfs2/file.c b/fs/ocfs2/file.c
index db8a6265b749..f17c3d33fb18 100644
--- a/fs/ocfs2/file.c
+++ b/fs/ocfs2/file.c
@@ -1,7 +1,5 @@
 // SPDX-License-Identifier: GPL-2.0-or-later
-/* -*- mode: c; c-basic-offset: 8; -*-
- * vim: noexpandtab sw=8 ts=8 sts=0:
- *
+/*
  * file.c
  *
  * File open, close, extend, truncate
diff --git a/fs/ocfs2/file.h b/fs/ocfs2/file.h
index 8536cec5f122..71db8f3aa027 100644
--- a/fs/ocfs2/file.h
+++ b/fs/ocfs2/file.h
@@ -1,7 +1,5 @@
 /* SPDX-License-Identifier: GPL-2.0-or-later */
-/* -*- mode: c; c-basic-offset: 8; -*-
- * vim: noexpandtab sw=8 ts=8 sts=0:
- *
+/*
  * file.h
  *
  * Function prototypes
diff --git a/fs/ocfs2/filecheck.c b/fs/ocfs2/filecheck.c
index 50f11bfdc8c2..90b8d300c1ee 100644
--- a/fs/ocfs2/filecheck.c
+++ b/fs/ocfs2/filecheck.c
@@ -1,7 +1,5 @@
 // SPDX-License-Identifier: GPL-2.0-only
-/* -*- mode: c; c-basic-offset: 8; -*-
- * vim: noexpandtab sw=8 ts=8 sts=0:
- *
+/*
  * filecheck.c
  *
  * Code which implements online file check.
diff --git a/fs/ocfs2/filecheck.h b/fs/ocfs2/filecheck.h
index 4d006777ac54..d3bcb8bcfeb0 100644
--- a/fs/ocfs2/filecheck.h
+++ b/fs/ocfs2/filecheck.h
@@ -1,7 +1,5 @@
 /* SPDX-License-Identifier: GPL-2.0-only */
-/* -*- mode: c; c-basic-offset: 8; -*-
- * vim: noexpandtab sw=8 ts=8 sts=0:
- *
+/*
  * filecheck.h
  *
  * Online file check.
diff --git a/fs/ocfs2/heartbeat.c b/fs/ocfs2/heartbeat.c
index 60c5f995d30c..9099d8fc7599 100644
--- a/fs/ocfs2/heartbeat.c
+++ b/fs/ocfs2/heartbeat.c
@@ -1,7 +1,5 @@
 // SPDX-License-Identifier: GPL-2.0-or-later
-/* -*- mode: c; c-basic-offset: 8; -*-
- * vim: noexpandtab sw=8 ts=8 sts=0:
- *
+/*
  * heartbeat.c
  *
  * Register ourselves with the heartbaet service, keep our node maps
diff --git a/fs/ocfs2/heartbeat.h b/fs/ocfs2/heartbeat.h
index 5fedb2d35dc0..f1f8b1802fe4 100644
--- a/fs/ocfs2/heartbeat.h
+++ b/fs/ocfs2/heartbeat.h
@@ -1,7 +1,5 @@
 /* SPDX-License-Identifier: GPL-2.0-or-later */
-/* -*- mode: c; c-basic-offset: 8; -*-
- * vim: noexpandtab sw=8 ts=8 sts=0:
- *
+/*
  * heartbeat.h
  *
  * Function prototypes
diff --git a/fs/ocfs2/inode.c b/fs/ocfs2/inode.c
index 7c9dfd50c1c1..bc8f32fab964 100644
--- a/fs/ocfs2/inode.c
+++ b/fs/ocfs2/inode.c
@@ -1,7 +1,5 @@
 // SPDX-License-Identifier: GPL-2.0-or-later
-/* -*- mode: c; c-basic-offset: 8; -*-
- * vim: noexpandtab sw=8 ts=8 sts=0:
- *
+/*
  * inode.c
  *
  * vfs' aops, fops, dops and iops
diff --git a/fs/ocfs2/inode.h b/fs/ocfs2/inode.h
index 51a4f7197987..82b28fdacc7e 100644
--- a/fs/ocfs2/inode.h
+++ b/fs/ocfs2/inode.h
@@ -1,7 +1,5 @@
 /* SPDX-License-Identifier: GPL-2.0-or-later */
-/* -*- mode: c; c-basic-offset: 8; -*-
- * vim: noexpandtab sw=8 ts=8 sts=0:
- *
+/*
  * inode.h
  *
  * Function prototypes
diff --git a/fs/ocfs2/journal.c b/fs/ocfs2/journal.c
index db52e843002a..4e589ce2fce6 100644
--- a/fs/ocfs2/journal.c
+++ b/fs/ocfs2/journal.c
@@ -1,7 +1,5 @@
 // SPDX-License-Identifier: GPL-2.0-or-later
-/* -*- mode: c; c-basic-offset: 8; -*-
- * vim: noexpandtab sw=8 ts=8 sts=0:
- *
+/*
  * journal.c
  *
  * Defines functions of journalling api
diff --git a/fs/ocfs2/journal.h b/fs/ocfs2/journal.h
index bfe611ed1b1d..d158acb8b38a 100644
--- a/fs/ocfs2/journal.h
+++ b/fs/ocfs2/journal.h
@@ -1,7 +1,5 @@
 /* SPDX-License-Identifier: GPL-2.0-or-later */
-/* -*- mode: c; c-basic-offset: 8; -*-
- * vim: noexpandtab sw=8 ts=8 sts=0:
- *
+/*
  * journal.h
  *
  * Defines journalling api and structures.
diff --git a/fs/ocfs2/localalloc.c b/fs/ocfs2/localalloc.c
index fc8252a28cb1..5f6bacbeef6b 100644
--- a/fs/ocfs2/localalloc.c
+++ b/fs/ocfs2/localalloc.c
@@ -1,7 +1,5 @@
 // SPDX-License-Identifier: GPL-2.0-or-later
-/* -*- mode: c; c-basic-offset: 8; -*-
- * vim: noexpandtab sw=8 ts=8 sts=0:
- *
+/*
  * localalloc.c
  *
  * Node local data allocation
diff --git a/fs/ocfs2/localalloc.h b/fs/ocfs2/localalloc.h
index e8a5cea48639..08f925b7ec6d 100644
--- a/fs/ocfs2/localalloc.h
+++ b/fs/ocfs2/localalloc.h
@@ -1,7 +1,5 @@
 /* SPDX-License-Identifier: GPL-2.0-or-later */
-/* -*- mode: c; c-basic-offset: 8; -*-
- * vim: noexpandtab sw=8 ts=8 sts=0:
- *
+/*
  * localalloc.h
  *
  * Function prototypes
diff --git a/fs/ocfs2/locks.c b/fs/ocfs2/locks.c
index 7edc4e5c7c2c..fab7c6a4a7d0 100644
--- a/fs/ocfs2/locks.c
+++ b/fs/ocfs2/locks.c
@@ -1,7 +1,5 @@
 // SPDX-License-Identifier: GPL-2.0-or-later
-/* -*- mode: c; c-basic-offset: 8; -*-
- * vim: noexpandtab sw=8 ts=8 sts=0:
- *
+/*
  * locks.c
  *
  * Userspace file locking support
diff --git a/fs/ocfs2/locks.h b/fs/ocfs2/locks.h
index 389fe1fce3a5..b52de3947d5f 100644
--- a/fs/ocfs2/locks.h
+++ b/fs/ocfs2/locks.h
@@ -1,7 +1,5 @@
 /* SPDX-License-Identifier: GPL-2.0-or-later */
-/* -*- mode: c; c-basic-offset: 8; -*-
- * vim: noexpandtab sw=8 ts=8 sts=0:
- *
+/*
  * locks.h
  *
  * Function prototypes for Userspace file locking support
diff --git a/fs/ocfs2/mmap.c b/fs/ocfs2/mmap.c
index 25cabbfe87fc..1834f26522ed 100644
--- a/fs/ocfs2/mmap.c
+++ b/fs/ocfs2/mmap.c
@@ -1,7 +1,5 @@
 // SPDX-License-Identifier: GPL-2.0-or-later
-/* -*- mode: c; c-basic-offset: 8; -*-
- * vim: noexpandtab sw=8 ts=8 sts=0:
- *
+/*
  * mmap.c
  *
  * Code to deal with the mess that is clustered mmap.
diff --git a/fs/ocfs2/move_extents.c b/fs/ocfs2/move_extents.c
index 758d9661ef1e..192cad0662d8 100644
--- a/fs/ocfs2/move_extents.c
+++ b/fs/ocfs2/move_extents.c
@@ -1,7 +1,5 @@
 // SPDX-License-Identifier: GPL-2.0-only
-/* -*- mode: c; c-basic-offset: 8; -*-
- * vim: noexpandtab sw=8 ts=8 sts=0:
- *
+/*
  * move_extents.c
  *
  * Copyright (C) 2011 Oracle.  All rights reserved.
diff --git a/fs/ocfs2/move_extents.h b/fs/ocfs2/move_extents.h
index 28cac43892c5..987f9e559f30 100644
--- a/fs/ocfs2/move_extents.h
+++ b/fs/ocfs2/move_extents.h
@@ -1,7 +1,5 @@
 /* SPDX-License-Identifier: GPL-2.0-only */
-/* -*- mode: c; c-basic-offset: 8; -*-
- * vim: noexpandtab sw=8 ts=8 sts=0:
- *
+/*
  * move_extents.h
  *
  * Copyright (C) 2011 Oracle.  All rights reserved.
diff --git a/fs/ocfs2/namei.c b/fs/ocfs2/namei.c
index 05ced86580d1..2c46ff6ba4ea 100644
--- a/fs/ocfs2/namei.c
+++ b/fs/ocfs2/namei.c
@@ -1,7 +1,5 @@
 // SPDX-License-Identifier: GPL-2.0-or-later
-/* -*- mode: c; c-basic-offset: 8; -*-
- * vim: noexpandtab sw=8 ts=8 sts=0:
- *
+/*
  * namei.c
  *
  * Create and rename file, directory, symlinks
diff --git a/fs/ocfs2/namei.h b/fs/ocfs2/namei.h
index cc091ed02b4a..9cc891eb874e 100644
--- a/fs/ocfs2/namei.h
+++ b/fs/ocfs2/namei.h
@@ -1,7 +1,5 @@
 /* SPDX-License-Identifier: GPL-2.0-or-later */
-/* -*- mode: c; c-basic-offset: 8; -*-
- * vim: noexpandtab sw=8 ts=8 sts=0:
- *
+/*
  * namei.h
  *
  * Function prototypes
diff --git a/fs/ocfs2/ocfs1_fs_compat.h b/fs/ocfs2/ocfs1_fs_compat.h
index 01ae48c4834d..6dbcf3d467fb 100644
--- a/fs/ocfs2/ocfs1_fs_compat.h
+++ b/fs/ocfs2/ocfs1_fs_compat.h
@@ -1,7 +1,5 @@
 /* SPDX-License-Identifier: GPL-2.0-only */
-/* -*- mode: c; c-basic-offset: 8; -*-
- * vim: noexpandtab sw=8 ts=8 sts=0:
- *
+/*
  * ocfs1_fs_compat.h
  *
  * OCFS1 volume header definitions.  OCFS2 creates valid but unmountable
diff --git a/fs/ocfs2/ocfs2.h b/fs/ocfs2/ocfs2.h
index 7993d527edae..bb62cc2e0211 100644
--- a/fs/ocfs2/ocfs2.h
+++ b/fs/ocfs2/ocfs2.h
@@ -1,7 +1,5 @@
 /* SPDX-License-Identifier: GPL-2.0-or-later */
-/* -*- mode: c; c-basic-offset: 8; -*-
- * vim: noexpandtab sw=8 ts=8 sts=0:
- *
+/*
  * ocfs2.h
  *
  * Defines macros and structures used in OCFS2
diff --git a/fs/ocfs2/ocfs2_fs.h b/fs/ocfs2/ocfs2_fs.h
index 19137c6d087b..638d875eccc7 100644
--- a/fs/ocfs2/ocfs2_fs.h
+++ b/fs/ocfs2/ocfs2_fs.h
@@ -1,7 +1,5 @@
 /* SPDX-License-Identifier: GPL-2.0-only */
-/* -*- mode: c; c-basic-offset: 8; -*-
- * vim: noexpandtab sw=8 ts=8 sts=0:
- *
+/*
  * ocfs2_fs.h
  *
  * On-disk structures for OCFS2.
diff --git a/fs/ocfs2/ocfs2_ioctl.h b/fs/ocfs2/ocfs2_ioctl.h
index 273616bd4f19..9680797bc531 100644
--- a/fs/ocfs2/ocfs2_ioctl.h
+++ b/fs/ocfs2/ocfs2_ioctl.h
@@ -1,7 +1,5 @@
 /* SPDX-License-Identifier: GPL-2.0-only */
-/* -*- mode: c; c-basic-offset: 8; -*-
- * vim: noexpandtab sw=8 ts=8 sts=0:
- *
+/*
  * ocfs2_ioctl.h
  *
  * Defines OCFS2 ioctls.
diff --git a/fs/ocfs2/ocfs2_lockid.h b/fs/ocfs2/ocfs2_lockid.h
index b4be84956bc1..8ac357ce6a30 100644
--- a/fs/ocfs2/ocfs2_lockid.h
+++ b/fs/ocfs2/ocfs2_lockid.h
@@ -1,7 +1,5 @@
 /* SPDX-License-Identifier: GPL-2.0-or-later */
-/* -*- mode: c; c-basic-offset: 8; -*-
- * vim: noexpandtab sw=8 ts=8 sts=0:
- *
+/*
  * ocfs2_lockid.h
  *
  * Defines OCFS2 lockid bits.
diff --git a/fs/ocfs2/ocfs2_lockingver.h b/fs/ocfs2/ocfs2_lockingver.h
index 5c9c105b33ee..31a5e1619e7f 100644
--- a/fs/ocfs2/ocfs2_lockingver.h
+++ b/fs/ocfs2/ocfs2_lockingver.h
@@ -1,7 +1,5 @@
 /* SPDX-License-Identifier: GPL-2.0-only */
-/* -*- mode: c; c-basic-offset: 8; -*-
- * vim: noexpandtab sw=8 ts=8 sts=0:
- *
+/*
  * ocfs2_lockingver.h
  *
  * Defines OCFS2 Locking version values.
diff --git a/fs/ocfs2/refcounttree.c b/fs/ocfs2/refcounttree.c
index c19a463fac55..7f6355cbb587 100644
--- a/fs/ocfs2/refcounttree.c
+++ b/fs/ocfs2/refcounttree.c
@@ -1,7 +1,5 @@
 // SPDX-License-Identifier: GPL-2.0-only
-/* -*- mode: c; c-basic-offset: 8; -*-
- * vim: noexpandtab sw=8 ts=8 sts=0:
- *
+/*
  * refcounttree.c
  *
  * Copyright (C) 2009 Oracle.  All rights reserved.
diff --git a/fs/ocfs2/refcounttree.h b/fs/ocfs2/refcounttree.h
index 0b9014495726..8197a94feec0 100644
--- a/fs/ocfs2/refcounttree.h
+++ b/fs/ocfs2/refcounttree.h
@@ -1,7 +1,5 @@
 /* SPDX-License-Identifier: GPL-2.0-only */
-/* -*- mode: c; c-basic-offset: 8; -*-
- * vim: noexpandtab sw=8 ts=8 sts=0:
- *
+/*
  * refcounttree.h
  *
  * Copyright (C) 2009 Oracle.  All rights reserved.
diff --git a/fs/ocfs2/reservations.c b/fs/ocfs2/reservations.c
index bf3842e34fb9..769e466887b0 100644
--- a/fs/ocfs2/reservations.c
+++ b/fs/ocfs2/reservations.c
@@ -1,7 +1,5 @@
 // SPDX-License-Identifier: GPL-2.0-only
-/* -*- mode: c; c-basic-offset: 8; -*-
- * vim: noexpandtab sw=8 ts=8 sts=0:
- *
+/*
  * reservations.c
  *
  * Allocation reservations implementation
diff --git a/fs/ocfs2/reservations.h b/fs/ocfs2/reservations.h
index 6ac88122896d..677c50663595 100644
--- a/fs/ocfs2/reservations.h
+++ b/fs/ocfs2/reservations.h
@@ -1,7 +1,5 @@
 /* SPDX-License-Identifier: GPL-2.0-only */
-/* -*- mode: c; c-basic-offset: 8; -*-
- * vim: noexpandtab sw=8 ts=8 sts=0:
- *
+/*
  * reservations.h
  *
  * Allocation reservations function prototypes and structures.
diff --git a/fs/ocfs2/resize.c b/fs/ocfs2/resize.c
index 24eb52f9059c..d65d43c61857 100644
--- a/fs/ocfs2/resize.c
+++ b/fs/ocfs2/resize.c
@@ -1,7 +1,5 @@
 // SPDX-License-Identifier: GPL-2.0-or-later
-/* -*- mode: c; c-basic-offset: 8; -*-
- * vim: noexpandtab sw=8 ts=8 sts=0:
- *
+/*
  * resize.c
  *
  * volume resize.
diff --git a/fs/ocfs2/resize.h b/fs/ocfs2/resize.h
index 0af0c023042c..4990637219ef 100644
--- a/fs/ocfs2/resize.h
+++ b/fs/ocfs2/resize.h
@@ -1,7 +1,5 @@
 /* SPDX-License-Identifier: GPL-2.0-or-later */
-/* -*- mode: c; c-basic-offset: 8; -*-
- * vim: noexpandtab sw=8 ts=8 sts=0:
- *
+/*
  * resize.h
  *
  * Function prototypes
diff --git a/fs/ocfs2/slot_map.c b/fs/ocfs2/slot_map.c
index 4da0e4b1e79b..0b0ae3ebb0cf 100644
--- a/fs/ocfs2/slot_map.c
+++ b/fs/ocfs2/slot_map.c
@@ -1,7 +1,5 @@
 // SPDX-License-Identifier: GPL-2.0-or-later
-/* -*- mode: c; c-basic-offset: 8; -*-
- * vim: noexpandtab sw=8 ts=8 sts=0:
- *
+/*
  * slot_map.c
  *
  * Copyright (C) 2002, 2004 Oracle.  All rights reserved.
diff --git a/fs/ocfs2/slot_map.h b/fs/ocfs2/slot_map.h
index 93b53e73f0f7..a43644570b53 100644
--- a/fs/ocfs2/slot_map.h
+++ b/fs/ocfs2/slot_map.h
@@ -1,7 +1,5 @@
 /* SPDX-License-Identifier: GPL-2.0-or-later */
-/* -*- mode: c; c-basic-offset: 8; -*-
- * vim: noexpandtab sw=8 ts=8 sts=0:
- *
+/*
  * slotmap.h
  *
  * description here
diff --git a/fs/ocfs2/stack_o2cb.c b/fs/ocfs2/stack_o2cb.c
index f70012038383..88f75f7f02d7 100644
--- a/fs/ocfs2/stack_o2cb.c
+++ b/fs/ocfs2/stack_o2cb.c
@@ -1,7 +1,5 @@
 // SPDX-License-Identifier: GPL-2.0-only
-/* -*- mode: c; c-basic-offset: 8; -*-
- * vim: noexpandtab sw=8 ts=8 sts=0:
- *
+/*
  * stack_o2cb.c
  *
  * Code which interfaces ocfs2 with the o2cb stack.
diff --git a/fs/ocfs2/stack_user.c b/fs/ocfs2/stack_user.c
index 7397064c3f35..85a47621e0c0 100644
--- a/fs/ocfs2/stack_user.c
+++ b/fs/ocfs2/stack_user.c
@@ -1,7 +1,5 @@
 // SPDX-License-Identifier: GPL-2.0-only
-/* -*- mode: c; c-basic-offset: 8; -*-
- * vim: noexpandtab sw=8 ts=8 sts=0:
- *
+/*
  * stack_user.c
  *
  * Code which interfaces ocfs2 with fs/dlm and a userspace stack.
diff --git a/fs/ocfs2/stackglue.c b/fs/ocfs2/stackglue.c
index 8d33ebc6b6fc..d50e8b8dfea4 100644
--- a/fs/ocfs2/stackglue.c
+++ b/fs/ocfs2/stackglue.c
@@ -1,7 +1,5 @@
 // SPDX-License-Identifier: GPL-2.0-only
-/* -*- mode: c; c-basic-offset: 8; -*-
- * vim: noexpandtab sw=8 ts=8 sts=0:
- *
+/*
  * stackglue.c
  *
  * Code which implements an OCFS2 specific interface to underlying
diff --git a/fs/ocfs2/stackglue.h b/fs/ocfs2/stackglue.h
index e9d26cbeb3b8..3636847fae19 100644
--- a/fs/ocfs2/stackglue.h
+++ b/fs/ocfs2/stackglue.h
@@ -1,7 +1,5 @@
 /* SPDX-License-Identifier: GPL-2.0-only */
-/* -*- mode: c; c-basic-offset: 8; -*-
- * vim: noexpandtab sw=8 ts=8 sts=0:
- *
+/*
  * stackglue.h
  *
  * Glue to the underlying cluster stack.
diff --git a/fs/ocfs2/suballoc.c b/fs/ocfs2/suballoc.c
index 8c8cf7f4eb34..8521942f5af2 100644
--- a/fs/ocfs2/suballoc.c
+++ b/fs/ocfs2/suballoc.c
@@ -1,7 +1,5 @@
 // SPDX-License-Identifier: GPL-2.0-or-later
-/* -*- mode: c; c-basic-offset: 8; -*-
- * vim: noexpandtab sw=8 ts=8 sts=0:
- *
+/*
  * suballoc.c
  *
  * metadata alloc and free
diff --git a/fs/ocfs2/suballoc.h b/fs/ocfs2/suballoc.h
index 50b36250beb6..5805a03d100b 100644
--- a/fs/ocfs2/suballoc.h
+++ b/fs/ocfs2/suballoc.h
@@ -1,7 +1,5 @@
 /* SPDX-License-Identifier: GPL-2.0-or-later */
-/* -*- mode: c; c-basic-offset: 8; -*-
- * vim: noexpandtab sw=8 ts=8 sts=0:
- *
+/*
  * suballoc.h
  *
  * Defines sub allocator api
diff --git a/fs/ocfs2/super.c b/fs/ocfs2/super.c
index 079f8826993e..c86bd4e60e20 100644
--- a/fs/ocfs2/super.c
+++ b/fs/ocfs2/super.c
@@ -1,7 +1,5 @@
 // SPDX-License-Identifier: GPL-2.0-or-later
-/* -*- mode: c; c-basic-offset: 8; -*-
- * vim: noexpandtab sw=8 ts=8 sts=0:
- *
+/*
  * super.c
  *
  * load/unload driver, mount/dismount volumes
diff --git a/fs/ocfs2/super.h b/fs/ocfs2/super.h
index 76facaf63336..8312651135b9 100644
--- a/fs/ocfs2/super.h
+++ b/fs/ocfs2/super.h
@@ -1,7 +1,5 @@
 /* SPDX-License-Identifier: GPL-2.0-or-later */
-/* -*- mode: c; c-basic-offset: 8; -*-
- * vim: noexpandtab sw=8 ts=8 sts=0:
- *
+/*
  * super.h
  *
  * Function prototypes
diff --git a/fs/ocfs2/symlink.c b/fs/ocfs2/symlink.c
index 94cfacc9bad7..f755a4985821 100644
--- a/fs/ocfs2/symlink.c
+++ b/fs/ocfs2/symlink.c
@@ -1,6 +1,4 @@
-/* -*- mode: c; c-basic-offset: 8; -*-
- * vim: noexpandtab sw=8 ts=8 sts=0:
- *
+/*
  *  linux/cluster/ssi/cfs/symlink.c
  *
  *	This program is free software; you can redistribute it and/or
diff --git a/fs/ocfs2/symlink.h b/fs/ocfs2/symlink.h
index 167094d1e5aa..ffcf0210545c 100644
--- a/fs/ocfs2/symlink.h
+++ b/fs/ocfs2/symlink.h
@@ -1,7 +1,5 @@
 /* SPDX-License-Identifier: GPL-2.0-or-later */
-/* -*- mode: c; c-basic-offset: 8; -*-
- * vim: noexpandtab sw=8 ts=8 sts=0:
- *
+/*
  * symlink.h
  *
  * Function prototypes
diff --git a/fs/ocfs2/sysfile.c b/fs/ocfs2/sysfile.c
index bb701c4e449f..53a945da873b 100644
--- a/fs/ocfs2/sysfile.c
+++ b/fs/ocfs2/sysfile.c
@@ -1,7 +1,5 @@
 // SPDX-License-Identifier: GPL-2.0-or-later
-/* -*- mode: c; c-basic-offset: 8; -*-
- * vim: noexpandtab sw=8 ts=8 sts=0:
- *
+/*
  * sysfile.c
  *
  * Initialize, read, write, etc. system files.
diff --git a/fs/ocfs2/sysfile.h b/fs/ocfs2/sysfile.h
index a83dd962fccb..2b38c75990fd 100644
--- a/fs/ocfs2/sysfile.h
+++ b/fs/ocfs2/sysfile.h
@@ -1,7 +1,5 @@
 /* SPDX-License-Identifier: GPL-2.0-or-later */
-/* -*- mode: c; c-basic-offset: 8; -*-
- * vim: noexpandtab sw=8 ts=8 sts=0:
- *
+/*
  * sysfile.h
  *
  * Function prototypes
diff --git a/fs/ocfs2/uptodate.c b/fs/ocfs2/uptodate.c
index 580852ba05c4..09854925fa5c 100644
--- a/fs/ocfs2/uptodate.c
+++ b/fs/ocfs2/uptodate.c
@@ -1,7 +1,5 @@
 // SPDX-License-Identifier: GPL-2.0-or-later
-/* -*- mode: c; c-basic-offset: 8; -*-
- * vim: noexpandtab sw=8 ts=8 sts=0:
- *
+/*
  * uptodate.c
  *
  * Tracking the up-to-date-ness of a local buffer_head with respect to
diff --git a/fs/ocfs2/uptodate.h b/fs/ocfs2/uptodate.h
index 77a30cae4879..85d94134001b 100644
--- a/fs/ocfs2/uptodate.h
+++ b/fs/ocfs2/uptodate.h
@@ -1,7 +1,5 @@
 /* SPDX-License-Identifier: GPL-2.0-or-later */
-/* -*- mode: c; c-basic-offset: 8; -*-
- * vim: noexpandtab sw=8 ts=8 sts=0:
- *
+/*
  * uptodate.h
  *
  * Cluster uptodate tracking
diff --git a/fs/ocfs2/xattr.c b/fs/ocfs2/xattr.c
index 36ae47a4aef6..dd784eb0cd7c 100644
--- a/fs/ocfs2/xattr.c
+++ b/fs/ocfs2/xattr.c
@@ -1,7 +1,5 @@
 // SPDX-License-Identifier: GPL-2.0-only
-/* -*- mode: c; c-basic-offset: 8; -*-
- * vim: noexpandtab sw=8 ts=8 sts=0:
- *
+/*
  * xattr.c
  *
  * Copyright (C) 2004, 2008 Oracle.  All rights reserved.
diff --git a/fs/ocfs2/xattr.h b/fs/ocfs2/xattr.h
index 9c80382da1f5..00308b57f64f 100644
--- a/fs/ocfs2/xattr.h
+++ b/fs/ocfs2/xattr.h
@@ -1,7 +1,5 @@
 /* SPDX-License-Identifier: GPL-2.0-only */
-/* -*- mode: c; c-basic-offset: 8; -*-
- * vim: noexpandtab sw=8 ts=8 sts=0:
- *
+/*
  * xattr.h
  *
  * Copyright (C) 2004, 2008 Oracle.  All rights reserved.
diff --git a/fs/reiserfs/procfs.c b/fs/reiserfs/procfs.c
index 155b82870333..4a7cb16e9345 100644
--- a/fs/reiserfs/procfs.c
+++ b/fs/reiserfs/procfs.c
@@ -488,13 +488,3 @@ int reiserfs_proc_info_global_done(void)
  * (available at http://www.namesys.com/legalese.html)
  *
  */
-
-/*
- * Make Linus happy.
- * Local variables:
- * c-indentation-style: "K&R"
- * mode-name: "LC"
- * c-basic-offset: 8
- * tab-width: 8
- * End:
- */
diff --git a/include/linux/configfs.h b/include/linux/configfs.h
index 2e8c69b43c64..97cfd13bae51 100644
--- a/include/linux/configfs.h
+++ b/include/linux/configfs.h
@@ -1,7 +1,5 @@
 /* SPDX-License-Identifier: GPL-2.0-or-later */
-/* -*- mode: c; c-basic-offset: 8; -*-
- * vim: noexpandtab sw=8 ts=8 sts=0:
- *
+/*
  * configfs.h - definitions for the device driver filesystem
  *
  * Based on sysfs:
diff --git a/include/linux/genl_magic_func.h b/include/linux/genl_magic_func.h
index 6cb82301d8e9..939b1a8f571b 100644
--- a/include/linux/genl_magic_func.h
+++ b/include/linux/genl_magic_func.h
@@ -404,4 +404,3 @@ s_fields								\
 
 /* }}}1 */
 #endif /* GENL_MAGIC_FUNC_H */
-/* vim: set foldmethod=marker foldlevel=1 nofoldenable : */
diff --git a/include/linux/genl_magic_struct.h b/include/linux/genl_magic_struct.h
index 35d21fddaf2d..f81d48987528 100644
--- a/include/linux/genl_magic_struct.h
+++ b/include/linux/genl_magic_struct.h
@@ -283,4 +283,3 @@ enum {									\
 
 /* }}}1 */
 #endif /* GENL_MAGIC_STRUCT_H */
-/* vim: set foldmethod=marker nofoldenable : */
diff --git a/include/uapi/linux/if_bonding.h b/include/uapi/linux/if_bonding.h
index e8eb4ad03cf1..d174914a837d 100644
--- a/include/uapi/linux/if_bonding.h
+++ b/include/uapi/linux/if_bonding.h
@@ -153,14 +153,3 @@ enum {
 #define BOND_3AD_STAT_MAX (__BOND_3AD_STAT_MAX - 1)
 
 #endif /* _LINUX_IF_BONDING_H */
-
-/*
- * Local variables:
- *  version-control: t
- *  kept-new-versions: 5
- *  c-indent-level: 8
- *  c-basic-offset: 8
- *  tab-width: 8
- * End:
- */
-
diff --git a/include/uapi/linux/nfs4.h b/include/uapi/linux/nfs4.h
index ed5415e0f1c1..800bb0ffa6e6 100644
--- a/include/uapi/linux/nfs4.h
+++ b/include/uapi/linux/nfs4.h
@@ -178,9 +178,3 @@
 #define NFS4_MAX_BACK_CHANNEL_OPS 2
 
 #endif /* _UAPI_LINUX_NFS4_H */
-
-/*
- * Local variables:
- *  c-basic-offset: 8
- * End:
- */
diff --git a/include/xen/interface/elfnote.h b/include/xen/interface/elfnote.h
index 9e9f9bf7c66d..449bd383cb76 100644
--- a/include/xen/interface/elfnote.h
+++ b/include/xen/interface/elfnote.h
@@ -208,13 +208,3 @@
 #define XEN_ELFNOTE_MAX XEN_ELFNOTE_PHYS32_ENTRY
 
 #endif /* __XEN_PUBLIC_ELFNOTE_H__ */
-
-/*
- * Local variables:
- * mode: C
- * c-set-style: "BSD"
- * c-basic-offset: 4
- * tab-width: 4
- * indent-tabs-mode: nil
- * End:
- */
diff --git a/include/xen/interface/hvm/hvm_vcpu.h b/include/xen/interface/hvm/hvm_vcpu.h
index 32ca83edd44d..bfc2138e0bf5 100644
--- a/include/xen/interface/hvm/hvm_vcpu.h
+++ b/include/xen/interface/hvm/hvm_vcpu.h
@@ -131,13 +131,3 @@ struct vcpu_hvm_context {
 typedef struct vcpu_hvm_context vcpu_hvm_context_t;
 
 #endif /* __XEN_PUBLIC_HVM_HVM_VCPU_H__ */
-
-/*
- * Local variables:
- * mode: C
- * c-file-style: "BSD"
- * c-basic-offset: 4
- * tab-width: 4
- * indent-tabs-mode: nil
- * End:
- */
diff --git a/include/xen/interface/io/xenbus.h b/include/xen/interface/io/xenbus.h
index aaf2951b1cce..fb8716112251 100644
--- a/include/xen/interface/io/xenbus.h
+++ b/include/xen/interface/io/xenbus.h
@@ -39,13 +39,3 @@ enum xenbus_state
 };
 
 #endif /* _XEN_PUBLIC_IO_XENBUS_H */
-
-/*
- * Local variables:
- *  c-file-style: "linux"
- *  indent-tabs-mode: t
- *  c-indent-level: 8
- *  c-basic-offset: 8
- *  tab-width: 8
- * End:
- */
diff --git a/samples/configfs/configfs_sample.c b/samples/configfs/configfs_sample.c
index f9008be7a8a1..37a657b25d58 100644
--- a/samples/configfs/configfs_sample.c
+++ b/samples/configfs/configfs_sample.c
@@ -1,7 +1,5 @@
 // SPDX-License-Identifier: GPL-2.0-or-later
 /*
- * vim: noexpandtab ts=8 sts=0 sw=8:
- *
  * configfs_example_macros.c - This file is a demonstration module
  *      containing a number of configfs subsystems.  It uses the helper
  *      macros defined by configfs.h
diff --git a/tools/usb/hcd-tests.sh b/tools/usb/hcd-tests.sh
index e8cad6a4f9c9..73f914d13f5c 100644
--- a/tools/usb/hcd-tests.sh
+++ b/tools/usb/hcd-tests.sh
@@ -272,5 +272,3 @@ do
 	echo ''
     done
 done
-
-# vim: sw=4
-- 
cgit v1.2.3-71-gd317